SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
cigar.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <concepts>
17#include <ranges>
18#include <seqan3/std/charconv>
19#include <sstream>
20
29
30namespace seqan3::detail
31{
34struct view_equality_fn
35{
37 template <std::ranges::forward_range rng1_type, std::ranges::forward_range rng2_type>
38 constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
39 {
40 return std::ranges::equal(rng1, rng2);
41 }
42};
43
83template <typename reference_char_type, typename query_char_type>
84[[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
85 query_char_type const query_char,
86 bool const extended_cigar)
87 requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap>
88 && seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
89{
90 constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
91 uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
92 if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
93 key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
94
95 return assign_char_to(operators[key], cigar::operation{});
96}
97
105inline void update_alignment_lengths(int32_t & ref_length,
106 int32_t & seq_length,
107 char const cigar_operation,
108 uint32_t const cigar_count)
109{
110 switch (cigar_operation)
111 {
112 case 'M':
113 case '=':
114 case 'X':
115 ref_length += cigar_count, seq_length += cigar_count;
116 break;
117 case 'D':
118 case 'N':
119 ref_length += cigar_count;
120 break;
121 case 'I':
122 seq_length += cigar_count;
123 break;
124 case 'S':
125 case 'H':
126 case 'P':
127 break; // no op (soft-clipping or padding does not increase either length)
128 default:
129 throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
130 }
131}
132
147template <typename cigar_input_type>
148inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
149{
150 std::vector<cigar> operations{};
151 std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
152 char cigar_operation{};
153 uint32_t cigar_count{};
154 int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
155
156 // transform input into a single input view if it isn't already
157 auto cigar_view = cigar_input | views::single_pass_input;
158
159 // parse the rest of the cigar
160 // -------------------------------------------------------------------------------------------------------------
161 while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
162 {
163 auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
164 cigar_operation = *std::ranges::begin(cigar_view);
165 ++std::ranges::begin(cigar_view);
166
167 if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
168 throw format_error{"Corrupted cigar string encountered"};
169
170 update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
171 operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
172 }
173
174 return {operations, ref_length, seq_length};
175}
176
214template <seqan3::detail::pairwise_alignment alignment_type>
215[[nodiscard]] inline std::vector<cigar> get_cigar_vector(alignment_type && alignment,
216 uint32_t const query_start_pos = 0,
217 uint32_t const query_end_pos = 0,
218 bool const extended_cigar = false)
219{
220 using std::get;
221
222 auto & ref_seq = get<0>(alignment);
223 auto & query_seq = get<1>(alignment);
224
225 if (ref_seq.size() != query_seq.size())
226 throw std::logic_error{"The aligned sequences must have the same length."};
227
228 std::vector<cigar> result{};
229
230 if (!ref_seq.size())
231 return result; // return empty string if sequences are empty
232
233 // Add (S)oft-clipping at the start of the read
234 if (query_start_pos)
235 result.emplace_back(query_start_pos, 'S'_cigar_operation);
236
237 // Create cigar string from alignment
238 // -------------------------------------------------------------------------
239 // initialize first operation and count value:
240 cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
241 uint32_t count{0};
242
243 // go through alignment columns
244 for (auto column : views::zip(ref_seq, query_seq))
245 {
246 cigar::operation next_op =
247 map_aligned_values_to_cigar_op(std::get<0>(column), std::get<1>(column), extended_cigar);
248
249 if (operation == next_op)
250 {
251 ++count;
252 }
253 else
254 {
255 result.emplace_back(count, operation);
256 operation = next_op;
257 count = 1;
258 }
259 }
260
261 // append last cigar element
262 result.emplace_back(count, operation);
263
264 // Add (S)oft-clipping at the end of the read
265 if (query_end_pos)
266 result.emplace_back(query_end_pos, 'S'_cigar_operation);
267
268 return result;
269}
270
276[[nodiscard]] inline std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
277{
278 std::string result{};
279 std::ranges::for_each(cigar_vector,
280 [&result](auto & cig)
281 {
282 result.append(static_cast<std::string_view>(cig.to_string()));
283 });
284 return result;
285}
286
320template <seqan3::detail::pairwise_alignment alignment_type>
321[[nodiscard]] inline std::string get_cigar_string(alignment_type && alignment,
322 uint32_t const query_start_pos = 0,
323 uint32_t const query_end_pos = 0,
324 bool const extended_cigar = false)
325{
326 return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
327}
328
362template <seqan3::aligned_sequence ref_seq_type, seqan3::aligned_sequence query_seq_type>
363[[nodiscard]] inline std::string get_cigar_string(ref_seq_type && ref_seq,
364 query_seq_type && query_seq,
365 uint32_t const query_start_pos = 0,
366 uint32_t const query_end_pos = 0,
367 bool const extended_cigar = false)
368{
369 return get_cigar_string(std::tie(ref_seq, query_seq), query_start_pos, query_end_pos, extended_cigar);
370}
371
394template <seqan3::detail::writable_pairwise_alignment alignment_type>
395inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar> const & cigar_vector)
396{
397 using std::get;
398 auto current_ref_pos = std::ranges::begin(get<0>(alignment));
399 auto current_read_pos = std::ranges::begin(get<1>(alignment));
400
401 for (auto [cigar_count, cigar_operation] : cigar_vector)
402 {
403 // ignore since alignment shall contain sliced sequences
404 if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
405 continue;
406
407 assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
408 || ('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation)
409 || ('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation)
410 || ('P'_cigar_operation == cigar_operation)); // checked during IO
411
412 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation)
413 || ('X'_cigar_operation == cigar_operation))
414 {
415 std::ranges::advance(current_ref_pos, cigar_count);
416 std::ranges::advance(current_read_pos, cigar_count);
417 }
418 else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
419 {
420 // insert gaps into read
421
422 assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
423 current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
424 ++current_read_pos;
425 std::ranges::advance(current_ref_pos, cigar_count);
426 }
427 else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
428 {
429 assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
430 current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
431 ++current_ref_pos;
432 std::ranges::advance(current_read_pos, cigar_count);
433 }
434 else if (('P'_cigar_operation == cigar_operation)) // skip padding
435 {
436 current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
437 ++current_ref_pos;
438
439 current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
440 ++current_read_pos;
441 }
442 }
443}
444
447struct access_restrictor_fn
448{
450 template <typename chr_t>
451 [[noreturn]] chr_t operator()(chr_t) const
452 {
453 throw std::logic_error{"Access is not allowed because there is no sequence information."};
454 }
455};
456
457} // namespace seqan3::detail
Provides the seqan3::cigar alphabet.
T begin(T... args)
The <charconv> header from C++17's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:98
T copy(T... args)
T distance(T... args)
T emplace_back(T... args)
T equal(T... args)
T for_each(T... args)
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:524
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:262
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:164
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:348
constexpr auto zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition: zip.hpp:573
constexpr auto const & get(configuration< configs_t... > const &config) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: configuration.hpp:415
Provides seqan3::detail::pairwise_alignment and seqan3::detail::writable_pairwise_alignment.
Provides character predicates for tokenisation.
Provides seqan3::views::single_pass_input.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::tuple_like.
Provides seqan3::views::zip.