SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
cigar.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/algorithm>
16#include <seqan3/std/charconv>
17#include <seqan3/std/concepts>
18#include <seqan3/std/ranges>
19#include <sstream>
20
29
30namespace seqan3::detail
31{
34struct view_equality_fn
35{
37 template <std::ranges::forward_range rng1_type, std::ranges::forward_range rng2_type>
38 constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
39 {
40 return std::ranges::equal(rng1, rng2);
41 }
42};
43
83template <typename reference_char_type, typename query_char_type>
84[[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
85 query_char_type const query_char,
86 bool const extended_cigar)
88 requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap> &&
89 seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
91{
92 constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
93 uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
94 if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
95 key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
96
97 return assign_char_to(operators[key], cigar::operation{});
98}
99
107inline void update_alignment_lengths(int32_t & ref_length,
108 int32_t & seq_length,
109 char const cigar_operation,
110 uint32_t const cigar_count)
111{
112 switch (cigar_operation)
113 {
114 case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
115 case 'D': case 'N': ref_length += cigar_count; break;
116 case 'I': seq_length += cigar_count; break;
117 case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
118 default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
119 }
120}
121
136template <typename cigar_input_type>
137inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
138{
139 std::vector<cigar> operations{};
140 std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
141 char cigar_operation{};
142 uint32_t cigar_count{};
143 int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
144
145 // transform input into a single input view if it isn't already
146 auto cigar_view = cigar_input | views::single_pass_input;
147
148 // parse the rest of the cigar
149 // -------------------------------------------------------------------------------------------------------------
150 while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
151 {
152 auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
153 cigar_operation = *std::ranges::begin(cigar_view);
154 std::ranges::next(std::ranges::begin(cigar_view));
155
156 if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
157 throw format_error{"Corrupted cigar string encountered"};
158
159 update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
160 operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
161 }
162
163 return {operations, ref_length, seq_length};
164}
165
203template <seqan3::detail::pairwise_alignment alignment_type>
204[[nodiscard]] inline std::vector<cigar> get_cigar_vector(alignment_type && alignment,
205 uint32_t const query_start_pos = 0,
206 uint32_t const query_end_pos = 0,
207 bool const extended_cigar = false)
208{
209 using std::get;
210
211 auto & ref_seq = get<0>(alignment);
212 auto & query_seq = get<1>(alignment);
213
214 if (ref_seq.size() != query_seq.size())
215 throw std::logic_error{"The aligned sequences must have the same length."};
216
217 std::vector<cigar> result{};
218
219 if (!ref_seq.size())
220 return result; // return empty string if sequences are empty
221
222 // Add (S)oft-clipping at the start of the read
223 if (query_start_pos)
224 result.emplace_back(query_start_pos, 'S'_cigar_operation);
225
226 // Create cigar string from alignment
227 // -------------------------------------------------------------------------
228 // initialize first operation and count value:
229 cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
230 uint32_t count{0};
231
232 // go through alignment columns
233 for (auto column : views::zip(ref_seq, query_seq))
234 {
235 cigar::operation next_op = map_aligned_values_to_cigar_op(std::get<0>(column),
236 std::get<1>(column),
237 extended_cigar);
238
239 if (operation == next_op)
240 {
241 ++count;
242 }
243 else
244 {
245 result.emplace_back(count, operation);
246 operation = next_op;
247 count = 1;
248 }
249 }
250
251 // append last cigar element
252 result.emplace_back(count, operation);
253
254 // Add (S)oft-clipping at the end of the read
255 if (query_end_pos)
256 result.emplace_back(query_end_pos, 'S'_cigar_operation);
257
258 return result;
259}
260
266[[nodiscard]] inline std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
267{
268 std::string result{};
269 std::ranges::for_each(cigar_vector, [&result] (auto & cig) { result.append(cig.to_string()); });
270 return result;
271}
272
306template <seqan3::detail::pairwise_alignment alignment_type>
307[[nodiscard]] inline std::string get_cigar_string(alignment_type && alignment,
308 uint32_t const query_start_pos = 0,
309 uint32_t const query_end_pos = 0,
310 bool const extended_cigar = false)
311{
312 return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
313}
314
348template <seqan3::aligned_sequence ref_seq_type, seqan3::aligned_sequence query_seq_type>
349[[nodiscard]] inline std::string get_cigar_string(ref_seq_type && ref_seq,
350 query_seq_type && query_seq,
351 uint32_t const query_start_pos = 0,
352 uint32_t const query_end_pos = 0,
353 bool const extended_cigar = false)
354{
355 return get_cigar_string(std::tie(ref_seq, query_seq), query_start_pos, query_end_pos, extended_cigar);
356}
357
380template <seqan3::detail::writable_pairwise_alignment alignment_type>
381inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar> const & cigar_vector)
382{
383 using std::get;
384 auto current_ref_pos = std::ranges::begin(get<0>(alignment));
385 auto current_read_pos = std::ranges::begin(get<1>(alignment));
386
387 for (auto [cigar_count, cigar_operation] : cigar_vector)
388 {
389 // ignore since alignment shall contain sliced sequences
390 if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
391 continue;
392
393 assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
394 ('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation) ||
395 ('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation) ||
396 ('P'_cigar_operation == cigar_operation)); // checked during IO
397
398 if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
399 ('X'_cigar_operation == cigar_operation))
400 {
401 std::ranges::advance(current_ref_pos , cigar_count);
402 std::ranges::advance(current_read_pos, cigar_count);
403 }
404 else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
405 {
406 // insert gaps into read
407
408 assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
409 current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
410 ++current_read_pos;
411 std::ranges::advance(current_ref_pos , cigar_count);
412 }
413 else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
414 {
415 assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
416 current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
417 ++current_ref_pos;
418 std::ranges::advance(current_read_pos, cigar_count);
419 }
420 else if (('P'_cigar_operation == cigar_operation)) // skip padding
421 {
422 current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
423 ++current_ref_pos;
424
425 current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
426 ++current_read_pos;
427 }
428 }
429}
430
433struct access_restrictor_fn
434{
436 template <typename chr_t>
437 [[noreturn]] chr_t operator()(chr_t) const
438 {
439 throw std::logic_error{"Access is not allowed because there is no sequence information."};
440 }
441};
442
443} // namespace seqan3::detail
The <algorithm> header from C++20's standard library.
Provides the seqan3::cigar alphabet.
T begin(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
The <concepts> header from C++20's standard library.
T distance(T... args)
T emplace_back(T... args)
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:526
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:269
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:361
constexpr auto zip
A zip view.
Definition: zip.hpp:29
constexpr auto const & get(configuration< configs_t... > const &config) noexcept
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: configuration.hpp:429
Provides seqan3::detail::pairwise_alignment and seqan3::detail::writable_pairwise_alignment.
Provides character predicates for tokenisation.
The <ranges> header from C++20's standard library.
Provides seqan3::views::single_pass_input.
The <charconv> header from C++17's standard library.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Auxiliary for pretty printing of exception messages.
Provides seqan3::tuple_like.
Provides seqan3::views::zip.