SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
cigar.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <seqan3/std/charconv>
17 #include <seqan3/std/concepts>
18 #include <seqan3/std/ranges>
19 #include <sstream>
20 
29 
30 namespace seqan3::detail
31 {
33 struct view_equality_fn
34 {
36  template <std::ranges::forward_range rng1_type, std::ranges::forward_range rng2_type>
37  constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
38  {
39  return std::ranges::equal(rng1, rng2);
40  }
41 };
42 
82 template <typename reference_char_type, typename query_char_type>
83 [[nodiscard]] constexpr cigar::operation map_aligned_values_to_cigar_op(reference_char_type const reference_char,
84  query_char_type const query_char,
85  bool const extended_cigar)
87  requires seqan3::detail::weakly_equality_comparable_with<reference_char_type, gap> &&
88  seqan3::detail::weakly_equality_comparable_with<query_char_type, gap>
90 {
91  constexpr std::array<char, 6> operators{'M', 'D', 'I', 'P', 'X', '='}; // contains the possible cigar operators.
92  uint8_t key = (static_cast<uint8_t>(reference_char == gap{}) << 1) | static_cast<uint8_t>(query_char == gap{});
93  if (extended_cigar && (key == 0)) // in extended format refine the substitution operator to match/mismatch.
94  key |= ((1 << 2) | static_cast<uint8_t>(query_char == reference_char)); // maps to [4, 5].
95 
96  return assign_char_to(operators[key], cigar::operation{});
97 }
98 
105 inline void update_alignment_lengths(int32_t & ref_length,
106  int32_t & seq_length,
107  char const cigar_operation,
108  uint32_t const cigar_count)
109 {
110  switch (cigar_operation)
111  {
112  case 'M': case '=': case 'X': ref_length += cigar_count, seq_length += cigar_count; break;
113  case 'D': case 'N': ref_length += cigar_count; break;
114  case 'I' : seq_length += cigar_count; break;
115  case 'S': case 'H': case 'P': break; // no op (soft-clipping or padding does not increase either length)
116  default: throw format_error{"Illegal cigar operation: " + std::string{cigar_operation}};
117  }
118 }
119 
133 template <typename cigar_input_type>
134 inline std::tuple<std::vector<cigar>, int32_t, int32_t> parse_cigar(cigar_input_type && cigar_input)
135 {
136  std::vector<cigar> operations{};
137  std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
138  char cigar_operation{};
139  uint32_t cigar_count{};
140  int32_t ref_length{}, seq_length{}; // length of aligned part for ref and query
141 
142  // transform input into a single input view if it isn't already
143  auto cigar_view = cigar_input | views::single_pass_input;
144 
145  // parse the rest of the cigar
146  // -------------------------------------------------------------------------------------------------------------
147  while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
148  {
149  auto buff_end = (std::ranges::copy(cigar_view | detail::take_until_or_throw(!is_digit), buffer.data())).out;
150  cigar_operation = *std::ranges::begin(cigar_view);
151  std::ranges::next(std::ranges::begin(cigar_view));
152 
153  if (std::from_chars(buffer.begin(), buff_end, cigar_count).ec != std::errc{})
154  throw format_error{"Corrupted cigar string encountered"};
155 
156  update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
157  operations.emplace_back(cigar_count, cigar::operation{}.assign_char(cigar_operation));
158  }
159 
160  return {operations, ref_length, seq_length};
161 }
162 
200 template <seqan3::detail::pairwise_alignment alignment_type>
201 [[nodiscard]] inline std::vector<cigar> get_cigar_vector(alignment_type && alignment,
202  uint32_t const query_start_pos = 0,
203  uint32_t const query_end_pos = 0,
204  bool const extended_cigar = false)
205 {
206  using std::get;
207 
208  auto & ref_seq = get<0>(alignment);
209  auto & query_seq = get<1>(alignment);
210 
211  if (ref_seq.size() != query_seq.size())
212  throw std::logic_error{"The aligned sequences must have the same length."};
213 
214  std::vector<cigar> result{};
215 
216  if (!ref_seq.size())
217  return result; // return empty string if sequences are empty
218 
219  // Add (S)oft-clipping at the start of the read
220  if (query_start_pos)
221  result.emplace_back(query_start_pos, 'S'_cigar_operation);
222 
223  // Create cigar string from alignment
224  // -------------------------------------------------------------------------
225  // initialize first operation and count value:
226  cigar::operation operation{map_aligned_values_to_cigar_op(ref_seq[0], query_seq[0], extended_cigar)};
227  uint32_t count{0};
228 
229  // go through alignment columns
230  for (auto column : views::zip(ref_seq, query_seq))
231  {
232  cigar::operation next_op = map_aligned_values_to_cigar_op(std::get<0>(column),
233  std::get<1>(column),
234  extended_cigar);
235 
236  if (operation == next_op)
237  {
238  ++count;
239  }
240  else
241  {
242  result.emplace_back(count, operation);
243  operation = next_op;
244  count = 1;
245  }
246  }
247 
248  // append last cigar element
249  result.emplace_back(count, operation);
250 
251  // Add (S)oft-clipping at the end of the read
252  if (query_end_pos)
253  result.emplace_back(query_end_pos, 'S'_cigar_operation);
254 
255  return result;
256 }
257 
263 [[nodiscard]] inline std::string get_cigar_string(std::vector<cigar> const & cigar_vector)
264 {
265  std::string result{};
266  std::ranges::for_each(cigar_vector, [&result] (auto & cig) { result.append(cig.to_string()); });
267  return result;
268 }
269 
303 template <seqan3::detail::pairwise_alignment alignment_type>
304 [[nodiscard]] inline std::string get_cigar_string(alignment_type && alignment,
305  uint32_t const query_start_pos = 0,
306  uint32_t const query_end_pos = 0,
307  bool const extended_cigar = false)
308 {
309  return get_cigar_string(get_cigar_vector(alignment, query_start_pos, query_end_pos, extended_cigar));
310 }
311 
345 template <seqan3::aligned_sequence ref_seq_type, seqan3::aligned_sequence query_seq_type>
346 [[nodiscard]] inline std::string get_cigar_string(ref_seq_type && ref_seq,
347  query_seq_type && query_seq,
348  uint32_t const query_start_pos = 0,
349  uint32_t const query_end_pos = 0,
350  bool const extended_cigar = false)
351 {
352  return get_cigar_string(std::tie(ref_seq, query_seq), query_start_pos, query_end_pos, extended_cigar);
353 }
354 
377 template <seqan3::detail::writable_pairwise_alignment alignment_type>
378 inline void alignment_from_cigar(alignment_type & alignment, std::vector<cigar> const & cigar_vector)
379 {
380  using std::get;
381  auto current_ref_pos = std::ranges::begin(get<0>(alignment));
382  auto current_read_pos = std::ranges::begin(get<1>(alignment));
383 
384  for (auto [cigar_count, cigar_operation] : cigar_vector)
385  {
386  // ignore since alignment shall contain sliced sequences
387  if (('S'_cigar_operation == cigar_operation) || ('H'_cigar_operation == cigar_operation))
388  continue;
389 
390  assert(('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
391  ('X'_cigar_operation == cigar_operation) || ('D'_cigar_operation == cigar_operation) ||
392  ('N'_cigar_operation == cigar_operation) || ('I'_cigar_operation == cigar_operation) ||
393  ('P'_cigar_operation == cigar_operation)); // checked during IO
394 
395  if (('M'_cigar_operation == cigar_operation) || ('='_cigar_operation == cigar_operation) ||
396  ('X'_cigar_operation == cigar_operation))
397  {
398  std::ranges::advance(current_ref_pos , cigar_count);
399  std::ranges::advance(current_read_pos, cigar_count);
400  }
401  else if (('D'_cigar_operation == cigar_operation) || ('N'_cigar_operation == cigar_operation))
402  {
403  // insert gaps into read
404 
405  assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
406  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
407  ++current_read_pos;
408  std::ranges::advance(current_ref_pos , cigar_count);
409  }
410  else if (('I'_cigar_operation == cigar_operation)) // Insert gaps into ref
411  {
412  assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
413  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
414  ++current_ref_pos;
415  std::ranges::advance(current_read_pos, cigar_count);
416  }
417  else if (('P'_cigar_operation == cigar_operation)) // skip padding
418  {
419  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
420  ++current_ref_pos;
421 
422  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
423  ++current_read_pos;
424  }
425  }
426 }
427 
429 struct access_restrictor_fn
430 {
432  template <typename chr_t>
433  [[noreturn]] chr_t operator()(chr_t) const
434  {
435  throw std::logic_error{"Access is not allowed because there is no sequence information."};
436  }
437 };
438 
439 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Provides the seqan3::cigar alphabet.
T begin(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
The Concepts library.
Auxiliary for pretty printing of exception messages.
T distance(T... args)
T emplace_back(T... args)
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:523
constexpr auto is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:287
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr auto get
A view calling get on each element in a range.
Definition: elements.hpp:114
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:365
constexpr auto zip
A zip view.
Definition: zip.hpp:29
Provides seqan3::detail::pairwise_alignment and seqan3::detail::writable_pairwise_alignment.
Adaptations of concepts from the Ranges TS.
Provides std::from_chars and std::to_chars if not defined in the stdlib <charconv> header.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides character predicates for tokenisation.
Provides seqan3::tuple_like.
Provides seqan3::views::single_pass_input.
Provides seqan3::views::zip.