SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
detail.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <sstream>
16 
21 #include <seqan3/std/algorithm>
22 #include <seqan3/std/charconv>
23 #include <seqan3/std/concepts>
24 #include <seqan3/std/ranges>
25 
26 #include <range/v3/view/zip.hpp>
27 
28 namespace seqan3::detail
29 {
31 struct view_equality_fn
32 {
34  template <std::ranges::ForwardRange rng1_type, std::ranges::ForwardRange rng2_type>
35  constexpr bool operator()(rng1_type && rng1, rng2_type && rng2) const
36  {
37  return std::ranges::equal(rng1, rng2);
38  }
39 };
40 
84 template <typename reference_char_type, typename query_char_type>
86  requires std::detail::WeaklyEqualityComparableWith<reference_char_type, gap> &&
87  std::detail::WeaklyEqualityComparableWith<query_char_type, gap>
89 char compare_aligned_values(reference_char_type const reference_char,
90  query_char_type const query_char,
91  bool const extended_cigar)
92 {
93  return (reference_char == gap{})
94  ? (query_char == gap{})
95  ? 'P'
96  : 'I'
97  : (query_char == gap{})
98  ? 'D'
99  : (extended_cigar)
100  ? (query_char == reference_char)
101  ? '='
102  : 'X'
103  : 'M';
104 }
105 
137 template <std::ranges::ForwardRange ref_seq_type, std::ranges::ForwardRange query_seq_type>
139  requires std::detail::WeaklyEqualityComparableWith<gap, reference_t<ref_seq_type>> &&
140  std::detail::WeaklyEqualityComparableWith<gap, reference_t<query_seq_type>>
142 std::string get_cigar_string(ref_seq_type && ref_seq,
143  query_seq_type && query_seq,
144  uint32_t const query_start_pos = 0,
145  uint32_t const query_end_pos = 0,
146  bool const extended_cigar = false)
147 {
148  if (ref_seq.size() != query_seq.size())
149  throw std::logic_error{"The aligned sequences must have the same length."};
150 
151  std::ostringstream result;
152 
153  if (!ref_seq.size())
154  return std::string(); // return empty string if sequences are empty
155 
156  // Add (S)oft-clipping at the start of the read
157  if (query_start_pos)
158  result << query_start_pos << 'S';
159 
160  // Create cigar string from alignment
161  // -------------------------------------------------------------------------
162  // initialize first operation:
163  char tmp_char{compare_aligned_values(ref_seq[0], query_seq[0], extended_cigar)};
164  size_t tmp_length{0};
165 
166  // go through alignment columns
167  for (auto column : std::view::zip(ref_seq, query_seq))
168  {
169  char next_op = compare_aligned_values(std::get<0>(column), std::get<1>(column), extended_cigar);
170 
171  if (tmp_char == next_op)
172  {
173  ++tmp_length;
174  }
175  else
176  {
177  result << tmp_length << tmp_char;
178  tmp_char = next_op;
179  tmp_length = 1;
180  }
181  }
182  // append last cigar element
183  result << tmp_length << tmp_char;
184 
185  // Add (S)oft-clipping at the end of the read
186  if (query_end_pos)
187  result << query_end_pos << 'S';
188 
189  return result.str();
190 }
191 
231 template <TupleLike alignment_type>
233  requires std::tuple_size_v<remove_cvref_t<alignment_type>> == 2
235 std::string get_cigar_string(alignment_type && alignment,
236  uint32_t const query_start_pos = 0,
237  uint32_t const query_end_pos = 0,
238  bool const extended_cigar = false)
239 {
240  return get_cigar_string(get<0>(alignment), get<1>(alignment),
241  query_start_pos, query_end_pos, extended_cigar);
242 }
282 template <TupleLike alignment_type>
284  requires std::tuple_size_v<remove_cvref_t<alignment_type>> == 2
286 std::vector<std::pair<char, size_t>> get_cigar_vector(alignment_type && alignment,
287  uint32_t const query_start_pos = 0,
288  uint32_t const query_end_pos = 0,
289  bool const extended_cigar = false)
290 {
291  auto & ref_seq = get<0>(alignment);
292  auto & query_seq = get<1>(alignment);
293 
294  if (ref_seq.size() != query_seq.size())
295  throw std::logic_error{"The aligned sequences must have the same length."};
296 
298 
299  if (!ref_seq.size())
300  return result; // return empty string if sequences are empty
301 
302  // Add (S)oft-clipping at the start of the read
303  if (query_start_pos)
304  result.emplace_back('S', query_start_pos);
305 
306  // Create cigar string from alignment
307  // -------------------------------------------------------------------------
308  // initialize first operation:
309  char tmp_char{compare_aligned_values(ref_seq[0], query_seq[0], extended_cigar)};
310  size_t tmp_length{0};
311 
312  // go through alignment columns
313  for (auto column : std::view::zip(ref_seq, query_seq))
314  {
315  char next_op = compare_aligned_values(std::get<0>(column), std::get<1>(column), extended_cigar);
316 
317  if (tmp_char == next_op)
318  {
319  ++tmp_length;
320  }
321  else
322  {
323  result.emplace_back(tmp_char, tmp_length);
324  tmp_char = next_op;
325  tmp_length = 1;
326  }
327  }
328 
329  // append last cigar element
330  result.emplace_back(tmp_char, tmp_length);
331 
332  // Add (S)oft-clipping at the end of the read
333  if (query_end_pos)
334  result.emplace_back('S', query_end_pos);
335 
336  return result;
337 }
338 
353 template <std::ranges::InputRange cigar_input_type>
354 std::tuple<std::vector<std::pair<char, size_t>>, size_t, size_t, size_t, size_t>
355 parse_cigar(cigar_input_type && cigar_input)
356 {
358  size_t sc_begin_count{}; // number of soft clipped bases at the beginning
359  size_t sc_end_count{}; // number of soft clipped bases at the end
360  std::array<char, 20> buffer{}; // buffer to parse numbers with from_chars. Biggest number should fit in uint64_t
361  char cigar_op{'\0'};
362  size_t cigar_count{}, ref_length{}, seq_length{}; // length of aligned part for ref and query
363 
364  auto update_lengths_fn = [&ref_length, &seq_length, &cigar_op, &cigar_count] ()
365  {
366  if (is_char<'M'>(cigar_op) || is_char<'='>(cigar_op) || is_char<'X'>(cigar_op))
367  {
368  ref_length += cigar_count;
369  seq_length += cigar_count;
370  }
371  else if (is_char<'D'>(cigar_op) || is_char<'N'>(cigar_op))
372  {
373  ref_length += cigar_count;
374  }
375  else if (is_char<'I'>(cigar_op))
376  {
377  seq_length += cigar_count;
378  }
379  else // illegal character
380  {
381  if (is_char<'P'>(cigar_op))
382  throw format_error{"We do currently not support cigar operation 'P'."};
383  else
384  throw format_error{std::string{"Illegal cigar operation: "} + std::string{cigar_op}};
385  }
386  };
387 
388  // transform input into a signle input view if it isn't already
389  auto cigar_view = cigar_input | view::single_pass_input;
390 
391  // check hard/soft clipping at the beginning manually
392  // -----------------------------------------------------------------------------------------------------------------
393  auto [ignore, buffer_end] = std::ranges::copy(cigar_view | view::take_until_or_throw(!is_digit), buffer.data());
394  (void) ignore;
395 
396  cigar_op = *std::ranges::begin(cigar_view);
398 
399  if (is_char<'H'>(cigar_op)) // hard clipping is ignored. parse the next operation
400  {
401  auto [ignore2, buffer_end2] = std::ranges::copy(cigar_view
402  | view::take_until_or_throw(!is_digit), buffer.data());
403  buffer_end = buffer_end2;
404  (void) ignore2;
405 
406  cigar_op = *std::ranges::begin(cigar_view);
408  }
409 
410  if (std::from_chars(buffer.begin(), buffer_end, cigar_count).ec != std::errc{})
411  throw format_error{"Corrupted cigar string encountered"};
412 
413  if (is_char<'S'>(cigar_op)) // check for soft clipping at the beginning
414  {
415  sc_begin_count = cigar_count;
416  }
417  else
418  {
419  update_lengths_fn();
420  operations.push_back({cigar_op, cigar_count});
421  }
422 
423  // parse the rest of the cigar
424  // -----------------------------------------------------------------------------------------------------------------
425  while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view)) // until stream is not empty
426  {
427  buffer_end = (std::ranges::copy(cigar_view | view::take_until_or_throw(!is_digit), buffer.data())).out;
428  cigar_op = *std::ranges::begin(cigar_view);
430 
431  if (std::from_chars(buffer.begin(), buffer_end, cigar_count).ec != std::errc{})
432  throw format_error{"Corrupted cigar string encountered"};
433 
434  if (is_char<'S'>(cigar_op)) // we are at the end, hard clipping afterwards can be ignored
435  {
436  sc_end_count = cigar_count;
437  return {operations, ref_length, seq_length, sc_begin_count, sc_end_count};
438  }
439  update_lengths_fn();
440  operations.push_back({cigar_op, cigar_count});
441  }
442  return {operations, ref_length, seq_length, sc_begin_count, sc_end_count};
443 }
444 
460 template <std::ranges::InputRange cigar_input_type>
461 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
462 {
464  size_t sc_begin_count{}; // number of soft clipped bases at the beginning
465  size_t sc_end_count{}; // number of soft clipped bases at the end
466  char cigar_op{'\0'};
467  size_t cigar_count{},ref_length{}, seq_length{};
468  uint32_t o_and_c{};
469  static constexpr char const * CIGAR_MAPPING = "MIDNSHP=X*******";
470  static constexpr uint32_t CIGAR_MASK = 0x0f; // 0000000000001111
471 
472  auto update_lengths_fn = [&ref_length, &seq_length, &cigar_op, &cigar_count] ()
473  {
474  if (is_char<'M'>(cigar_op) || is_char<'='>(cigar_op) || is_char<'X'>(cigar_op))
475  {
476  ref_length += cigar_count;
477  seq_length += cigar_count;
478  }
479  else if (is_char<'D'>(cigar_op) || is_char<'N'>(cigar_op))
480  {
481  ref_length += cigar_count;
482  }
483  else if (is_char<'I'>(cigar_op))
484  {
485  seq_length += cigar_count;
486  }
487  else // illegal character
488  {
489  if (is_char<'P'>(cigar_op))
490  throw format_error{"We do currently not support cigar operation 'P'."};
491  else
492  throw format_error{std::string{"Illegal cigar operation: "} + std::string{cigar_op}};
493  }
494  };
495 
496  if (n_cigar_op == 0) // [[unlikely]]
497  return std::tuple{operations, ref_length, seq_length, sc_begin_count, sc_end_count};
498 
499  std::ranges::copy_n(std::ranges::begin(cigar_input), sizeof(o_and_c), reinterpret_cast<char*>(&o_and_c));
500  assert((o_and_c & CIGAR_MASK) <= 8u);
501  cigar_op = CIGAR_MAPPING[o_and_c & CIGAR_MASK];
502  cigar_count = o_and_c >> 4;
503  --n_cigar_op;
504 
505  if (is_char<'H'>(cigar_op)) // hard clipping is ignored. parse the next operation
506  {
507  if (n_cigar_op == 0)
508  return std::tuple{operations, ref_length, seq_length, sc_begin_count, sc_end_count};
509 
510  std::ranges::copy_n(std::ranges::begin(cigar_input), sizeof(o_and_c), reinterpret_cast<char*>(&o_and_c));
511  assert((o_and_c & CIGAR_MASK) <= 8u);
512  cigar_op = CIGAR_MAPPING[o_and_c & CIGAR_MASK];
513  cigar_count = o_and_c >> 4;
514  --n_cigar_op;
515  }
516 
517  if (is_char<'S'>(cigar_op)) // check for soft clipping at the beginning
518  {
519  sc_begin_count = cigar_count;
520  }
521  else
522  {
523  update_lengths_fn();
524  operations.push_back({cigar_op, cigar_count});
525  }
526 
527  // parse the rest of the cigar
528  // -----------------------------------------------------------------------------------------------------------------
529  while (n_cigar_op > 0) // until stream is not empty
530  {
531  std::ranges::copy_n(std::ranges::begin(cigar_input), sizeof(o_and_c), reinterpret_cast<char*>(&o_and_c));
532  assert((o_and_c & CIGAR_MASK) <= 8u);
533  cigar_op = CIGAR_MAPPING[o_and_c & CIGAR_MASK];
534  cigar_count = o_and_c >> 4;
535 
536  if (is_char<'S'>(cigar_op)) // we are at the end, hard clipping afterwards can be ignored
537  {
538  sc_end_count = cigar_count;
539  return std::tuple{operations, ref_length, seq_length, sc_begin_count, sc_end_count};
540  }
541 
542  update_lengths_fn();
543  operations.push_back({cigar_op, cigar_count});
544  --n_cigar_op;
545  }
546  return std::tuple{operations, ref_length, seq_length, sc_begin_count, sc_end_count};
547 }
548 
572 template <TupleLike alignment_type>
574  requires std::tuple_size_v<remove_cvref_t<alignment_type>> == 2 &&
575  detail::all_satisfy_aligned_seq<detail::tuple_type_list_t<alignment_type>>
577 void alignment_from_cigar(alignment_type & alignment, std::vector<std::pair<char, size_t>> const & cigar)
578 {
579  using std::get;
580  auto current_ref_pos = std::ranges::begin(get<0>(alignment));
581  auto current_read_pos = std::ranges::begin(get<1>(alignment));
582 
583  for (auto [cigar_op, cigar_count] : cigar)
584  {
585  if (is_char<'M'>(cigar_op) || is_char<'='>(cigar_op) || is_char<'X'>(cigar_op))
586  {
587  std::ranges::advance(current_ref_pos , cigar_count);
588  std::ranges::advance(current_read_pos, cigar_count);
589  }
590  else if (is_char<'D'>(cigar_op) || is_char<'N'>(cigar_op)) // insert gaps into read
591  {
592  assert(std::distance(current_read_pos, std::ranges::end(get<1>(alignment))) >= 0);
593  current_read_pos = insert_gap(get<1>(alignment), current_read_pos, cigar_count);
594  ++current_read_pos;
595  std::ranges::advance(current_ref_pos , cigar_count);
596  }
597  else if (is_char<'I'>(cigar_op)) // Insert gaps into ref
598  {
599  assert(std::ranges::distance(current_ref_pos, std::ranges::end(get<0>(alignment))) >= 0);
600  current_ref_pos = insert_gap(get<0>(alignment), current_ref_pos, cigar_count);
601  ++current_ref_pos;
602  std::ranges::advance(current_read_pos, cigar_count);
603  }
604  else // illegal character
605  {
606  if (is_char<'P'>(cigar_op))
607  throw format_error{"We do currently not support cigar operation 'P'."};
608  else
609  throw format_error{std::string{"Illegal cigar operation: "} + std::string{cigar_op}};
610  }
611  }
612 }
613 
615 struct access_restrictor_fn
616 {
618  template <typename chr_t>
619  [[noreturn]] chr_t operator()(chr_t) const
620  {
621  throw std::logic_error{"Access is not allowed because there is no sequence information."};
622  }
623 };
624 
625 } // namespace seqan3::detail
::ranges::equal equal
Alias for ranges::equal. Determines if two sets of elements are the same.
Definition: algorithm:54
::ranges::distance distance
Alias for ranges::distance. Returns the number of hops from first to last.
Definition: iterator:321
::ranges::next next
Alias for ranges::next. Returns the nth successor of the given iterator.
Definition: iterator:331
aligned_seq_t::iterator insert_gap(aligned_seq_t &aligned_seq, typename aligned_seq_t::const_iterator pos_it)
An implementation of seqan3::AlignedSequence::insert_gap for sequence containers. ...
Definition: aligned_sequence_concept.hpp:229
T distance(T... args)
Includes the AlignedSequence and the related insert_gap and erase_gap functions to enable stl contain...
constexpr auto zip
A range adaptor that transforms a tuple of range into a range of tuples.
Definition: ranges:948
::ranges::copy_n copy_n
Alias for ranges::copy_n. Copies a range of exactly n elements to a new location. ...
Definition: algorithm:49
Provides std::from_chars and std::to_chars if not defined in the stl <charconv> header.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides seqan3::TupleLike.
auto constexpr is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:287
The Concepts library.
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::advance advance
Alias for ranges::advance. Advances the iterator by the given distance.
Definition: iterator:316
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
constexpr auto single_pass_input
A view adapter that decays most of the range properties and adds single pass behavior.
Definition: single_pass_input.hpp:381
Definition: aligned_sequence_concept.hpp:35
auto const get
A view calling std::get on each element in a range.
Definition: get.hpp:66
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:613
Adaptations of algorithms from the Ranges TS.
Provides seqan3::single_pass_input_view.
T from_chars(T... args)
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179