SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <seqan3/std/ranges>
17 #include <string>
18 #include <vector>
19 
38 
39 namespace seqan3
40 {
41 
114 class format_sam : private detail::format_sam_base
115 {
116 public:
120  // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
121  format_sam() = default;
122  format_sam(format_sam const &) = default;
123  format_sam & operator=(format_sam const &) = default;
124  format_sam(format_sam &&) = default;
125  format_sam & operator=(format_sam &&) = default;
126  ~format_sam() = default;
127 
129 
132  {
133  { "sam" },
134  };
135 
136 protected:
137 #ifdef SEQAN3_DEPRECATED_310
138  template <typename stream_type, // constraints checked by file
139  typename seq_legal_alph_type, bool seq_qual_combined,
140  typename seq_type, // other constraints checked inside function
141  typename id_type,
142  typename qual_type>
143  void read_sequence_record(stream_type & stream,
145  seq_type & sequence,
146  id_type & id,
147  qual_type & qualities);
148 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
149  template <typename stream_type, // constraints checked by file
150  typename seq_legal_alph_type,
151  typename seq_type, // other constraints checked inside function
152  typename id_type,
153  typename qual_type>
154  void read_sequence_record(stream_type & stream,
156  seq_type & sequence,
157  id_type & id,
158  qual_type & qualities);
159 #endif // SEQAN3_DEPRECATED_310
160 
161  template <typename stream_type, // constraints checked by file
162  typename seq_type, // other constraints checked inside function
163  typename id_type,
164  typename qual_type>
165  void write_sequence_record(stream_type & stream,
166  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
167  seq_type && sequence,
168  id_type && id,
169  qual_type && qualities);
170 
171  template <typename stream_type, // constraints checked by file
172  typename seq_legal_alph_type,
173  typename ref_seqs_type,
174  typename ref_ids_type,
175  typename seq_type,
176  typename id_type,
177  typename offset_type,
178  typename ref_seq_type,
179  typename ref_id_type,
180  typename ref_offset_type,
181  typename align_type,
182  typename cigar_type,
183  typename flag_type,
184  typename mapq_type,
185  typename qual_type,
186  typename mate_type,
187  typename tag_dict_type,
188  typename e_value_type,
189  typename bit_score_type>
190  void read_alignment_record(stream_type & stream,
191  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
192  ref_seqs_type & ref_seqs,
194  seq_type & seq,
195  qual_type & qual,
196  id_type & id,
197  offset_type & offset,
198  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
199  ref_id_type & ref_id,
200  ref_offset_type & ref_offset,
201  align_type & align,
202  cigar_type & cigar_vector,
203  flag_type & flag,
204  mapq_type & mapq,
205  mate_type & mate,
206  tag_dict_type & tag_dict,
207  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
208  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
209 
210  template <typename stream_type,
211  typename header_type,
212  typename seq_type,
213  typename id_type,
214  typename ref_seq_type,
215  typename ref_id_type,
216  typename align_type,
217  typename qual_type,
218  typename mate_type,
219  typename tag_dict_type,
220  typename e_value_type,
221  typename bit_score_type>
222  void write_alignment_record(stream_type & stream,
223  sam_file_output_options const & options,
224  header_type && header,
225  seq_type && seq,
226  qual_type && qual,
227  id_type && id,
228  int32_t const offset,
229  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
230  ref_id_type && ref_id,
231  std::optional<int32_t> ref_offset,
232  align_type && align,
233  std::vector<cigar> const & cigar_vector,
234  sam_flag const flag,
235  uint8_t const mapq,
236  mate_type && mate,
237  tag_dict_type && tag_dict,
238  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
239  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
240 
241 private:
243  std::string tmp_qual{};
244 
246  static constexpr std::string_view dummy{};
247 
249  sam_file_header<> default_header{};
250 
252  bool ref_info_present_in_header{false};
253 
255  std::string_view const & default_or(detail::ignore_t) const noexcept
256  {
257  return dummy;
258  }
259 
261  template <typename t>
262  decltype(auto) default_or(t && v) const noexcept
263  {
264  return std::forward<t>(v);
265  }
266 
267  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
268 
269  template <typename stream_view_type, typename value_type>
270  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
271  stream_view_type && stream_view,
272  value_type value);
273 
274  template <typename stream_view_type>
275  void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
276  stream_view_type && stream_view);
277 
278  template <typename stream_view_type>
279  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
280 
281  template <typename stream_it_t, std::ranges::forward_range field_type>
282  void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
283 
284  template <typename stream_it_t>
285  void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
286 
287  template <typename stream_it_t>
288  void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
289 };
290 
292 #ifdef SEQAN3_DEPRECATED_310
293 template <typename stream_type, // constraints checked by file
294  typename seq_legal_alph_type, bool seq_qual_combined,
295  typename seq_type, // other constraints checked inside function
296  typename id_type,
297  typename qual_type>
298 inline void format_sam::read_sequence_record(stream_type & stream,
300  seq_type & sequence,
301  id_type & id,
302  qual_type & qualities)
303 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
304 template <typename stream_type, // constraints checked by file
305  typename seq_legal_alph_type,
306  typename seq_type, // other constraints checked inside function
307  typename id_type,
308  typename qual_type>
309 inline void format_sam::read_sequence_record(stream_type & stream,
311  seq_type & sequence,
312  id_type & id,
313  qual_type & qualities)
314 #endif // SEQAN3_DEPRECATED_310
315 {
317 
318 #ifdef SEQAN3_DEPRECATED_310
319  if constexpr (seq_qual_combined)
320  {
321  tmp_qual.clear();
322  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, tmp_qual, id,
323  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
324  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
325 
326  for (auto sit = tmp_qual.begin(), dit = std::ranges::begin(sequence); sit != tmp_qual.end(); ++sit, ++dit)
327  get<1>(*dit).assign_char(*sit);
328  }
329  else
330 #endif // SEQAN3_DEPRECATED_310
331  {
332  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
333  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
334  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
335  }
336 
337  if constexpr (!detail::decays_to_ignore_v<seq_type>)
338  if (std::ranges::distance(sequence) == 0)
339  throw parse_error{"The sequence information must not be empty."};
340  if constexpr (!detail::decays_to_ignore_v<id_type>)
341  if (std::ranges::distance(id) == 0)
342  throw parse_error{"The id information must not be empty."};
343 
344  if (options.truncate_ids)
345  id = id | detail::take_until_and_consume(is_space) | views::to<id_type>;
346 }
347 
349 template <typename stream_type, // constraints checked by file
350  typename seq_type, // other constraints checked inside function
351  typename id_type,
352  typename qual_type>
353 inline void format_sam::write_sequence_record(stream_type & stream,
354  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
355  seq_type && sequence,
356  id_type && id,
357  qual_type && qualities)
358 {
359  using default_align_t = std::pair<std::span<gapped<char>>, std::span<gapped<char>>>;
360  using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
361 
362  sam_file_output_options output_options;
363 
364  write_alignment_record(stream,
365  output_options,
366  /*header*/ std::ignore,
367  /*seq*/ default_or(sequence),
368  /*qual*/ default_or(qualities),
369  /*id*/ default_or(id),
370  /*offset*/ 0,
371  /*ref_seq*/ std::string_view{},
372  /*ref_id*/ std::string_view{},
373  /*ref_offset*/ -1,
374  /*align*/ default_align_t{},
375  /*cigar_vector*/ std::vector<cigar>{},
376  /*flag*/ sam_flag::none,
377  /*mapq*/ 0,
378  /*mate*/ default_mate_t{},
379  /*tag_dict*/ sam_tag_dictionary{},
380  /*e_value*/ 0,
381  /*bit_score*/ 0);
382 }
383 
385 template <typename stream_type, // constraints checked by file
386  typename seq_legal_alph_type,
387  typename ref_seqs_type,
388  typename ref_ids_type,
389  typename seq_type,
390  typename id_type,
391  typename offset_type,
392  typename ref_seq_type,
393  typename ref_id_type,
394  typename ref_offset_type,
395  typename align_type,
396  typename cigar_type,
397  typename flag_type,
398  typename mapq_type,
399  typename qual_type,
400  typename mate_type,
401  typename tag_dict_type,
402  typename e_value_type,
403  typename bit_score_type>
404 inline void format_sam::read_alignment_record(stream_type & stream,
405  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
406  ref_seqs_type & ref_seqs,
408  seq_type & seq,
409  qual_type & qual,
410  id_type & id,
411  offset_type & offset,
412  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
413  ref_id_type & ref_id,
414  ref_offset_type & ref_offset,
415  align_type & align,
416  cigar_type & cigar_vector,
417  flag_type & flag,
418  mapq_type & mapq,
419  mate_type & mate,
420  tag_dict_type & tag_dict,
421  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
422  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
423 {
424  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
425  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
426  "The ref_offset must be a specialisation of std::optional.");
427 
428  auto stream_view = detail::istreambuf(stream);
429  auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
430 
431  // these variables need to be stored to compute the ALIGNMENT
432  int32_t ref_offset_tmp{};
433  std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
434  [[maybe_unused]] int32_t offset_tmp{};
435  [[maybe_unused]] int32_t soft_clipping_end{};
436  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
437  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
438 
439  // Header
440  // -------------------------------------------------------------------------------------------------------------
441  if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
442  {
443  read_header(stream_view, header, ref_seqs);
444 
445  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
446  return;
447  }
448 
449  // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
450  // -------------------------------------------------------------------------------------------------------------
451  read_field(field_view, id);
452 
453  uint16_t flag_integral{};
454  read_field(field_view, flag_integral);
455  flag = sam_flag{flag_integral};
456 
457  read_field(field_view, ref_id_tmp);
458  check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
459 
460  read_field(field_view, ref_offset_tmp);
461  --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
462 
463  if (ref_offset_tmp == -1)
464  ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
465  else if (ref_offset_tmp > -1)
466  ref_offset = ref_offset_tmp;
467  else if (ref_offset_tmp < -1)
468  throw format_error{"No negative values are allowed for field::ref_offset."};
469 
470  read_field(field_view, mapq);
471 
472  // Field 6: CIGAR
473  // -------------------------------------------------------------------------------------------------------------
474  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
475  {
476  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
477  {
478  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
479  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
480  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
481  }
482  else
483  {
484  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
485  }
486  }
487  else
488  {
489  detail::consume(field_view);
490  }
491 
492  offset = offset_tmp;
493 
494  // Field 7-9: (RNEXT PNEXT TLEN) = MATE
495  // -------------------------------------------------------------------------------------------------------------
496  if constexpr (!detail::decays_to_ignore_v<mate_type>)
497  {
498  std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
499  read_field(field_view, tmp_mate_ref_id); // RNEXT
500 
501  if (tmp_mate_ref_id == "=") // indicates "same as ref id"
502  {
503  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
504  get<0>(mate) = ref_id;
505  else
506  check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
507  }
508  else
509  {
510  check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
511  }
512 
513  int32_t tmp_pnext{};
514  read_field(field_view, tmp_pnext); // PNEXT
515 
516  if (tmp_pnext > 0)
517  get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
518  else if (tmp_pnext < 0)
519  throw format_error{"No negative values are allowed at the mate mapping position."};
520  // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
521 
522  read_field(field_view, get<2>(mate)); // TLEN
523  }
524  else
525  {
526  for (size_t i = 0; i < 3u; ++i)
527  {
528  detail::consume(field_view);
529  }
530  }
531 
532  // Field 10: Sequence
533  // -------------------------------------------------------------------------------------------------------------
534  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
535  {
536  auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
537  auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
538  {
539  if (!is_legal_alph(c))
540  throw parse_error{std::string{"Encountered an unexpected letter: "} +
541  "char_is_valid_for<" +
542  detail::type_name_as_string<seq_legal_alph_type> +
543  "> evaluated to false on " +
544  detail::make_printable(c)};
545  return c;
546  });
547 
548  if constexpr (detail::decays_to_ignore_v<seq_type>)
549  {
550  if constexpr (!detail::decays_to_ignore_v<align_type>)
551  {
552  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
553  "If you want to read ALIGNMENT but not SEQ, the alignment"
554  " object must store a sequence container at the second (query) position.");
555 
556  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
557  {
558 
559  auto tmp_iter = std::ranges::begin(seq_stream);
560  std::ranges::advance(tmp_iter, offset_tmp);
561 
562  for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
563  {
564  get<1>(align).push_back(std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
565  ++tmp_iter;
566  }
567 
568  std::ranges::advance(tmp_iter, soft_clipping_end);
569  }
570  else
571  {
572  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
573  }
574  }
575  else
576  {
577  detail::consume(seq_stream);
578  }
579  }
580  else
581  {
582  read_field(seq_stream, seq);
583 
584  if constexpr (!detail::decays_to_ignore_v<align_type>)
585  {
586  if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
587  {
588  assign_unaligned(get<1>(align),
589  seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
590  std::ranges::size(seq) - soft_clipping_end));
591  }
592  }
593  }
594  }
595  else
596  {
597  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
598  }
599 
600  // Field 11: Quality
601  // -------------------------------------------------------------------------------------------------------------
602  auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
603  read_field(stream_view | detail::take_until_or_throw(tab_or_end), qual);
604 
605  if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
606  {
607  if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
608  std::ranges::distance(seq) != std::ranges::distance(qual))
609  {
610  throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
611  ") and quality length (", std::ranges::distance(qual),
612  ") must be the same.")};
613  }
614  }
615 
616  // All remaining optional fields if any: SAM tags dictionary
617  // -------------------------------------------------------------------------------------------------------------
618  while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
619  {
620  std::ranges::next(std::ranges::begin(stream_view)); // skip tab
621  read_field(stream_view | detail::take_until_or_throw(tab_or_end), tag_dict);
622  }
623 
624  detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
625 
626  // DONE READING - wrap up
627  // -------------------------------------------------------------------------------------------------------------
628  // Alignment object construction
629  // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
630  if constexpr (!detail::decays_to_ignore_v<align_type>)
631  {
632  int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
633 
634  if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
635  {
636  if (!ref_id_tmp.empty())
637  {
638  assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
639  ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
640  }
641  }
642 
643  construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
644  }
645 
646  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
647  std::swap(cigar_vector, tmp_cigar_vector);
648 }
649 
651 template <typename stream_type,
652  typename header_type,
653  typename seq_type,
654  typename id_type,
655  typename ref_seq_type,
656  typename ref_id_type,
657  typename align_type,
658  typename qual_type,
659  typename mate_type,
660  typename tag_dict_type,
661  typename e_value_type,
662  typename bit_score_type>
663 inline void format_sam::write_alignment_record(stream_type & stream,
664  sam_file_output_options const & options,
665  header_type && header,
666  seq_type && seq,
667  qual_type && qual,
668  id_type && id,
669  int32_t const offset,
670  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
671  ref_id_type && ref_id,
672  std::optional<int32_t> ref_offset,
673  align_type && align,
674  std::vector<cigar> const & cigar_vector,
675  sam_flag const flag,
676  uint8_t const mapq,
677  mate_type && mate,
678  tag_dict_type && tag_dict,
679  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
680  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
681 {
682  /* Note the following general things:
683  *
684  * - Given the SAM specifications, all fields may be empty
685  *
686  * - arithmetic values default to 0 while all others default to '*'
687  *
688  * - Because of the former, arithmetic values can be directly streamed
689  * into 'stream' as operator<< is defined for all arithmetic types
690  * and the default value (0) is also the SAM default.
691  *
692  * - All other non-arithmetic values need to be checked for emptiness
693  */
694 
695  // ---------------------------------------------------------------------
696  // Type Requirements (as static asserts for user friendliness)
697  // ---------------------------------------------------------------------
698  static_assert((std::ranges::forward_range<seq_type> &&
699  alphabet<std::ranges::range_reference_t<seq_type>>),
700  "The seq object must be a std::ranges::forward_range over "
701  "letters that model seqan3::alphabet.");
702 
703  static_assert((std::ranges::forward_range<id_type> &&
704  alphabet<std::ranges::range_reference_t<id_type>>),
705  "The id object must be a std::ranges::forward_range over "
706  "letters that model seqan3::alphabet.");
707 
708  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
709  {
710  static_assert((std::ranges::forward_range<ref_id_type> ||
711  std::integral<std::remove_reference_t<ref_id_type>> ||
712  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
713  "The ref_id object must be a std::ranges::forward_range "
714  "over letters that model seqan3::alphabet.");
715 
716  if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
717  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
718  static_assert(!detail::decays_to_ignore_v<header_type>,
719  "If you give indices as reference id information the header must also be present.");
720  }
721 
723  "The align object must be a std::pair of two ranges whose "
724  "value_type is comparable to seqan3::gap");
725 
726  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
727  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
728  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
729  "The align object must be a std::pair of two ranges whose "
730  "value_type is comparable to seqan3::gap");
731 
732  static_assert((std::ranges::forward_range<qual_type> &&
733  alphabet<std::ranges::range_reference_t<qual_type>>),
734  "The qual object must be a std::ranges::forward_range "
735  "over letters that model seqan3::alphabet.");
736 
738  "The mate object must be a std::tuple of size 3 with "
739  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
740  "2) a std::integral or std::optional<std::integral>, and "
741  "3) a std::integral.");
742 
743  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
744  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
745  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
746  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
747  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
748  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
749  "The mate object must be a std::tuple of size 3 with "
750  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
751  "2) a std::integral or std::optional<std::integral>, and "
752  "3) a std::integral.");
753 
754  if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
755  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
756  static_assert(!detail::decays_to_ignore_v<header_type>,
757  "If you give indices as mate reference id information the header must also be present.");
758 
759  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
760  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
761 
762  // ---------------------------------------------------------------------
763  // logical Requirements
764  // ---------------------------------------------------------------------
765  if constexpr (!detail::decays_to_ignore_v<header_type> &&
766  !detail::decays_to_ignore_v<ref_id_type> &&
767  !std::integral<std::remove_reference_t<ref_id_type>> &&
768  !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
769  {
770 
771  if (options.sam_require_header && !std::ranges::empty(ref_id))
772  {
773  auto id_it = header.ref_dict.end();
774 
775  if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
776  std::ranges::sized_range<decltype(ref_id)> &&
777  std::ranges::borrowed_range<decltype(ref_id)>)
778  {
779  id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
780  }
781  else
782  {
783  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
784 
786  "The ref_id type is not convertible to the reference id information stored in the "
787  "reference dictionary of the header object.");
788 
789  id_it = header.ref_dict.find(ref_id);
790  }
791 
792  if (id_it == header.ref_dict.end()) // no reference id matched
793  throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
794  header.ref_ids())};
795  }
796  }
797 
798  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
799  throw format_error{"The ref_offset object must be an std::integral >= 0."};
800 
801  // ---------------------------------------------------------------------
802  // Writing the Header on first call
803  // ---------------------------------------------------------------------
804  if constexpr (!detail::decays_to_ignore_v<header_type>)
805  {
806  if (options.sam_require_header && !header_was_written)
807  {
808  write_header(stream, options, header);
809  header_was_written = true;
810  }
811  }
812 
813  // ---------------------------------------------------------------------
814  // Writing the Record
815  // ---------------------------------------------------------------------
816 
817  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
818  constexpr char separator{'\t'};
819 
820  write_range_or_asterisk(stream_it, id);
821  *stream_it = separator;
822 
823  stream_it.write_number(static_cast<uint16_t>(flag));
824  *stream_it = separator;
825 
826  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
827  {
828  if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
829  {
830  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
831  }
832  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
833  {
834  if (ref_id.has_value())
835  write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
836  else
837  *stream_it = '*';
838  }
839  else
840  {
841  write_range_or_asterisk(stream_it, ref_id);
842  }
843  }
844  else
845  {
846  *stream_it = '*';
847  }
848 
849  *stream_it = separator;
850 
851  // SAM is 1 based, 0 indicates unmapped read if optional is not set
852  stream_it.write_number(ref_offset.value_or(-1) + 1);
853  *stream_it = separator;
854 
855  stream_it.write_number(static_cast<unsigned>(mapq));
856  *stream_it = separator;
857 
858  if (!std::ranges::empty(cigar_vector))
859  {
860  for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
861  stream_it.write_range(c.to_string());
862  }
863  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
864  {
865  // compute possible distance from alignment end to sequence end
866  // which indicates soft clipping at the end.
867  // This should be replace by a free count_gaps function for
868  // aligned sequences which is more efficient if possible.
869  size_t off_end{std::ranges::size(seq) - offset};
870  for (auto chr : get<1>(align))
871  if (chr == gap{})
872  ++off_end;
873 
874  // Might happen if get<1>(align) doesn't correspond to the reference.
875  assert(off_end >= std::ranges::size(get<1>(align)));
876  off_end -= std::ranges::size(get<1>(align));
877 
878  write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
879  }
880  else
881  {
882  *stream_it = '*';
883  }
884 
885  *stream_it = separator;
886 
887  if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
888  {
889  write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
890  }
891  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
892  {
893  if (get<0>(mate).has_value())
894  // value_or(0) instead of value() (which is equivalent here) as a
895  // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
896  write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
897  else
898  *stream_it = '*';
899  }
900  else
901  {
902  write_range_or_asterisk(stream_it, get<0>(mate));
903  }
904 
905  *stream_it = separator;
906 
907  if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
908  {
909  // SAM is 1 based, 0 indicates unmapped read if optional is not set
910  stream_it.write_number(get<1>(mate).value_or(-1) + 1);
911  *stream_it = separator;
912  }
913  else
914  {
915  stream_it.write_number(get<1>(mate));
916  *stream_it = separator;
917  }
918 
919  stream_it.write_number(get<2>(mate));
920  *stream_it = separator;
921 
922  write_range_or_asterisk(stream_it, seq);
923  *stream_it = separator;
924 
925  write_range_or_asterisk(stream_it, qual);
926 
927  write_tag_fields(stream_it, tag_dict, separator);
928 
929  stream_it.write_end_of_line(options.add_carriage_return);
930 }
931 
932 
950 template <typename stream_view_type, typename value_type>
951 inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
952  stream_view_type && stream_view,
953  value_type value)
954 {
955  std::vector<value_type> tmp_vector;
956  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
957  {
958  read_field(stream_view | detail::take_until(is_char<','>), value);
959  tmp_vector.push_back(value);
960 
961  if (is_char<','>(*std::ranges::begin(stream_view)))
962  std::ranges::next(std::ranges::begin(stream_view)); // skip ','
963  }
964  variant = std::move(tmp_vector);
965 }
966 
980 template <typename stream_view_type>
981 inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
982  stream_view_type && stream_view)
983 {
984  std::vector<std::byte> tmp_vector;
985  std::byte value;
986 
987  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
988  {
989  try
990  {
991  read_field(stream_view | detail::take_exactly_or_throw(2), value);
992  }
993  catch (std::exception const & e)
994  {
995  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
996  }
997 
998  tmp_vector.push_back(value);
999  }
1000 
1001  variant = std::move(tmp_vector);
1002 }
1003 
1021 template <typename stream_view_type>
1022 inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1023 {
1024  /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
1025  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1026  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
1027  VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
1028  */
1029  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
1030  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1031  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
1032  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1033  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1034  char type_id = *std::ranges::begin(stream_view);
1035  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1036  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1037 
1038  switch (type_id)
1039  {
1040  case 'A' : // char
1041  {
1042  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1043  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
1044  break;
1045  }
1046  case 'i' : // int32_t
1047  {
1048  int32_t tmp;
1049  read_field(stream_view, tmp);
1050  target[tag] = tmp;
1051  break;
1052  }
1053  case 'f' : // float
1054  {
1055  float tmp;
1056  read_field(stream_view, tmp);
1057  target[tag] = tmp;
1058  break;
1059  }
1060  case 'Z' : // string
1061  {
1062  target[tag] = stream_view | views::to<std::string>;
1063  break;
1064  }
1065  case 'H' :
1066  {
1067  read_sam_byte_vector(target[tag], stream_view);
1068  break;
1069  }
1070  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1071  {
1072  char array_value_type_id = *std::ranges::begin(stream_view);
1073  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1074  std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1075 
1076  switch (array_value_type_id)
1077  {
1078  case 'c' : // int8_t
1079  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1080  break;
1081  case 'C' : // uint8_t
1082  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1083  break;
1084  case 's' : // int16_t
1085  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1086  break;
1087  case 'S' : // uint16_t
1088  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1089  break;
1090  case 'i' : // int32_t
1091  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1092  break;
1093  case 'I' : // uint32_t
1094  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1095  break;
1096  case 'f' : // float
1097  read_sam_dict_vector(target[tag], stream_view, float{});
1098  break;
1099  default:
1100  throw format_error{std::string("The first character in the numerical ") +
1101  "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1102  "' was given."};
1103  }
1104  break;
1105  }
1106  default:
1107  throw format_error{std::string("The second character in the numerical id of a "
1108  "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1109  }
1110 }
1111 
1119 template <typename stream_it_t, std::ranges::forward_range field_type>
1120 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1121 {
1122  if (std::ranges::empty(field_value))
1123  {
1124  *stream_it = '*';
1125  }
1126  else
1127  {
1128  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1129  stream_it.write_range(field_value);
1130  else // convert from alphabets to their character representation
1131  stream_it.write_range(field_value | views::to_char);
1132  }
1133 }
1134 
1141 template <typename stream_it_t>
1142 inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1143 {
1144  write_range_or_asterisk(stream_it, std::string_view{field_value});
1145 }
1146 
1154 template <typename stream_it_t>
1155 inline void format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1156 {
1157  auto const stream_variant_fn = [&stream_it] (auto && arg) // helper to print an std::variant
1158  {
1159  using T = std::remove_cvref_t<decltype(arg)>;
1160 
1161  if constexpr (std::ranges::input_range<T>)
1162  {
1163  if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1164  {
1165  stream_it.write_range(arg);
1166  }
1167  else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1168  {
1169  if (!std::ranges::empty(arg))
1170  {
1171  stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1172 
1173  for (auto && elem : arg | std::views::drop(1))
1174  {
1175  *stream_it = ',';
1176  stream_it.write_number(std::to_integer<uint8_t>(elem));
1177  }
1178  }
1179  }
1180  else
1181  {
1182  if (!std::ranges::empty(arg))
1183  {
1184  stream_it.write_number(*std::ranges::begin(arg));
1185 
1186  for (auto && elem : arg | std::views::drop(1))
1187  {
1188  *stream_it = ',';
1189  stream_it.write_number(elem);
1190  }
1191  }
1192  }
1193  }
1194  else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1195  {
1196  *stream_it = arg;
1197  }
1198  else // number
1199  {
1200  stream_it.write_number(arg);
1201  }
1202  };
1203 
1204  for (auto & [tag, variant] : tag_dict)
1205  {
1206  *stream_it = separator;
1207 
1208  char const char0 = tag / 256;
1209  char const char1 = tag % 256;
1210 
1211  *stream_it = char0;
1212  *stream_it = char1;
1213  *stream_it = ':';
1214  *stream_it = detail::sam_tag_type_char[variant.index()];
1215  *stream_it = ':';
1216 
1217  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1218  {
1219  *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1220  *stream_it = ',';
1221  }
1222 
1223  std::visit(stream_variant_fn, variant);
1224  }
1225 }
1226 
1227 } // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides seqan3::views::to_char.
T begin(T... args)
The SAM format (tag).
Definition: format_sam.hpp:115
format_sam & operator=(format_sam const &)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:298
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:404
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:132
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:353
format_sam(format_sam &&)=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:663
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:32
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:157
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:118
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:332
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_char
Return the char representation of an alphabet object.
Definition: concept.hpp:384
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:388
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
Provides the seqan3::format_sam_base that can be inherited from.
Provides the seqan3::sam_file_header class.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:27
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:41
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:30
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:32
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
T tuple_size_v
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
Provides seqan3::views::slice.
Provides seqan3::views::to.
T visit(T... args)