SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <vector>
18 
46 #include <seqan3/std/algorithm>
47 #include <seqan3/std/concepts>
48 #include <seqan3/std/ranges>
49 
50 namespace seqan3
51 {
52 
125 class format_sam : private detail::format_sam_base
126 {
127 public:
131  // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
132  format_sam() = default;
133  format_sam(format_sam const &) = default;
134  format_sam & operator=(format_sam const &) = default;
135  format_sam(format_sam &&) = default;
136  format_sam & operator=(format_sam &&) = default;
137  ~format_sam() = default;
138 
142  {
143  { "sam" },
144  };
145 
146 protected:
147  template <typename stream_type, // constraints checked by file
148  typename seq_legal_alph_type, bool seq_qual_combined,
149  typename seq_type, // other constraints checked inside function
150  typename id_type,
151  typename qual_type>
152  void read_sequence_record(stream_type & stream,
154  seq_type & sequence,
155  id_type & id,
156  qual_type & qualities);
157 
158  template <typename stream_type, // constraints checked by file
159  typename seq_type, // other constraints checked inside function
160  typename id_type,
161  typename qual_type>
162  void write_sequence_record(stream_type & stream,
163  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
164  seq_type && sequence,
165  id_type && id,
166  qual_type && qualities);
167 
168  template <typename stream_type, // constraints checked by file
169  typename seq_legal_alph_type,
170  typename ref_seqs_type,
171  typename ref_ids_type,
172  typename seq_type,
173  typename id_type,
174  typename offset_type,
175  typename ref_seq_type,
176  typename ref_id_type,
177  typename ref_offset_type,
178  typename align_type,
179  typename cigar_type,
180  typename flag_type,
181  typename mapq_type,
182  typename qual_type,
183  typename mate_type,
184  typename tag_dict_type,
185  typename e_value_type,
186  typename bit_score_type>
187  void read_alignment_record(stream_type & stream,
188  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
189  ref_seqs_type & ref_seqs,
191  seq_type & seq,
192  qual_type & qual,
193  id_type & id,
194  offset_type & offset,
195  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
196  ref_id_type & ref_id,
197  ref_offset_type & ref_offset,
198  align_type & align,
199  cigar_type & cigar_vector,
200  flag_type & flag,
201  mapq_type & mapq,
202  mate_type & mate,
203  tag_dict_type & tag_dict,
204  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
205  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
206 
207  template <typename stream_type,
208  typename header_type,
209  typename seq_type,
210  typename id_type,
211  typename ref_seq_type,
212  typename ref_id_type,
213  typename align_type,
214  typename qual_type,
215  typename mate_type,
216  typename tag_dict_type,
217  typename e_value_type,
218  typename bit_score_type>
219  void write_alignment_record(stream_type & stream,
220  alignment_file_output_options const & options,
221  header_type && header,
222  seq_type && seq,
223  qual_type && qual,
224  id_type && id,
225  int32_t const offset,
226  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
227  ref_id_type && ref_id,
228  std::optional<int32_t> ref_offset,
229  align_type && align,
230  std::vector<cigar> const & cigar_vector,
231  sam_flag const flag,
232  uint8_t const mapq,
233  mate_type && mate,
234  tag_dict_type && tag_dict,
235  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
236  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
237 
238 private:
240  std::string tmp_qual{};
241 
243  static constexpr std::string_view dummy{};
244 
246  alignment_file_header<> default_header{};
247 
249  bool ref_info_present_in_header{false};
250 
252  std::string_view const & default_or(detail::ignore_t) const noexcept
253  {
254  return dummy;
255  }
256 
258  template <typename t>
259  decltype(auto) default_or(t && v) const noexcept
260  {
261  return std::forward<t>(v);
262  }
263 
264  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
265 
266  template <typename stream_view_type, typename value_type>
267  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
268  stream_view_type && stream_view,
269  value_type value);
270 
271  template <typename stream_view_type>
272  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
273 
274  template <typename stream_it_t, std::ranges::forward_range field_type>
275  void write_range(stream_it_t & stream_it, field_type && field_value);
276 
277  template <typename stream_it_t>
278  void write_range(stream_it_t & stream_it, char const * const field_value);
279 
280  template <typename stream_t, arithmetic field_type>
281  void write_field(stream_t & stream, field_type field_value);
282 
283  template <typename stream_t>
284  void write_tag_fields(stream_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
285 };
286 
288 template <typename stream_type, // constraints checked by file
289  typename seq_legal_alph_type, bool seq_qual_combined,
290  typename seq_type, // other constraints checked inside function
291  typename id_type,
292  typename qual_type>
293 inline void format_sam::read_sequence_record(stream_type & stream,
295  seq_type & sequence,
296  id_type & id,
297  qual_type & qualities)
298 {
300 
301  if constexpr (seq_qual_combined)
302  {
303  tmp_qual.clear();
304  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, tmp_qual, id,
305  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
306  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
307 
308  for (auto sit = tmp_qual.begin(), dit = std::ranges::begin(sequence); sit != tmp_qual.end(); ++sit, ++dit)
309  get<1>(*dit).assign_char(*sit);
310  }
311  else
312  {
313  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
314  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
315  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
316  }
317 
318  if constexpr (!detail::decays_to_ignore_v<seq_type>)
319  if (std::ranges::distance(sequence) == 0)
320  throw parse_error{"The sequence information must not be empty."};
321  if constexpr (!detail::decays_to_ignore_v<id_type>)
322  if (std::ranges::distance(id) == 0)
323  throw parse_error{"The id information must not be empty."};
324 
325  if (options.truncate_ids)
326  id = id | views::take_until_and_consume(is_space) | views::to<id_type>;
327 }
328 
330 template <typename stream_type, // constraints checked by file
331  typename seq_type, // other constraints checked inside function
332  typename id_type,
333  typename qual_type>
334 inline void format_sam::write_sequence_record(stream_type & stream,
335  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
336  seq_type && sequence,
337  id_type && id,
338  qual_type && qualities)
339 {
340  using default_align_t = std::pair<std::span<gapped<char>>, std::span<gapped<char>>>;
341  using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
342 
343  alignment_file_output_options output_options;
344 
345  write_alignment_record(stream,
346  output_options,
347  /*header*/ std::ignore,
348  /*seq*/ default_or(sequence),
349  /*qual*/ default_or(qualities),
350  /*id*/ default_or(id),
351  /*offset*/ 0,
352  /*ref_seq*/ std::string_view{},
353  /*ref_id*/ std::string_view{},
354  /*ref_offset*/ -1,
355  /*align*/ default_align_t{},
356  /*cigar_vector*/ std::vector<cigar>{},
357  /*flag*/ sam_flag::none,
358  /*mapq*/ 0,
359  /*mate*/ default_mate_t{},
360  /*tag_dict*/ sam_tag_dictionary{},
361  /*e_value*/ 0,
362  /*bit_score*/ 0);
363 }
364 
366 template <typename stream_type, // constraints checked by file
367  typename seq_legal_alph_type,
368  typename ref_seqs_type,
369  typename ref_ids_type,
370  typename seq_type,
371  typename id_type,
372  typename offset_type,
373  typename ref_seq_type,
374  typename ref_id_type,
375  typename ref_offset_type,
376  typename align_type,
377  typename cigar_type,
378  typename flag_type,
379  typename mapq_type,
380  typename qual_type,
381  typename mate_type,
382  typename tag_dict_type,
383  typename e_value_type,
384  typename bit_score_type>
385 inline void format_sam::read_alignment_record(stream_type & stream,
386  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
387  ref_seqs_type & ref_seqs,
389  seq_type & seq,
390  qual_type & qual,
391  id_type & id,
392  offset_type & offset,
393  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
394  ref_id_type & ref_id,
395  ref_offset_type & ref_offset,
396  align_type & align,
397  cigar_type & cigar_vector,
398  flag_type & flag,
399  mapq_type & mapq,
400  mate_type & mate,
401  tag_dict_type & tag_dict,
402  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
403  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
404 {
405  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
406  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
407  "The ref_offset must be a specialisation of std::optional.");
408 
409  auto stream_view = views::istreambuf(stream);
410  auto field_view = stream_view | views::take_until_or_throw_and_consume(is_char<'\t'>);
411 
412  // these variables need to be stored to compute the ALIGNMENT
413  int32_t ref_offset_tmp{};
414  std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
415  [[maybe_unused]] int32_t offset_tmp{};
416  [[maybe_unused]] int32_t soft_clipping_end{};
417  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
418  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
419 
420  // Header
421  // -------------------------------------------------------------------------------------------------------------
422  if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
423  {
424  read_header(stream_view, header, ref_seqs);
425 
426  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
427  return;
428  }
429 
430  // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
431  // -------------------------------------------------------------------------------------------------------------
432  read_field(field_view, id);
433 
434  uint16_t flag_integral{};
435  read_field(field_view, flag_integral);
436  flag = sam_flag{flag_integral};
437 
438  read_field(field_view, ref_id_tmp);
439  check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
440 
441  read_field(field_view, ref_offset_tmp);
442  --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
443 
444  if (ref_offset_tmp == -1)
445  ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
446  else if (ref_offset_tmp > -1)
447  ref_offset = ref_offset_tmp;
448  else if (ref_offset_tmp < -1)
449  throw format_error{"No negative values are allowed for field::ref_offset."};
450 
451  read_field(field_view, mapq);
452 
453  // Field 6: CIGAR
454  // -------------------------------------------------------------------------------------------------------------
455  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
456  {
457  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
458  {
459  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(field_view);
460  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
461  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
462  }
463  else
464  {
465  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
466  }
467  }
468  else
469  {
470  detail::consume(field_view);
471  }
472 
473  offset = offset_tmp;
474 
475  // Field 7-9: (RNEXT PNEXT TLEN) = MATE
476  // -------------------------------------------------------------------------------------------------------------
477  if constexpr (!detail::decays_to_ignore_v<mate_type>)
478  {
479  std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
480  read_field(field_view, tmp_mate_ref_id); // RNEXT
481 
482  if (tmp_mate_ref_id == "=") // indicates "same as ref id"
483  {
484  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
485  get<0>(mate) = ref_id;
486  else
487  check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
488  }
489  else
490  {
491  check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
492  }
493 
494  int32_t tmp_pnext{};
495  read_field(field_view, tmp_pnext); // PNEXT
496 
497  if (tmp_pnext > 0)
498  get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
499  else if (tmp_pnext < 0)
500  throw format_error{"No negative values are allowed at the mate mapping position."};
501  // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
502 
503  read_field(field_view, get<2>(mate)); // TLEN
504  }
505  else
506  {
507  for (size_t i = 0; i < 3u; ++i)
508  {
509  detail::consume(field_view);
510  }
511  }
512 
513  // Field 10: Sequence
514  // -------------------------------------------------------------------------------------------------------------
515  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
516  {
517  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
518  auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
519  {
520  if (!is_legal_alph(c))
521  throw parse_error{std::string{"Encountered an unexpected letter: "} +
522  is_legal_alph.msg +
523  " evaluated to false on " +
524  detail::make_printable(c)};
525  return c;
526  });
527 
528  if constexpr (detail::decays_to_ignore_v<seq_type>)
529  {
530  if constexpr (!detail::decays_to_ignore_v<align_type>)
531  {
532  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
533  "If you want to read ALIGNMENT but not SEQ, the alignment"
534  " object must store a sequence container at the second (query) position.");
535 
536  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
537  {
538 
539  auto tmp_iter = std::ranges::begin(seq_stream);
540  std::ranges::advance(tmp_iter, offset_tmp);
541 
542  for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
543  {
544  get<1>(align).push_back(std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
545  ++tmp_iter;
546  }
547 
548  std::ranges::advance(tmp_iter, soft_clipping_end);
549  }
550  else
551  {
552  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
553  }
554  }
555  else
556  {
557  detail::consume(seq_stream);
558  }
559  }
560  else
561  {
562  read_field(seq_stream, seq);
563 
564  if constexpr (!detail::decays_to_ignore_v<align_type>)
565  {
566  if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
567  {
568  assign_unaligned(get<1>(align),
569  seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
570  std::ranges::size(seq) - soft_clipping_end));
571  }
572  }
573  }
574  }
575  else
576  {
577  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
578  }
579 
580  // Field 11: Quality
581  // -------------------------------------------------------------------------------------------------------------
582  auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
583  read_field(stream_view | views::take_until_or_throw(tab_or_end), qual);
584 
585  if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
586  {
587  if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
588  std::ranges::distance(seq) != std::ranges::distance(qual))
589  {
590  throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
591  ") and quality length (", std::ranges::distance(qual),
592  ") must be the same.")};
593  }
594  }
595 
596  // All remaining optional fields if any: SAM tags dictionary
597  // -------------------------------------------------------------------------------------------------------------
598  while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
599  {
600  std::ranges::next(std::ranges::begin(stream_view)); // skip tab
601  read_field(stream_view | views::take_until_or_throw(tab_or_end), tag_dict);
602  }
603 
604  detail::consume(stream_view | views::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
605 
606  // DONE READING - wrap up
607  // -------------------------------------------------------------------------------------------------------------
608  // Alignment object construction
609  // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
610  if constexpr (!detail::decays_to_ignore_v<align_type>)
611  {
612  int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
613 
614  if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
615  {
616  if (!ref_id_tmp.empty())
617  {
618  assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
619  ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
620  }
621  }
622 
623  construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
624  }
625 
626  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
627  std::swap(cigar_vector, tmp_cigar_vector);
628 }
629 
631 template <typename stream_type,
632  typename header_type,
633  typename seq_type,
634  typename id_type,
635  typename ref_seq_type,
636  typename ref_id_type,
637  typename align_type,
638  typename qual_type,
639  typename mate_type,
640  typename tag_dict_type,
641  typename e_value_type,
642  typename bit_score_type>
643 inline void format_sam::write_alignment_record(stream_type & stream,
644  alignment_file_output_options const & options,
645  header_type && header,
646  seq_type && seq,
647  qual_type && qual,
648  id_type && id,
649  int32_t const offset,
650  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
651  ref_id_type && ref_id,
652  std::optional<int32_t> ref_offset,
653  align_type && align,
654  std::vector<cigar> const & cigar_vector,
655  sam_flag const flag,
656  uint8_t const mapq,
657  mate_type && mate,
658  tag_dict_type && tag_dict,
659  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
660  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
661 {
662  /* Note the following general things:
663  *
664  * - Given the SAM specifications, all fields may be empty
665  *
666  * - arithmetic values default to 0 while all others default to '*'
667  *
668  * - Because of the former, arithmetic values can be directly streamed
669  * into 'stream' as operator<< is defined for all arithmetic types
670  * and the default value (0) is also the SAM default.
671  *
672  * - All other non-arithmetic values need to be checked for emptiness
673  */
674 
675  // ---------------------------------------------------------------------
676  // Type Requirements (as static asserts for user friendliness)
677  // ---------------------------------------------------------------------
678  static_assert((std::ranges::forward_range<seq_type> &&
679  alphabet<std::ranges::range_reference_t<seq_type>>),
680  "The seq object must be a std::ranges::forward_range over "
681  "letters that model seqan3::alphabet.");
682 
683  static_assert((std::ranges::forward_range<id_type> &&
684  alphabet<std::ranges::range_reference_t<id_type>>),
685  "The id object must be a std::ranges::forward_range over "
686  "letters that model seqan3::alphabet.");
687 
688  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
689  {
690  static_assert((std::ranges::forward_range<ref_id_type> ||
691  std::integral<std::remove_reference_t<ref_id_type>> ||
692  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
693  "The ref_id object must be a std::ranges::forward_range "
694  "over letters that model seqan3::alphabet.");
695 
696  if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
697  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
698  static_assert(!detail::decays_to_ignore_v<header_type>,
699  "If you give indices as reference id information the header must also be present.");
700  }
701 
703  "The align object must be a std::pair of two ranges whose "
704  "value_type is comparable to seqan3::gap");
705 
706  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
707  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
708  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
709  "The align object must be a std::pair of two ranges whose "
710  "value_type is comparable to seqan3::gap");
711 
712  static_assert((std::ranges::forward_range<qual_type> &&
713  alphabet<std::ranges::range_reference_t<qual_type>>),
714  "The qual object must be a std::ranges::forward_range "
715  "over letters that model seqan3::alphabet.");
716 
718  "The mate object must be a std::tuple of size 3 with "
719  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
720  "2) a std::integral or std::optional<std::integral>, and "
721  "3) a std::integral.");
722 
723  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
724  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
725  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
726  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
727  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
728  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
729  "The mate object must be a std::tuple of size 3 with "
730  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
731  "2) a std::integral or std::optional<std::integral>, and "
732  "3) a std::integral.");
733 
734  if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
735  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
736  static_assert(!detail::decays_to_ignore_v<header_type>,
737  "If you give indices as mate reference id information the header must also be present.");
738 
739  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
740  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
741 
742  // ---------------------------------------------------------------------
743  // logical Requirements
744  // ---------------------------------------------------------------------
745  if constexpr (!detail::decays_to_ignore_v<header_type> &&
746  !detail::decays_to_ignore_v<ref_id_type> &&
747  !std::integral<std::remove_reference_t<ref_id_type>> &&
748  !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
749  {
750 
751  if (options.sam_require_header && !std::ranges::empty(ref_id))
752  {
753  auto id_it = header.ref_dict.end();
754 
755  if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
756  std::ranges::sized_range<decltype(ref_id)> &&
757  std::ranges::borrowed_range<decltype(ref_id)>)
758  {
759  id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
760  }
761  else
762  {
763  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
764 
766  "The ref_id type is not convertible to the reference id information stored in the "
767  "reference dictionary of the header object.");
768 
769  id_it = header.ref_dict.find(ref_id);
770  }
771 
772  if (id_it == header.ref_dict.end()) // no reference id matched
773  throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
774  header.ref_ids())};
775  }
776  }
777 
778  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
779  throw format_error{"The ref_offset object must be an std::integral >= 0."};
780 
781  // ---------------------------------------------------------------------
782  // Writing the Header on first call
783  // ---------------------------------------------------------------------
784  if constexpr (!detail::decays_to_ignore_v<header_type>)
785  {
786  if (options.sam_require_header && !header_was_written)
787  {
788  write_header(stream, options, header);
789  header_was_written = true;
790  }
791  }
792 
793  // ---------------------------------------------------------------------
794  // Writing the Record
795  // ---------------------------------------------------------------------
796  std::cpp20::ostreambuf_iterator stream_it{stream};
797  char const separator{'\t'};
798 
799  write_range(stream_it, std::forward<id_type>(id));
800 
801  stream << separator;
802 
803  stream << static_cast<uint16_t>(flag) << separator;
804 
805  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
806  {
807  if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
808  {
809  write_range(stream_it, (header.ref_ids())[ref_id]);
810  }
811  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
812  {
813  if (ref_id.has_value())
814  write_range(stream_it, (header.ref_ids())[ref_id.value()]);
815  else
816  stream << '*';
817  }
818  else
819  {
820  write_range(stream_it, std::forward<ref_id_type>(ref_id));
821  }
822  }
823  else
824  {
825  stream << '*';
826  }
827 
828  stream << separator;
829 
830  // SAM is 1 based, 0 indicates unmapped read if optional is not set
831  stream << (ref_offset.value_or(-1) + 1) << separator;
832 
833  stream << static_cast<unsigned>(mapq) << separator;
834 
835  if (!std::ranges::empty(cigar_vector))
836  {
837  for (auto & c : cigar_vector)
838  stream << c.to_string(); // returns a small_vector instead of char so write_range doesn't work
839  }
840  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
841  {
842  // compute possible distance from alignment end to sequence end
843  // which indicates soft clipping at the end.
844  // This should be replace by a free count_gaps function for
845  // aligned sequences which is more efficient if possible.
846  size_t off_end{std::ranges::size(seq) - offset};
847  for (auto chr : get<1>(align))
848  if (chr == gap{})
849  ++off_end;
850  off_end -= std::ranges::size(get<1>(align));
851 
852  write_range(stream_it, detail::get_cigar_string(std::forward<align_type>(align), offset, off_end));
853  }
854  else
855  {
856  stream << '*';
857  }
858 
859  stream << separator;
860 
861  if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
862  {
863  write_range(stream_it, (header.ref_ids())[get<0>(mate)]);
864  }
865  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
866  {
867  if (get<0>(mate).has_value())
868  // value_or(0) instead of value() (which is equivalent here) as a
869  // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
870  write_range(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
871  else
872  stream << '*';
873  }
874  else
875  {
876  write_range(stream_it, get<0>(mate));
877  }
878 
879  stream << separator;
880 
881  if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
882  {
883  // SAM is 1 based, 0 indicates unmapped read if optional is not set
884  stream << (get<1>(mate).value_or(-1) + 1) << separator;
885  }
886  else
887  {
888  stream << get<1>(mate) << separator;
889  }
890 
891  stream << get<2>(mate) << separator;
892 
893  write_range(stream_it, std::forward<seq_type>(seq));
894 
895  stream << separator;
896 
897  write_range(stream_it, std::forward<qual_type>(qual));
898 
899  write_tag_fields(stream, std::forward<tag_dict_type>(tag_dict), separator);
900 
901  detail::write_eol(stream_it, options.add_carriage_return);
902 }
903 
904 
922 template <typename stream_view_type, typename value_type>
923 inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
924  stream_view_type && stream_view,
925  value_type value)
926 {
927  std::vector<value_type> tmp_vector;
928  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
929  {
930  read_field(stream_view | views::take_until(is_char<','>), value);
931  tmp_vector.push_back(value);
932 
933  if (is_char<','>(*std::ranges::begin(stream_view)))
934  std::ranges::next(std::ranges::begin(stream_view)); // skip ','
935  }
936  variant = std::move(tmp_vector);
937 }
938 
956 template <typename stream_view_type>
957 inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
958 {
959  /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
960  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
961  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
962  VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
963  */
964  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
965  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
966  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
967  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
968  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
969  char type_id = *std::ranges::begin(stream_view);
970  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
971  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
972 
973  switch (type_id)
974  {
975  case 'A' : // char
976  {
977  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
978  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
979  break;
980  }
981  case 'i' : // int32_t
982  {
983  int32_t tmp;
984  read_field(stream_view, tmp);
985  target[tag] = tmp;
986  break;
987  }
988  case 'f' : // float
989  {
990  float tmp;
991  read_field(stream_view, tmp);
992  target[tag] = tmp;
993  break;
994  }
995  case 'Z' : // string
996  {
997  target[tag] = stream_view | views::to<std::string>;
998  break;
999  }
1000  case 'H' :
1001  {
1002  // TODO
1003  break;
1004  }
1005  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1006  {
1007  char array_value_type_id = *std::ranges::begin(stream_view);
1008  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1009  std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1010 
1011  switch (array_value_type_id)
1012  {
1013  case 'c' : // int8_t
1014  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1015  break;
1016  case 'C' : // uint8_t
1017  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1018  break;
1019  case 's' : // int16_t
1020  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1021  break;
1022  case 'S' : // uint16_t
1023  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1024  break;
1025  case 'i' : // int32_t
1026  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1027  break;
1028  case 'I' : // uint32_t
1029  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1030  break;
1031  case 'f' : // float
1032  read_sam_dict_vector(target[tag], stream_view, float{});
1033  break;
1034  default:
1035  throw format_error{std::string("The first character in the numerical ") +
1036  "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1037  "' was given."};
1038  }
1039  break;
1040  }
1041  default:
1042  throw format_error{std::string("The second character in the numerical id of a "
1043  "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1044  }
1045 }
1046 
1054 template <typename stream_it_t, std::ranges::forward_range field_type>
1055 inline void format_sam::write_range(stream_it_t & stream_it, field_type && field_value)
1056 {
1057  if (std::ranges::empty(field_value))
1058  stream_it = '*';
1059  else
1060  std::ranges::copy(field_value | views::to_char, stream_it);
1061 }
1062 
1069 template <typename stream_it_t>
1070 inline void format_sam::write_range(stream_it_t & stream_it, char const * const field_value)
1071 {
1072  write_range(stream_it, std::string_view{field_value});
1073 }
1074 
1080 template <typename stream_t, arithmetic field_type>
1081 inline void format_sam::write_field(stream_t & stream, field_type field_value)
1082 {
1083  // TODO: replace this with to_chars for efficiency
1084  if constexpr (std::same_as<field_type, int8_t> || std::same_as<field_type, uint8_t>)
1085  stream << static_cast<int16_t>(field_value);
1086  else
1087  stream << field_value;
1088 }
1089 
1097 template <typename stream_t>
1098 inline void format_sam::write_tag_fields(stream_t & stream, sam_tag_dictionary const & tag_dict, char const separator)
1099 {
1100  auto stream_variant_fn = [this, &stream] (auto && arg) // helper to print an std::variant
1101  {
1102  using T = std::remove_cvref_t<decltype(arg)>;
1103 
1104  if constexpr (!container<T> || std::same_as<T, std::string>)
1105  {
1106  stream << arg;
1107  }
1108  else
1109  {
1110  if (arg.begin() != arg.end())
1111  {
1112  for (auto it = arg.begin(); it != (arg.end() - 1); ++it)
1113  {
1114  write_field(stream, *it);
1115  stream << ',';
1116  }
1117 
1118  write_field(stream, *(arg.end() - 1)); // write last value without trailing ','
1119  }
1120  }
1121  };
1122 
1123  for (auto & [tag, variant] : tag_dict)
1124  {
1125  stream << separator;
1126 
1127  char char0 = tag / 256;
1128  char char1 = tag % 256;
1129 
1130  stream << char0 << char1 << ':' << detail::sam_tag_type_char[variant.index()] << ':';
1131 
1132  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1133  stream << detail::sam_tag_type_char_extra[variant.index()] << ',';
1134 
1135  std::visit(stream_variant_fn, variant);
1136  }
1137 }
1138 
1139 } // namespace seqan3
seqan3::gap
The alphabet of a gap character '-'.
Definition: gap.hpp:37
seqan3::alignment_file_output_options
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
std::optional::has_value
T has_value(T... args)
misc.hpp
Provides helper data structures for the seqan3::alignment_file_output.
seqan3::views::take_until_and_consume
constexpr auto take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:638
take_until.hpp
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
input_format_concept.hpp
Provides seqan3::sequence_file_input_format and auxiliary classes.
std::span
seqan3::alignment_file_output_options::add_carriage_return
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:27
std::string
seqan3::sequence_file_input_options::truncate_ids
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:28
sequence_container
A more refined container concept than seqan3::container.
seqan3::format_sam::operator=
format_sam & operator=(format_sam const &)=default
Defaulted.
seqan3::format_sam::write_sequence_record
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:334
std::string_view
tuple.hpp
Provides seqan3::tuple_like.
sam_tag_dictionary.hpp
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
std::pair
seqan3::format_error
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:85
vector
seqan3::alignment_file_output_options::sam_require_header
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:41
std::optional::value_or
T value_or(T... args)
template_inspection.hpp
Provides seqan3::type_list and auxiliary type traits.
seqan3::sam_flag
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: misc.hpp:71
std::tuple
seqan3::alignment_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
seqan3::is_space
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
seqan3::alignment_file_header::ref_ids
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
seqan3::format_sam::read_alignment_record
void read_alignment_record(stream_type &stream, alignment_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, alignment_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:385
algorithm
Adaptations of algorithms from the Ranges TS.
std::string::clear
T clear(T... args)
concepts
The Concepts library.
input_format_concept.hpp
Provides seqan3::alignment_file_input_format and auxiliary classes.
seqan3::sequence_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
std::tie
T tie(T... args)
std::vector::push_back
T push_back(T... args)
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
seqan3::format_sam::file_extensions
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:142
seqan3::value_type
Exposes the value_type of another type.
Definition: pre.hpp:58
slice.hpp
Provides seqan3::views::slice.
seqan3::views::take_until_or_throw_and_consume
constexpr auto take_until_or_throw_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:652
to.hpp
Provides seqan3::views::to.
seqan3::views::move
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
seqan3::sam_flag::none
@ none
None of the flags below are set.
seqan3::parse_error
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
seqan3::alignment_file_header
Stores the header information of alignment files.
Definition: header.hpp:33
range.hpp
Provides various transformation traits used by the range module.
core_language.hpp
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
seqan3::alignment_file_header::ref_dict
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:158
header.hpp
Provides the seqan3::alignment_file_header class.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
seqan3::format_sam::~format_sam
~format_sam()=default
seqan3::views::take_until
constexpr auto take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:610
seqan3::sequence_file_output_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
detail.hpp
Auxiliary functions for the alignment IO.
output_options.hpp
Provides seqan3::sequence_file_output_options.
output_options.hpp
Provides seqan3::alignment_file_output_options.
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
tuple_like
Whether a type behaves like a tuple.
std::swap
T swap(T... args)
std::variant::index
T index(T... args)
to_char.hpp
Provides seqan3::views::to_char.
std::optional::value
T value(T... args)
ranges
Adaptations of concepts from the Ranges TS.
seqan3::sam_tag_dictionary
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:325
std::remove_reference_t
seqan3::views::take_until_or_throw
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:624
alphabet
The generic alphabet concept that covers most data types used in ranges.
std::string::begin
T begin(T... args)
container
The (most general) container concept as defined by the standard library.
predicate.hpp
Provides character predicates for tokenisation.
seqan3::views::to_char
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:65
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
ignore_output_iterator.hpp
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
seqan3::format_sam
The SAM format (tag).
Definition: format_sam.hpp:126
std::visit
T visit(T... args)
output_format_concept.hpp
Provides seqan3::alignment_file_output_format and auxiliary classes.
seqan3::format_sam::operator=
format_sam & operator=(format_sam &&)=default
Defaulted.
std::remove_cvref_t
implicitly_convertible_to
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
std::optional
to_string.hpp
Auxiliary for pretty printing of exception messages.
istreambuf.hpp
Provides seqan3::views::istreambuf.
std::string::end
T end(T... args)
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
iterator.hpp
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
misc.hpp
Provides various utility functions.
seqan3::format_sam::read_sequence_record
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:293
input_options.hpp
Provides seqan3::alignment_file_input_options.
format_sam_base.hpp
Provides the seqan3::format_sam_base that can be inherited from.
misc.hpp
Provides various utility functions.
sequence
The generic concept for a sequence.
char_to.hpp
Provides seqan3::views::char_to.
seqan3::format_sam::write_alignment_record
void write_alignment_record(stream_type &stream, alignment_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:643
seqan3::views::istreambuf
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
seqan3::format_sam::format_sam
format_sam()=default
Defaulted.
std::tuple_size_v
T tuple_size_v
seqan3::format_sam::format_sam
format_sam(format_sam &&)=default
Defaulted.
seqan3::format_sam::format_sam
format_sam(format_sam const &)=default
Defaulted.
std::variant
string