SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <vector>
18 
45 #include <seqan3/std/algorithm>
46 #include <seqan3/std/concepts>
47 #include <seqan3/std/ranges>
48 
49 namespace seqan3
50 {
51 
124 class format_sam : private detail::format_sam_base
125 {
126 public:
130  // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
131  format_sam() = default;
132  format_sam(format_sam const &) = default;
133  format_sam & operator=(format_sam const &) = default;
134  format_sam(format_sam &&) = default;
135  format_sam & operator=(format_sam &&) = default;
136  ~format_sam() = default;
137 
141  {
142  { "sam" },
143  };
144 
145 protected:
146  template <typename stream_type, // constraints checked by file
147  typename seq_legal_alph_type, bool seq_qual_combined,
148  typename seq_type, // other constraints checked inside function
149  typename id_type,
150  typename qual_type>
151  void read_sequence_record(stream_type & stream,
153  seq_type & sequence,
154  id_type & id,
155  qual_type & qualities);
156 
157  template <typename stream_type, // constraints checked by file
158  typename seq_type, // other constraints checked inside function
159  typename id_type,
160  typename qual_type>
161  void write_sequence_record(stream_type & stream,
162  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
163  seq_type && sequence,
164  id_type && id,
165  qual_type && qualities);
166 
167  template <typename stream_type, // constraints checked by file
168  typename seq_legal_alph_type,
169  typename ref_seqs_type,
170  typename ref_ids_type,
171  typename seq_type,
172  typename id_type,
173  typename offset_type,
174  typename ref_seq_type,
175  typename ref_id_type,
176  typename ref_offset_type,
177  typename align_type,
178  typename cigar_type,
179  typename flag_type,
180  typename mapq_type,
181  typename qual_type,
182  typename mate_type,
183  typename tag_dict_type,
184  typename e_value_type,
185  typename bit_score_type>
186  void read_alignment_record(stream_type & stream,
187  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
188  ref_seqs_type & ref_seqs,
190  seq_type & seq,
191  qual_type & qual,
192  id_type & id,
193  offset_type & offset,
194  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
195  ref_id_type & ref_id,
196  ref_offset_type & ref_offset,
197  align_type & align,
198  cigar_type & cigar_vector,
199  flag_type & flag,
200  mapq_type & mapq,
201  mate_type & mate,
202  tag_dict_type & tag_dict,
203  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
204  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
205 
206  template <typename stream_type,
207  typename header_type,
208  typename seq_type,
209  typename id_type,
210  typename ref_seq_type,
211  typename ref_id_type,
212  typename align_type,
213  typename qual_type,
214  typename mate_type,
215  typename tag_dict_type,
216  typename e_value_type,
217  typename bit_score_type>
218  void write_alignment_record(stream_type & stream,
219  alignment_file_output_options const & options,
220  header_type && header,
221  seq_type && seq,
222  qual_type && qual,
223  id_type && id,
224  int32_t const offset,
225  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
226  ref_id_type && ref_id,
228  align_type && align,
229  std::vector<cigar> const & cigar_vector,
230  sam_flag const flag,
231  uint8_t const mapq,
232  mate_type && mate,
233  tag_dict_type && tag_dict,
234  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
235  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
236 
237 private:
239  std::string tmp_qual{};
240 
242  static constexpr std::string_view dummy{};
243 
245  alignment_file_header<> default_header{};
246 
248  bool ref_info_present_in_header{false};
249 
251  std::string_view const & default_or(detail::ignore_t) const noexcept
252  {
253  return dummy;
254  }
255 
257  template <typename t>
258  decltype(auto) default_or(t && v) const noexcept
259  {
260  return std::forward<t>(v);
261  }
262 
263  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
264 
265  template <typename stream_view_type, typename value_type>
266  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
267  stream_view_type && stream_view,
268  value_type value);
269 
270  template <typename stream_view_type>
271  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
272 
273  template <typename stream_it_t, std::ranges::forward_range field_type>
274  void write_range(stream_it_t & stream_it, field_type && field_value);
275 
276  template <typename stream_it_t>
277  void write_range(stream_it_t & stream_it, char const * const field_value);
278 
279  template <typename stream_t, arithmetic field_type>
280  void write_field(stream_t & stream, field_type field_value);
281 
282  template <typename stream_t>
283  void write_tag_fields(stream_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
284 };
285 
287 template <typename stream_type, // constraints checked by file
288  typename seq_legal_alph_type, bool seq_qual_combined,
289  typename seq_type, // other constraints checked inside function
290  typename id_type,
291  typename qual_type>
292 inline void format_sam::read_sequence_record(stream_type & stream,
294  seq_type & sequence,
295  id_type & id,
296  qual_type & qualities)
297 {
299 
300  if constexpr (seq_qual_combined)
301  {
302  tmp_qual.clear();
303  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, tmp_qual, id,
304  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
305  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
306 
307  for (auto sit = tmp_qual.begin(), dit = std::ranges::begin(sequence); sit != tmp_qual.end(); ++sit, ++dit)
308  get<1>(*dit).assign_char(*sit);
309  }
310  else
311  {
312  read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
313  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
314  std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
315  }
316 
317  if constexpr (!detail::decays_to_ignore_v<seq_type>)
318  if (std::ranges::distance(sequence) == 0)
319  throw parse_error{"The sequence information must not be empty."};
320  if constexpr (!detail::decays_to_ignore_v<id_type>)
321  if (std::ranges::distance(id) == 0)
322  throw parse_error{"The id information must not be empty."};
323 
324  if (options.truncate_ids)
325  id = id | views::take_until_and_consume(is_space) | views::to<id_type>;
326 }
327 
329 template <typename stream_type, // constraints checked by file
330  typename seq_type, // other constraints checked inside function
331  typename id_type,
332  typename qual_type>
333 inline void format_sam::write_sequence_record(stream_type & stream,
334  sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
335  seq_type && sequence,
336  id_type && id,
337  qual_type && qualities)
338 {
339  using default_align_t = std::pair<std::span<gapped<char>>, std::span<gapped<char>>>;
340  using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
341 
342  alignment_file_output_options output_options;
343 
344  write_alignment_record(stream,
345  output_options,
346  /*header*/ std::ignore,
347  /*seq*/ default_or(sequence),
348  /*qual*/ default_or(qualities),
349  /*id*/ default_or(id),
350  /*offset*/ 0,
351  /*ref_seq*/ std::string_view{},
352  /*ref_id*/ std::string_view{},
353  /*ref_offset*/ -1,
354  /*align*/ default_align_t{},
355  /*cigar_vector*/ std::vector<cigar>{},
356  /*flag*/ sam_flag::none,
357  /*mapq*/ 0,
358  /*mate*/ default_mate_t{},
359  /*tag_dict*/ sam_tag_dictionary{},
360  /*e_value*/ 0,
361  /*bit_score*/ 0);
362 }
363 
365 template <typename stream_type, // constraints checked by file
366  typename seq_legal_alph_type,
367  typename ref_seqs_type,
368  typename ref_ids_type,
369  typename seq_type,
370  typename id_type,
371  typename offset_type,
372  typename ref_seq_type,
373  typename ref_id_type,
374  typename ref_offset_type,
375  typename align_type,
376  typename cigar_type,
377  typename flag_type,
378  typename mapq_type,
379  typename qual_type,
380  typename mate_type,
381  typename tag_dict_type,
382  typename e_value_type,
383  typename bit_score_type>
384 inline void format_sam::read_alignment_record(stream_type & stream,
385  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
386  ref_seqs_type & ref_seqs,
388  seq_type & seq,
389  qual_type & qual,
390  id_type & id,
391  offset_type & offset,
392  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
393  ref_id_type & ref_id,
394  ref_offset_type & ref_offset,
395  align_type & align,
396  cigar_type & cigar_vector,
397  flag_type & flag,
398  mapq_type & mapq,
399  mate_type & mate,
400  tag_dict_type & tag_dict,
401  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
402  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
403 {
404  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
405  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
406  "The ref_offset must be a specialisation of std::optional.");
407 
408  auto stream_view = views::istreambuf(stream);
409  auto field_view = stream_view | views::take_until_or_throw_and_consume(is_char<'\t'>);
410 
411  // these variables need to be stored to compute the ALIGNMENT
412  int32_t ref_offset_tmp{};
413  value_type_t<decltype(header.ref_ids())> ref_id_tmp{};
414  [[maybe_unused]] int32_t offset_tmp{};
415  [[maybe_unused]] int32_t soft_clipping_end{};
416  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
417  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
418 
419  // Header
420  // -------------------------------------------------------------------------------------------------------------
421  if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
422  {
423  read_header(stream_view, header, ref_seqs);
424 
425  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
426  return;
427  }
428 
429  // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
430  // -------------------------------------------------------------------------------------------------------------
431  read_field(field_view, id);
432 
433  uint16_t flag_integral{};
434  read_field(field_view, flag_integral);
435  flag = sam_flag{flag_integral};
436 
437  read_field(field_view, ref_id_tmp);
438  check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
439 
440  read_field(field_view, ref_offset_tmp);
441  --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
442 
443  if (ref_offset_tmp == -1)
444  ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
445  else if (ref_offset_tmp > -1)
446  ref_offset = ref_offset_tmp;
447  else if (ref_offset_tmp < -1)
448  throw format_error{"No negative values are allowed for field::ref_offset."};
449 
450  read_field(field_view, mapq);
451 
452  // Field 6: CIGAR
453  // -------------------------------------------------------------------------------------------------------------
454  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
455  {
456  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
457  {
458  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(field_view);
459  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
460  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
461  }
462  else
463  {
464  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
465  }
466  }
467  else
468  {
469  detail::consume(field_view);
470  }
471 
472  offset = offset_tmp;
473 
474  // Field 7-9: (RNEXT PNEXT TLEN) = MATE
475  // -------------------------------------------------------------------------------------------------------------
476  if constexpr (!detail::decays_to_ignore_v<mate_type>)
477  {
478  value_type_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
479  read_field(field_view, tmp_mate_ref_id); // RNEXT
480 
481  if (tmp_mate_ref_id == "=") // indicates "same as ref id"
482  {
483  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
484  get<0>(mate) = ref_id;
485  else
486  check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
487  }
488  else
489  {
490  check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
491  }
492 
493  int32_t tmp_pnext{};
494  read_field(field_view, tmp_pnext); // PNEXT
495 
496  if (tmp_pnext > 0)
497  get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
498  else if (tmp_pnext < 0)
499  throw format_error{"No negative values are allowed at the mate mapping position."};
500  // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
501 
502  read_field(field_view, get<2>(mate)); // TLEN
503  }
504  else
505  {
506  for (size_t i = 0; i < 3u; ++i)
507  {
508  detail::consume(field_view);
509  }
510  }
511 
512  // Field 10: Sequence
513  // -------------------------------------------------------------------------------------------------------------
514  if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
515  {
516  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
517  auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
518  {
519  if (!is_legal_alph(c))
520  throw parse_error{std::string{"Encountered an unexpected letter: "} +
521  is_legal_alph.msg +
522  " evaluated to false on " +
523  detail::make_printable(c)};
524  return c;
525  });
526 
527  if constexpr (detail::decays_to_ignore_v<seq_type>)
528  {
529  if constexpr (!detail::decays_to_ignore_v<align_type>)
530  {
531  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
532  "If you want to read ALIGNMENT but not SEQ, the alignment"
533  " object must store a sequence container at the second (query) position.");
534 
535  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
536  {
537 
538  auto tmp_iter = std::ranges::begin(seq_stream);
539  std::ranges::advance(tmp_iter, offset_tmp);
540 
541  for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
542  {
543  get<1>(align).push_back(value_type_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
544  ++tmp_iter;
545  }
546 
547  std::ranges::advance(tmp_iter, soft_clipping_end);
548  }
549  else
550  {
551  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
552  }
553  }
554  else
555  {
556  detail::consume(seq_stream);
557  }
558  }
559  else
560  {
561  read_field(seq_stream, seq);
562 
563  if constexpr (!detail::decays_to_ignore_v<align_type>)
564  {
565  if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
566  {
567  assign_unaligned(get<1>(align),
568  seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
569  std::ranges::size(seq) - soft_clipping_end));
570  }
571  }
572  }
573  }
574  else
575  {
576  std::ranges::next(std::ranges::begin(field_view)); // skip '*'
577  }
578 
579  // Field 11: Quality
580  // -------------------------------------------------------------------------------------------------------------
581  auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
582  read_field(stream_view | views::take_until_or_throw(tab_or_end), qual);
583 
584  if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
585  {
586  if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
587  std::ranges::distance(seq) != std::ranges::distance(qual))
588  {
589  throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
590  ") and quality length (", std::ranges::distance(qual),
591  ") must be the same.")};
592  }
593  }
594 
595  // All remaining optional fields if any: SAM tags dictionary
596  // -------------------------------------------------------------------------------------------------------------
597  while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
598  {
599  std::ranges::next(std::ranges::begin(stream_view)); // skip tab
600  read_field(stream_view | views::take_until_or_throw(tab_or_end), tag_dict);
601  }
602 
603  detail::consume(stream_view | views::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
604 
605  // DONE READING - wrap up
606  // -------------------------------------------------------------------------------------------------------------
607  // Alignment object construction
608  // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
609  if constexpr (!detail::decays_to_ignore_v<align_type>)
610  {
611  int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
612 
613  if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
614  {
615  if (!ref_id_tmp.empty())
616  {
617  assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
618  ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
619  }
620  }
621 
622  construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
623  }
624 
625  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
626  std::swap(cigar_vector, tmp_cigar_vector);
627 }
628 
630 template <typename stream_type,
631  typename header_type,
632  typename seq_type,
633  typename id_type,
634  typename ref_seq_type,
635  typename ref_id_type,
636  typename align_type,
637  typename qual_type,
638  typename mate_type,
639  typename tag_dict_type,
640  typename e_value_type,
641  typename bit_score_type>
642 inline void format_sam::write_alignment_record(stream_type & stream,
643  alignment_file_output_options const & options,
644  header_type && header,
645  seq_type && seq,
646  qual_type && qual,
647  id_type && id,
648  int32_t const offset,
649  ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
650  ref_id_type && ref_id,
651  std::optional<int32_t> ref_offset,
652  align_type && align,
653  std::vector<cigar> const & cigar_vector,
654  sam_flag const flag,
655  uint8_t const mapq,
656  mate_type && mate,
657  tag_dict_type && tag_dict,
658  e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
659  bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
660 {
661  /* Note the following general things:
662  *
663  * - Given the SAM specifications, all fields may be empty
664  *
665  * - arithmetic values default to 0 while all others default to '*'
666  *
667  * - Because of the former, arithmetic values can be directly streamed
668  * into 'stream' as operator<< is defined for all arithmetic types
669  * and the default value (0) is also the SAM default.
670  *
671  * - All other non-arithmetic values need to be checked for emptiness
672  */
673 
674  // ---------------------------------------------------------------------
675  // Type Requirements (as static asserts for user friendliness)
676  // ---------------------------------------------------------------------
677  static_assert((std::ranges::forward_range<seq_type> &&
679  "The seq object must be a std::ranges::forward_range over "
680  "letters that model seqan3::alphabet.");
681 
682  static_assert((std::ranges::forward_range<id_type> &&
684  "The id object must be a std::ranges::forward_range over "
685  "letters that model seqan3::alphabet.");
686 
687  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
688  {
689  static_assert((std::ranges::forward_range<ref_id_type> ||
691  detail::is_type_specialisation_of_v<remove_cvref_t<ref_id_type>, std::optional>),
692  "The ref_id object must be a std::ranges::forward_range "
693  "over letters that model seqan3::alphabet.");
694 
696  detail::is_type_specialisation_of_v<remove_cvref_t<ref_id_type>, std::optional>)
697  static_assert(!detail::decays_to_ignore_v<header_type>,
698  "If you give indices as reference id information the header must also be present.");
699  }
700 
701  static_assert(tuple_like<remove_cvref_t<align_type>>,
702  "The align object must be a std::pair of two ranges whose "
703  "value_type is comparable to seqan3::gap");
704 
705  static_assert((std::tuple_size_v<remove_cvref_t<align_type>> == 2 &&
706  std::equality_comparable_with<gap, reference_t<decltype(std::get<0>(align))>> &&
707  std::equality_comparable_with<gap, reference_t<decltype(std::get<1>(align))>>),
708  "The align object must be a std::pair of two ranges whose "
709  "value_type is comparable to seqan3::gap");
710 
711  static_assert((std::ranges::forward_range<qual_type> &&
713  "The qual object must be a std::ranges::forward_range "
714  "over letters that model seqan3::alphabet.");
715 
716  static_assert(tuple_like<remove_cvref_t<mate_type>>,
717  "The mate object must be a std::tuple of size 3 with "
718  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
719  "2) a std::integral or std::optional<std::integral>, and "
720  "3) a std::integral.");
721 
722  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
723  std::integral<remove_cvref_t<decltype(std::get<0>(mate))>> ||
724  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
725  (std::integral<remove_cvref_t<decltype(std::get<1>(mate))>> ||
726  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
727  std::integral<remove_cvref_t<decltype(std::get<2>(mate))>>),
728  "The mate object must be a std::tuple of size 3 with "
729  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
730  "2) a std::integral or std::optional<std::integral>, and "
731  "3) a std::integral.");
732 
733  if constexpr (std::integral<remove_cvref_t<decltype(std::get<0>(mate))>> ||
734  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
735  static_assert(!detail::decays_to_ignore_v<header_type>,
736  "If you give indices as mate reference id information the header must also be present.");
737 
739  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
740 
741  // ---------------------------------------------------------------------
742  // logical Requirements
743  // ---------------------------------------------------------------------
744  if constexpr (!detail::decays_to_ignore_v<header_type> &&
745  !detail::decays_to_ignore_v<ref_id_type> &&
747  !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
748  {
749 
750  if (options.sam_require_header && !std::ranges::empty(ref_id))
751  {
752  auto id_it = header.ref_dict.end();
753 
754  if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
755  std::ranges::sized_range<decltype(ref_id)> &&
756  forwarding_range<decltype(ref_id)>)
757  {
758  id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
759  }
760  else
761  {
762  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
763 
765  "The ref_id type is not convertible to the reference id information stored in the "
766  "reference dictionary of the header object.");
767 
768  id_it = header.ref_dict.find(ref_id);
769  }
770 
771  if (id_it == header.ref_dict.end()) // no reference id matched
772  throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
773  header.ref_ids())};
774  }
775  }
776 
777  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
778  throw format_error{"The ref_offset object must be an std::integral >= 0."};
779 
780  // ---------------------------------------------------------------------
781  // Writing the Header on first call
782  // ---------------------------------------------------------------------
783  if constexpr (!detail::decays_to_ignore_v<header_type>)
784  {
785  if (options.sam_require_header && !header_was_written)
786  {
787  write_header(stream, options, header);
788  header_was_written = true;
789  }
790  }
791 
792  // ---------------------------------------------------------------------
793  // Writing the Record
794  // ---------------------------------------------------------------------
795  seqan3::ostreambuf_iterator stream_it{stream};
796  char const separator{'\t'};
797 
798  write_range(stream_it, std::forward<id_type>(id));
799 
800  stream << separator;
801 
802  stream << static_cast<uint16_t>(flag) << separator;
803 
804  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
805  {
807  {
808  write_range(stream_it, (header.ref_ids())[ref_id]);
809  }
810  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
811  {
812  if (ref_id.has_value())
813  write_range(stream_it, (header.ref_ids())[ref_id.value()]);
814  else
815  stream << '*';
816  }
817  else
818  {
819  write_range(stream_it, std::forward<ref_id_type>(ref_id));
820  }
821  }
822  else
823  {
824  stream << '*';
825  }
826 
827  stream << separator;
828 
829  // SAM is 1 based, 0 indicates unmapped read if optional is not set
830  stream << (ref_offset.value_or(-1) + 1) << separator;
831 
832  stream << static_cast<unsigned>(mapq) << separator;
833 
834  if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
835  {
836  // compute possible distance from alignment end to sequence end
837  // which indicates soft clipping at the end.
838  // This should be replace by a free count_gaps function for
839  // aligned sequences which is more efficient if possible.
840  size_t off_end{std::ranges::size(seq) - offset};
841  for (auto chr : get<1>(align))
842  if (chr == gap{})
843  ++off_end;
844  off_end -= std::ranges::size(get<1>(align));
845 
846  write_range(stream_it, detail::get_cigar_string(std::forward<align_type>(align), offset, off_end));
847  }
848  else if (!cigar_vector.empty())
849  {
850  for (auto & c : cigar_vector)
851  stream << c.to_string(); // returns a small_vector instead of char so write_range doesn't work
852  }
853  else
854  {
855  stream << '*';
856  }
857 
858  stream << separator;
859 
860  if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
861  {
862  write_range(stream_it, (header.ref_ids())[get<0>(mate)]);
863  }
864  else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
865  {
866  if (get<0>(mate).has_value())
867  // value_or(0) instead of value() (which is equivalent here) as a
868  // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
869  write_range(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
870  else
871  stream << '*';
872  }
873  else
874  {
875  write_range(stream_it, get<0>(mate));
876  }
877 
878  stream << separator;
879 
880  if constexpr (detail::is_type_specialisation_of_v<remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
881  {
882  // SAM is 1 based, 0 indicates unmapped read if optional is not set
883  stream << (get<1>(mate).value_or(-1) + 1) << separator;
884  }
885  else
886  {
887  stream << get<1>(mate) << separator;
888  }
889 
890  stream << get<2>(mate) << separator;
891 
892  write_range(stream_it, std::forward<seq_type>(seq));
893 
894  stream << separator;
895 
896  write_range(stream_it, std::forward<qual_type>(qual));
897 
898  write_tag_fields(stream, std::forward<tag_dict_type>(tag_dict), separator);
899 
900  detail::write_eol(stream_it, options.add_carriage_return);
901 }
902 
903 
921 template <typename stream_view_type, typename value_type>
922 inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
923  stream_view_type && stream_view,
924  value_type value)
925 {
926  std::vector<value_type> tmp_vector;
927  while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
928  {
929  read_field(stream_view | views::take_until(is_char<','>), value);
930  tmp_vector.push_back(value);
931 
932  if (is_char<','>(*std::ranges::begin(stream_view)))
933  std::ranges::next(std::ranges::begin(stream_view)); // skip ','
934  }
935  variant = std::move(tmp_vector);
936 }
937 
955 template <typename stream_view_type>
956 inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
957 {
958  /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
959  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
960  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
961  VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
962  */
963  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
964  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
965  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
966  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
967  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
968  char type_id = *std::ranges::begin(stream_view);
969  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
970  std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
971 
972  switch (type_id)
973  {
974  case 'A' : // char
975  {
976  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
977  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
978  break;
979  }
980  case 'i' : // int32_t
981  {
982  int32_t tmp;
983  read_field(stream_view, tmp);
984  target[tag] = tmp;
985  break;
986  }
987  case 'f' : // float
988  {
989  float tmp;
990  read_field(stream_view, tmp);
991  target[tag] = tmp;
992  break;
993  }
994  case 'Z' : // string
995  {
996  target[tag] = stream_view | views::to<std::string>;
997  break;
998  }
999  case 'H' :
1000  {
1001  // TODO
1002  break;
1003  }
1004  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1005  {
1006  char array_value_type_id = *std::ranges::begin(stream_view);
1007  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1008  std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1009 
1010  switch (array_value_type_id)
1011  {
1012  case 'c' : // int8_t
1013  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1014  break;
1015  case 'C' : // uint8_t
1016  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1017  break;
1018  case 's' : // int16_t
1019  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1020  break;
1021  case 'S' : // uint16_t
1022  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1023  break;
1024  case 'i' : // int32_t
1025  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1026  break;
1027  case 'I' : // uint32_t
1028  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1029  break;
1030  case 'f' : // float
1031  read_sam_dict_vector(target[tag], stream_view, float{});
1032  break;
1033  default:
1034  throw format_error{std::string("The first character in the numerical ") +
1035  "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1036  "' was given."};
1037  }
1038  break;
1039  }
1040  default:
1041  throw format_error{std::string("The second character in the numerical id of a "
1042  "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1043  }
1044 }
1045 
1053 template <typename stream_it_t, std::ranges::forward_range field_type>
1054 inline void format_sam::write_range(stream_it_t & stream_it, field_type && field_value)
1055 {
1056  if (std::ranges::empty(field_value))
1057  stream_it = '*';
1058  else
1059  std::ranges::copy(field_value | views::to_char, stream_it);
1060 }
1061 
1068 template <typename stream_it_t>
1069 inline void format_sam::write_range(stream_it_t & stream_it, char const * const field_value)
1070 {
1071  write_range(stream_it, std::string_view{field_value});
1072 }
1073 
1079 template <typename stream_t, arithmetic field_type>
1080 inline void format_sam::write_field(stream_t & stream, field_type field_value)
1081 {
1082  // TODO: replace this with to_chars for efficiency
1084  stream << static_cast<int16_t>(field_value);
1085  else
1086  stream << field_value;
1087 }
1088 
1096 template <typename stream_t>
1097 inline void format_sam::write_tag_fields(stream_t & stream, sam_tag_dictionary const & tag_dict, char const separator)
1098 {
1099  auto stream_variant_fn = [this, &stream] (auto && arg) // helper to print an std::variant
1100  {
1101  using T = remove_cvref_t<decltype(arg)>;
1102 
1103  if constexpr (!container<T> || std::same_as<T, std::string>)
1104  {
1105  stream << arg;
1106  }
1107  else
1108  {
1109  if (arg.begin() != arg.end())
1110  {
1111  for (auto it = arg.begin(); it != (arg.end() - 1); ++it)
1112  {
1113  write_field(stream, *it);
1114  stream << ',';
1115  }
1116 
1117  write_field(stream, *(arg.end() - 1)); // write last value without trailing ','
1118  }
1119  }
1120  };
1121 
1122  for (auto & [tag, variant] : tag_dict)
1123  {
1124  stream << separator;
1125 
1126  char char0 = tag / 256;
1127  char char1 = tag % 256;
1128 
1129  stream << char0 << char1 << ':' << detail::sam_tag_type_char[variant.index()] << ':';
1130 
1131  if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1132  stream << detail::sam_tag_type_char_extra[variant.index()] << ',';
1133 
1134  std::visit(stream_variant_fn, variant);
1135  }
1136 }
1137 
1138 } // namespace seqan3
seqan3::gap
The alphabet of a gap character '-'.
Definition: gap.hpp:36
seqan3::alignment_file_output_options
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:22
seqan3::remove_cvref_t
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed (type trait).
Definition: basic.hpp:35
misc.hpp
Provides helper data structures for the seqan3::alignment_file_output.
seqan3::views::take_until_and_consume
constexpr auto take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:613
seqan3::field::seq
The "sequence", usually a range of nucleotides or amino acids.
take_until.hpp
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
std::span
seqan3::views::to_char
const auto to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:65
seqan3::alignment_file_output_options::add_carriage_return
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:27
std::string
seqan3::sequence_file_input_options::truncate_ids
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:28
sequence_container
A more refined container concept than seqan3::container.
seqan3::format_sam::operator=
format_sam & operator=(format_sam const &)=default
Defaulted.
seqan3::format_sam::write_sequence_record
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:333
std::string_view
tuple.hpp
Provides seqan3::tuple_like.
seqan3::field::offset
Sequence (SEQ) relative start position (0-based), unsigned value.
sam_tag_dictionary.hpp
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
std::pair
seqan3::format_error
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:84
seqan3::reference_t
typename reference< t >::type reference_t
Shortcut for seqan3::reference (transformation_trait shortcut).
Definition: pre.hpp:77
vector
seqan3::alignment_file_output_options::sam_require_header
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:41
seqan3::views::move
const auto move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
template_inspection.hpp
Provides seqan3::type_list and auxiliary type traits.
seqan3::sam_flag
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: misc.hpp:70
std::tuple
seqan3::alignment_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:23
seqan3::is_space
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
seqan3::alignment_file_header::ref_ids
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
seqan3::field::bit_score
The bit score (statistical significance indicator), unsigned value.
seqan3::field::ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
seqan3::field::ref_offset
Sequence (REF_SEQ) relative start position (0-based), unsigned value.
seqan3::format_sam::read_alignment_record
void read_alignment_record(stream_type &stream, alignment_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, alignment_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:384
algorithm
Adaptations of algorithms from the Ranges TS.
std::string::clear
T clear(T... args)
concepts
The Concepts library.
input_format_concept.hpp
Provides seqan3::alignment_file_input_format and auxiliary classes.
seqan3::sequence_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:25
std::tie
T tie(T... args)
std::vector::push_back
T push_back(T... args)
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
same_as
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type.
seqan3::format_sam::file_extensions
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:141
seqan3::value_type
Exposes the value_type of another type.
Definition: pre.hpp:41
slice.hpp
Provides seqan3::views::slice.
seqan3::views::take_until_or_throw_and_consume
constexpr auto take_until_or_throw_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:627
to.hpp
Provides seqan3::views::to.
seqan3::sam_flag::none
None of the flags below are set.
seqan3::parse_error
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:47
seqan3::alignment_file_header
Stores the header information of alignment files.
Definition: header.hpp:32
range.hpp
Provides various transformation traits used by the range module.
seqan3::field::mapq
The mapping quality of the SEQ alignment, usually a ohred-scaled score.
core_language.hpp
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
seqan3::value_type_t
typename value_type< t >::type value_type_t
Shortcut for seqan3::value_type (transformation_trait shortcut).
Definition: pre.hpp:48
seqan3::alignment_file_header::ref_dict
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:164
seqan3::ostreambuf_iterator
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:204
header.hpp
Provides the seqan3::alignment_file_header class.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
seqan3::format_sam::~format_sam
~format_sam()=default
seqan3::views::take_until
constexpr auto take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:585
seqan3::sequence_file_output_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:21
detail.hpp
Auxiliary functions for the alignment IO.
output_options.hpp
Provides seqan3::sequence_file_output_options.
output_options.hpp
Provides seqan3::alignment_file_output_options.
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
tuple_like
Whether a type behaves like a tuple.
std::swap
T swap(T... args)
std::variant::index
T index(T... args)
to_char.hpp
Provides seqan3::views::to_char.
ranges
Adaptations of concepts from the Ranges TS.
seqan3::sam_tag_dictionary
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:324
std::remove_reference_t
seqan3::views::take_until_or_throw
constexpr auto take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:599
alphabet
The generic alphabet concept that covers most data types used in ranges.
std::string::begin
T begin(T... args)
container
The (most general) container concept as defined by the standard library.
predicate.hpp
Provides character predicates for tokenisation.
seqan3::field::ref_id
The identifier of the (reference) sequence that SEQ was aligned to.
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
ignore_output_iterator.hpp
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
seqan3::format_sam
The SAM format (tag).
Definition: format_sam.hpp:124
std::visit
T visit(T... args)
output_format_concept.hpp
Provides seqan3::alignment_file_output_format and auxiliary classes.
std::vector::empty
T empty(T... args)
equality_comparable_with
Requires seqan3::detail::weakly_equality_comparable_witht<t1,t2>, but also that t1 and t2,...
std::remove_cv_t
implicitly_convertible_to
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
seqan3::field::qual
The qualities, usually in phred-score notation.
std::optional
to_string.hpp
Auxiliary for pretty printing of exception messages.
istreambuf.hpp
Provides seqan3::views::istreambuf.
std::string::end
T end(T... args)
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
iterator.hpp
Provides seqan3::ostream and seqan3::ostreambuf iterator.
seqan3::field::flag
The alignment flag (bit information), uint16_t value.
integral
The concept integral is satisfied if and only if T is an integral type.
misc.hpp
Provides various utility functions.
seqan3::format_sam::read_sequence_record
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:292
input_options.hpp
Provides seqan3::alignment_file_input_options.
forwarding_range
Specifies a range whose iterators may outlive the range and remain valid.
format_sam_base.hpp
Provides the seqan3::format_sam_base that can be inherited from.
misc.hpp
Provides various utility functions.
char_to.hpp
Provides seqan3::views::char_to.
seqan3::format_sam::write_alignment_record
void write_alignment_record(stream_type &stream, alignment_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:642
seqan3::views::istreambuf
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
seqan3::format_sam::format_sam
format_sam()=default
Defaulted.
std::tuple_size_v
T tuple_size_v
seqan3::assign_unaligned
void assign_unaligned(aligned_seq_t &aligned_seq, unaligned_sequence_type &&unaligned_seq)
An implementation of seqan3::aligned_sequence::assign_unaligned_sequence for sequence containers.
Definition: aligned_sequence_concept.hpp:375
std::variant
seqan3::field::mate
The mate pair information given as a std::tuple of reference name, offset and template length.
string