SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <vector>
18 
19 #include <seqan3/alphabet/detail/convert.hpp>
43 #include <seqan3/std/concepts>
44 #include <seqan3/std/ranges>
45 
46 namespace seqan3
47 {
48 
59 class format_bam : private detail::format_sam_base
60 {
61 public:
65  format_bam() noexcept = default;
66  format_bam(format_bam const &) noexcept = default;
67  format_bam & operator=(format_bam const &) noexcept = default;
68  format_bam(format_bam &&) noexcept = default;
69  format_bam & operator=(format_bam &&) noexcept = default;
70  ~format_bam() noexcept = default;
71 
75  {
76  { "bam" }
77  };
78 
79 protected:
80  template <typename stream_type, // constraints checked by file
81  typename seq_legal_alph_type,
82  typename ref_seqs_type,
83  typename ref_ids_type,
84  typename seq_type,
85  typename id_type,
86  typename offset_type,
87  typename ref_seq_type,
88  typename ref_id_type,
89  typename ref_offset_type,
90  typename align_type,
91  typename cigar_type,
92  typename flag_type,
93  typename mapq_type,
94  typename qual_type,
95  typename mate_type,
96  typename tag_dict_type,
97  typename e_value_type,
98  typename bit_score_type>
99  void read_alignment_record(stream_type & stream,
100  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
101  ref_seqs_type & ref_seqs,
103  seq_type & seq,
104  qual_type & qual,
105  id_type & id,
106  offset_type & offset,
107  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
108  ref_id_type & ref_id,
109  ref_offset_type & ref_offset,
110  align_type & align,
111  cigar_type & cigar_vector,
112  flag_type & flag,
113  mapq_type & mapq,
114  mate_type & mate,
115  tag_dict_type & tag_dict,
116  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
117  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
118 
119  template <typename stream_type,
120  typename header_type,
121  typename seq_type,
122  typename id_type,
123  typename ref_seq_type,
124  typename ref_id_type,
125  typename align_type,
126  typename cigar_type,
127  typename qual_type,
128  typename mate_type,
129  typename tag_dict_type>
130  void write_alignment_record([[maybe_unused]] stream_type & stream,
131  [[maybe_unused]] alignment_file_output_options const & options,
132  [[maybe_unused]] header_type && header,
133  [[maybe_unused]] seq_type && seq,
134  [[maybe_unused]] qual_type && qual,
135  [[maybe_unused]] id_type && id,
136  [[maybe_unused]] int32_t const offset,
137  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
138  [[maybe_unused]] ref_id_type && ref_id,
139  [[maybe_unused]] std::optional<int32_t> ref_offset,
140  [[maybe_unused]] align_type && align,
141  [[maybe_unused]] cigar_type && cigar_vector,
142  [[maybe_unused]] sam_flag const flag,
143  [[maybe_unused]] uint8_t const mapq,
144  [[maybe_unused]] mate_type && mate,
145  [[maybe_unused]] tag_dict_type && tag_dict,
146  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
147  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
148 
149 private:
151  bool header_was_read{false};
152 
154  std::string string_buffer{};
155 
157  struct alignment_record_core
158  { // naming corresponds to official SAM/BAM specifications
159  int32_t block_size;
160  int32_t refID;
161  int32_t pos;
162  uint32_t l_read_name:8;
163  uint32_t mapq:8;
164  uint32_t bin:16;
165  uint32_t n_cigar_op:16;
166  sam_flag flag;
167  int32_t l_seq;
168  int32_t next_refID;
169  int32_t next_pos;
170  int32_t tlen;
171  };
172 
174  static constexpr std::array<uint8_t, 256> char_to_sam_rank
175  {
176  [] () constexpr
177  {
179 
180  using index_t = std::make_unsigned_t<char>;
181 
182  // ret['M'] = 0; set anyway by initialization
183  ret[static_cast<index_t>('I')] = 1;
184  ret[static_cast<index_t>('D')] = 2;
185  ret[static_cast<index_t>('N')] = 3;
186  ret[static_cast<index_t>('S')] = 4;
187  ret[static_cast<index_t>('H')] = 5;
188  ret[static_cast<index_t>('P')] = 6;
189  ret[static_cast<index_t>('=')] = 7;
190  ret[static_cast<index_t>('X')] = 8;
191 
192  return ret;
193  }()
194  };
195 
197  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
198  {
199  --end;
200  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
201  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
202  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
203  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
204  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
205  return 0;
206  }
207 
208  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
209 
216  template <typename stream_view_type, std::integral number_type>
217  void read_field(stream_view_type && stream_view, number_type & target)
218  {
219  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
220  }
221 
227  template <typename stream_view_type>
228  void read_field(stream_view_type && stream_view, float & target)
229  {
230  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
231  }
232 
243  template <typename stream_view_type, typename optional_value_type>
244  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
245  {
246  optional_value_type tmp;
247  read_field(std::forward<stream_view_type>(stream_view), tmp);
248  target = tmp;
249  }
250 
251  template <typename stream_view_type, typename value_type>
252  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
253  stream_view_type && stream_view,
254  value_type const & SEQAN3_DOXYGEN_ONLY(value));
255 
256  template <typename stream_view_type>
257  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
258 
259  template <typename cigar_input_type>
260  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
261 
262  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
263 };
264 
266 template <typename stream_type, // constraints checked by file
267  typename seq_legal_alph_type,
268  typename ref_seqs_type,
269  typename ref_ids_type,
270  typename seq_type,
271  typename id_type,
272  typename offset_type,
273  typename ref_seq_type,
274  typename ref_id_type,
275  typename ref_offset_type,
276  typename align_type,
277  typename cigar_type,
278  typename flag_type,
279  typename mapq_type,
280  typename qual_type,
281  typename mate_type,
282  typename tag_dict_type,
283  typename e_value_type,
284  typename bit_score_type>
285 inline void format_bam::read_alignment_record(stream_type & stream,
286  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
287  ref_seqs_type & ref_seqs,
289  seq_type & seq,
290  qual_type & qual,
291  id_type & id,
292  offset_type & offset,
293  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
294  ref_id_type & ref_id,
295  ref_offset_type & ref_offset,
296  align_type & align,
297  cigar_type & cigar_vector,
298  flag_type & flag,
299  mapq_type & mapq,
300  mate_type & mate,
301  tag_dict_type & tag_dict,
302  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
303  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
304 {
305  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
306  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
307  "The ref_offset must be a specialisation of std::optional.");
308 
309  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
310  "The type of field::mapq must be uint8_t.");
311 
312  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
313  "The type of field::flag must be seqan3::sam_flag.");
314 
316  auto stream_view = std::ranges::subrange<decltype(stream_buf_t{stream}), decltype(stream_buf_t{})>
317  {stream_buf_t{stream}, stream_buf_t{}};
318 
319  // these variables need to be stored to compute the ALIGNMENT
320  [[maybe_unused]] int32_t offset_tmp{};
321  [[maybe_unused]] int32_t soft_clipping_end{};
322  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
323  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
324 
325  // Header
326  // -------------------------------------------------------------------------------------------------------------
327  if (!header_was_read)
328  {
329  // magic BAM string
330  if (!std::ranges::equal(stream_view | views::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
331  throw format_error{"File is not in BAM format."};
332 
333  int32_t tmp32{};
334  read_field(stream_view, tmp32);
335 
336  if (tmp32 > 0) // header text is present
337  read_header(stream_view | views::take_exactly_or_throw(tmp32), header, ref_seqs);
338 
339  int32_t n_ref;
340  read_field(stream_view, n_ref);
341 
342  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
343  {
344  read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
345 
346  string_buffer.resize(tmp32 - 1);
347  std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
348  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
349 
350  read_field(stream_view, tmp32); // l_ref (length of reference sequence)
351 
352  auto id_it = header.ref_dict.find(string_buffer);
353 
354  // sanity checks of reference information to existing header object:
355  if (id_it == header.ref_dict.end()) // [unlikely]
356  {
357  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
358  "' found in BAM file header (header.ref_ids():",
359  header.ref_ids(), ").")};
360  }
361  else if (id_it->second != ref_idx) // [unlikely]
362  {
363  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
364  " does not correspond to the position ", id_it->second,
365  " in the header (header.ref_ids():", header.ref_ids(), ").")};
366  }
367  else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
368  {
369  throw format_error{"Provided reference has unequal length as specified in the header."};
370  }
371  }
372 
373  header_was_read = true;
374 
375  if (stream_buf_t{stream} == stream_buf_t{}) // no records follow
376  return;
377  }
378 
379  // read alignment record into buffer
380  // -------------------------------------------------------------------------------------------------------------
381  alignment_record_core core;
382  std::ranges::copy(stream_view | views::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
383 
384  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
385  {
386  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
387  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
388  }
389  else if (core.refID > -1) // not unmapped
390  {
391  ref_id = core.refID; // field::ref_id
392  }
393 
394  flag = core.flag; // field::flag
395  mapq = core.mapq; // field::mapq
396 
397  if (core.pos > -1) // [[likely]]
398  ref_offset = core.pos; // field::ref_offset
399 
400  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
401  {
402  if (core.next_refID > -1)
403  get<0>(mate) = core.next_refID;
404 
405  if (core.next_pos > -1) // [[likely]]
406  get<1>(mate) = core.next_pos;
407 
408  get<2>(mate) = core.tlen;
409  }
410 
411  // read id
412  // -------------------------------------------------------------------------------------------------------------
413  read_field(stream_view | views::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
414  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
415 
416  // read cigar string
417  // -------------------------------------------------------------------------------------------------------------
418  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
419  {
420  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
421  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
422  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
423  }
424  else
425  {
426  detail::consume(stream_view | views::take_exactly_or_throw(core.n_cigar_op * 4));
427  }
428 
429  offset = offset_tmp;
430 
431  // read sequence
432  // -------------------------------------------------------------------------------------------------------------
433  if (core.l_seq > 0) // sequence information is given
434  {
435  auto seq_stream = stream_view
436  | views::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
438  {
439  return {sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
440  sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
441  });
442 
443  if constexpr (detail::decays_to_ignore_v<seq_type>)
444  {
445  if constexpr (!detail::decays_to_ignore_v<align_type>)
446  {
447  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
448  "If you want to read ALIGNMENT but not SEQ, the alignment"
449  " object must store a sequence container at the second (query) position.");
450 
451  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
452  {
453  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
454  using alph_t = value_type_t<decltype(get<1>(align))>;
455  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
456 
457  get<1>(align).reserve(seq_length);
458 
459  auto tmp_iter = std::ranges::begin(seq_stream);
460  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
461 
462  if (offset_tmp & 1)
463  {
464  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
465  ++tmp_iter;
466  --seq_length;
467  }
468 
469  for (size_t i = (seq_length / 2); i > 0; --i)
470  {
471  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
472  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
473  ++tmp_iter;
474  }
475 
476  if (seq_length & 1)
477  {
478  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
479  ++tmp_iter;
480  --soft_clipping_end;
481  }
482 
483  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
484  }
485  else
486  {
487  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
488  }
489  }
490  else
491  {
492  detail::consume(seq_stream);
493  if (core.l_seq & 1)
494  std::ranges::next(std::ranges::begin(stream_view));
495  }
496  }
497  else
498  {
499  using alph_t = value_type_t<decltype(seq)>;
500  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
501 
502  for (auto [d1, d2] : seq_stream)
503  {
504  seq.push_back(from_dna16[to_rank(d1)]);
505  seq.push_back(from_dna16[to_rank(d2)]);
506  }
507 
508  if (core.l_seq & 1)
509  {
510  sam_dna16 d = sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
511  seq.push_back(from_dna16[to_rank(d)]);
512  std::ranges::next(std::ranges::begin(stream_view));
513  }
514 
515  if constexpr (!detail::decays_to_ignore_v<align_type>)
516  {
517  assign_unaligned(get<1>(align),
518  seq | views::slice(static_cast<decltype(std::ranges::distance(seq))>(offset_tmp),
519  std::ranges::distance(seq) - soft_clipping_end));
520  }
521  }
522  }
523 
524  // read qual string
525  // -------------------------------------------------------------------------------------------------------------
526  read_field(stream_view | views::take_exactly_or_throw(core.l_seq)
527  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
528 
529  // All remaining optional fields if any: SAM tags dictionary
530  // -------------------------------------------------------------------------------------------------------------
531  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
532  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
533  assert(remaining_bytes >= 0);
534  auto tags_view = stream_view | views::take_exactly_or_throw(remaining_bytes);
535 
536  while (tags_view.size() > 0)
537  read_field(tags_view, tag_dict);
538 
539  // DONE READING - wrap up
540  // -------------------------------------------------------------------------------------------------------------
541  if constexpr (!detail::decays_to_ignore_v<align_type>)
542  {
543  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
544  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
545  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
546  if (core.l_seq != 0 && offset_tmp == core.l_seq)
547  {
548  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
549  { // maybe only throw in debug mode and otherwise return an empty alignment?
550  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
551  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
552  "stored in the optional field CG. You need to read in the field::tags and "
553  "field::seq in order to access this information.")};
554  }
555  else
556  {
557  auto it = tag_dict.find("CG"_tag);
558 
559  if (it == tag_dict.end())
560  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
561  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
562  "stored in the optional field CG but this tag is not present in the given ",
563  "record.")};
564 
565  auto cigar_view = std::views::all(std::get<std::string>(it->second));
566  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
567  offset_tmp = soft_clipping_end = 0;
568  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
569 
570  assign_unaligned(get<1>(align),
571  seq | views::slice(static_cast<decltype(std::ranges::distance(seq))>(offset_tmp),
572  std::ranges::distance(seq) - soft_clipping_end));
573  tag_dict.erase(it); // remove redundant information
574  }
575  }
576 
577  // Alignment object construction
578  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM format
579  }
580 
581  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
582  std::swap(cigar_vector, tmp_cigar_vector);
583 }
584 
586 template <typename stream_type,
587  typename header_type,
588  typename seq_type,
589  typename id_type,
590  typename ref_seq_type,
591  typename ref_id_type,
592  typename align_type,
593  typename cigar_type,
594  typename qual_type,
595  typename mate_type,
596  typename tag_dict_type>
597 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
598  [[maybe_unused]] alignment_file_output_options const & options,
599  [[maybe_unused]] header_type && header,
600  [[maybe_unused]] seq_type && seq,
601  [[maybe_unused]] qual_type && qual,
602  [[maybe_unused]] id_type && id,
603  [[maybe_unused]] int32_t const offset,
604  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
605  [[maybe_unused]] ref_id_type && ref_id,
606  [[maybe_unused]] std::optional<int32_t> ref_offset,
607  [[maybe_unused]] align_type && align,
608  [[maybe_unused]] cigar_type && cigar_vector,
609  [[maybe_unused]] sam_flag const flag,
610  [[maybe_unused]] uint8_t const mapq,
611  [[maybe_unused]] mate_type && mate,
612  [[maybe_unused]] tag_dict_type && tag_dict,
613  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
614  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
615 {
616  // ---------------------------------------------------------------------
617  // Type Requirements (as static asserts for user friendliness)
618  // ---------------------------------------------------------------------
619  static_assert((std::ranges::forward_range<seq_type> &&
621  "The seq object must be a std::ranges::forward_range over "
622  "letters that model seqan3::alphabet.");
623 
624  static_assert((std::ranges::forward_range<id_type> &&
626  "The id object must be a std::ranges::forward_range over "
627  "letters that model seqan3::alphabet.");
628 
629  static_assert((std::ranges::forward_range<ref_seq_type> &&
631  "The ref_seq object must be a std::ranges::forward_range "
632  "over letters that model seqan3::alphabet.");
633 
634  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
635  {
636  static_assert((std::ranges::forward_range<ref_id_type> ||
638  detail::is_type_specialisation_of_v<remove_cvref_t<ref_id_type>, std::optional>),
639  "The ref_id object must be a std::ranges::forward_range "
640  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
641  }
642 
643  static_assert(tuple_like<remove_cvref_t<align_type>>,
644  "The align object must be a std::pair of two ranges whose "
645  "value_type is comparable to seqan3::gap");
646 
647  static_assert((std::tuple_size_v<remove_cvref_t<align_type>> == 2 &&
648  std::equality_comparable_with<gap, reference_t<decltype(std::get<0>(align))>> &&
649  std::equality_comparable_with<gap, reference_t<decltype(std::get<1>(align))>>),
650  "The align object must be a std::pair of two ranges whose "
651  "value_type is comparable to seqan3::gap");
652 
653  static_assert((std::ranges::forward_range<qual_type> &&
655  "The qual object must be a std::ranges::forward_range "
656  "over letters that model seqan3::alphabet.");
657 
658  static_assert(tuple_like<remove_cvref_t<mate_type>>,
659  "The mate object must be a std::tuple of size 3 with "
660  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
661  "2) a std::integral or std::optional<std::integral>, and "
662  "3) a std::integral.");
663 
664  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
665  std::integral<remove_cvref_t<decltype(std::get<0>(mate))>> ||
666  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
667  (std::integral<remove_cvref_t<decltype(std::get<1>(mate))>> ||
668  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
669  std::integral<remove_cvref_t<decltype(std::get<2>(mate))>>),
670  "The mate object must be a std::tuple of size 3 with "
671  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
672  "2) a std::integral or std::optional<std::integral>, and "
673  "3) a std::integral.");
674 
676  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
677 
678  if constexpr (detail::decays_to_ignore_v<header_type>)
679  {
680  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
681  "You can either construct the output file with ref_ids and ref_seqs information and "
682  "the header will be created for you, or you can access the `header` member directly."};
683  }
684  else
685  {
686  // ---------------------------------------------------------------------
687  // logical Requirements
688  // ---------------------------------------------------------------------
689 
690  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
691  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
692 
693  seqan3::ostreambuf_iterator stream_it{stream};
694 
695  // ---------------------------------------------------------------------
696  // Writing the BAM Header on first call
697  // ---------------------------------------------------------------------
698  if (!header_was_written)
699  {
700  stream << "BAM\1";
702  write_header(os, options, header); // write SAM header to temporary stream to query the size.
703  int32_t l_text{static_cast<int32_t>(os.str().size())};
704  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
705 
706  stream << os.str();
707 
708  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
709  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
710 
711  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
712  {
713  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
714  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
715  // write reference name:
716  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
717  stream_it = '\0';
718  // write reference sequence length:
719  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
720  }
721 
722  header_was_written = true;
723  }
724 
725  // ---------------------------------------------------------------------
726  // Writing the Record
727  // ---------------------------------------------------------------------
728  int32_t ref_length{};
729 
730  // if alignment is non-empty, replace cigar_vector.
731  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
732  if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
733  {
734  ref_length = std::ranges::distance(get<1>(align));
735 
736  // compute possible distance from alignment end to sequence end
737  // which indicates soft clipping at the end.
738  // This should be replaced by a free count_gaps function for
739  // aligned sequences which is more efficient if possible.
740  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
741 
742  for (auto chr : get<1>(align))
743  if (chr == gap{})
744  ++off_end;
745 
746  off_end -= ref_length;
747  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
748  }
749  else
750  {
751  int32_t dummy_seq_length{};
752  for (auto & [count, operation] : cigar_vector)
753  update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
754  }
755 
756  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
757  {
758  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
759  cigar_vector.resize(2);
760  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_op};
761  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_op};
762  }
763 
764  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
765 
766  // Compute the value for the l_read_name field for the bam record.
767  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
768  // the data type to store the value is uint8_t and 255 is the maximal size.
769  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
770  // 2 bytes.
771  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
772  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
773 
774  alignment_record_core core
775  {
776  /* block_size */ 0, // will be initialised right after
777  /* refID */ -1, // will be initialised right after
778  /* pos */ ref_offset.value_or(-1),
779  /* l_read_name */ read_name_size,
780  /* mapq */ mapq,
781  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
782  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
783  /* flag */ flag,
784  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
785  /* next_refId */ -1, // will be initialised right after
786  /* next_pos */ get<1>(mate).value_or(-1),
787  /* tlen */ get<2>(mate)
788  };
789 
790  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
791  [[maybe_unused]] auto & id_target)
792  {
793  using id_t = std::remove_reference_t<decltype(id_source)>;
794 
795  if constexpr (!detail::decays_to_ignore_v<id_t>)
796  {
797  if constexpr (std::integral<id_t>)
798  {
799  id_target = id_source;
800  }
801  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
802  {
803  id_target = id_source.value_or(-1);
804  }
805  else
806  {
807  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
808  {
809  auto id_it = header.ref_dict.end();
810 
811  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
812  std::ranges::sized_range<decltype(id_source)> &&
813  forwarding_range<decltype(id_source)>)
814  {
815  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
816  std::ranges::size(id_source)});
817  }
818  else
819  {
820  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
821 
822  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
823  "The ref_id type is not convertible to the reference id information stored in the "
824  "reference dictionary of the header object.");
825 
826  id_it = header.ref_dict.find(id_source);
827  }
828 
829  if (id_it == header.ref_dict.end())
830  {
831  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
832  "not be found in BAM header ref_dict: ",
833  header.ref_dict, ".")};
834  }
835 
836  id_target = id_it->second;
837  }
838  }
839  }
840  };
841 
842  // initialise core.refID
843  check_and_assign_id_to(ref_id, core.refID);
844 
845  // initialise core.next_refID
846  check_and_assign_id_to(get<0>(mate), core.next_refID);
847 
848  // initialise core.block_size
849  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
850  core.l_read_name +
851  core.n_cigar_op * 4 + // each int32_t has 4 bytes
852  (core.l_seq + 1) / 2 + // bitcompressed seq
853  core.l_seq + // quality string
854  tag_dict_binary_str.size();
855 
856  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
857 
858  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
859  stream_it = '*';
860  else
861  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
862  stream_it = '\0';
863 
864  // write cigar
865  for (auto [cigar_count, op] : cigar_vector)
866  {
867  cigar_count = cigar_count << 4;
868  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
869  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
870  }
871 
872  // write seq (bit-compressed: sam_dna16 characters go into one byte)
873  using alph_t = value_type_t<seq_type>;
874  constexpr auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
875 
876  auto sit = std::ranges::begin(seq);
877  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
878  {
879  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
880  ++sidx, ++sit;
881  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
882  stream_it = static_cast<char>(compressed_chr);
883  }
884 
885  if (core.l_seq & 1) // write one more
886  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
887 
888  // write qual
889  if (std::ranges::empty(qual))
890  {
891  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
892  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
893  }
894  else
895  {
896  if (std::ranges::distance(qual) != core.l_seq)
897  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
898  core.l_seq, ". Got quality with size ",
899  std::ranges::distance(qual), " instead.")};
900 
901  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
902  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
903  }
904 
905  // write optional fields
906  stream << tag_dict_binary_str;
907  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
908 }
909 
911 template <typename stream_view_type, typename value_type>
912 inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
913  stream_view_type && stream_view,
914  value_type const & SEQAN3_DOXYGEN_ONLY(value))
915 {
916  int32_t count;
917  read_field(stream_view, count); // read length of vector
918  std::vector<value_type> tmp_vector;
919  tmp_vector.reserve(count);
920 
921  while (count > 0)
922  {
923  value_type tmp{};
924  read_field(stream_view, tmp);
925  tmp_vector.push_back(std::move(tmp));
926  --count;
927  }
928  variant = std::move(tmp_vector);
929 }
930 
948 template <typename stream_view_type>
949 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
950 {
951  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
952  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
953  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
954  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
955  by the length (int32_t) of the array, followed by the values.
956  */
957  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
958  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
959  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
960  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
961  char type_id = static_cast<char>(*std::ranges::begin(stream_view));
962  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
963 
964  switch (type_id)
965  {
966  case 'A' : // char
967  {
968  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
969  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
970  break;
971  }
972  // all integer sizes are possible
973  case 'c' : // int8_t
974  {
975  int8_t tmp;
976  read_field(stream_view, tmp);
977  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
978  break;
979  }
980  case 'C' : // uint8_t
981  {
982  uint8_t tmp;
983  read_field(stream_view, tmp);
984  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
985  break;
986  }
987  case 's' : // int16_t
988  {
989  int16_t tmp;
990  read_field(stream_view, tmp);
991  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
992  break;
993  }
994  case 'S' : // uint16_t
995  {
996  uint16_t tmp;
997  read_field(stream_view, tmp);
998  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
999  break;
1000  }
1001  case 'i' : // int32_t
1002  {
1003  int32_t tmp;
1004  read_field(stream_view, tmp);
1005  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1006  break;
1007  }
1008  case 'I' : // uint32_t
1009  {
1010  uint32_t tmp;
1011  read_field(stream_view, tmp);
1012  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1013  break;
1014  }
1015  case 'f' : // float
1016  {
1017  float tmp;
1018  read_field(stream_view, tmp);
1019  target[tag] = tmp;
1020  break;
1021  }
1022  case 'Z' : // string
1023  {
1024  string_buffer.clear();
1025  while (!is_char<'\0'>(*std::ranges::begin(stream_view)))
1026  {
1027  string_buffer.push_back(*std::ranges::begin(stream_view));
1028  std::ranges::next(std::ranges::begin(stream_view));
1029  }
1030  std::ranges::next(std::ranges::begin(stream_view)); // skip \0
1031  target[tag] = string_buffer;
1032  break;
1033  }
1034  case 'H' :
1035  {
1036  // TODO
1037  break;
1038  }
1039  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1040  {
1041  char array_value_type_id = *std::ranges::begin(stream_view);
1042  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1043 
1044  switch (array_value_type_id)
1045  {
1046  case 'c' : // int8_t
1047  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1048  break;
1049  case 'C' : // uint8_t
1050  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1051  break;
1052  case 's' : // int16_t
1053  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1054  break;
1055  case 'S' : // uint16_t
1056  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1057  break;
1058  case 'i' : // int32_t
1059  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1060  break;
1061  case 'I' : // uint32_t
1062  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1063  break;
1064  case 'f' : // float
1065  read_sam_dict_vector(target[tag], stream_view, float{});
1066  break;
1067  default:
1068  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1069  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1070  }
1071  break;
1072  }
1073  default:
1074  throw format_error{detail::to_string("The second character in the numerical id of a "
1075  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1076  }
1077 }
1078 
1093 template <typename cigar_input_type>
1094 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1095 {
1096  std::vector<cigar> operations{};
1097  char operation{'\0'};
1098  uint32_t count{};
1099  int32_t ref_length{}, seq_length{};
1100  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1101  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1102  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1103 
1104  if (n_cigar_op == 0) // [[unlikely]]
1105  return std::tuple{operations, ref_length, seq_length};
1106 
1107  // parse the rest of the cigar
1108  // -------------------------------------------------------------------------------------------------------------
1109  while (n_cigar_op > 0) // until stream is not empty
1110  {
1111  std::ranges::copy_n(std::ranges::begin(cigar_input),
1112  sizeof(operation_and_count),
1113  reinterpret_cast<char*>(&operation_and_count));
1114  operation = cigar_mapping[operation_and_count & cigar_mask];
1115  count = operation_and_count >> 4;
1116 
1117  update_alignment_lengths(ref_length, seq_length, operation, count);
1118  operations.emplace_back(count, cigar_op{}.assign_char(operation));
1119  --n_cigar_op;
1120  }
1121 
1122  return std::tuple{operations, ref_length, seq_length};
1123 }
1124 
1128 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1129 {
1130  std::string result{};
1131 
1132  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1133  {
1134  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1135  using T = remove_cvref_t<decltype(arg)>;
1136 
1137  if constexpr (std::same_as<T, int32_t>)
1138  {
1139  // always choose the smallest possible representation [cCsSiI]
1140  bool negative = arg < 0;
1141  auto n = __builtin_ctz(detail::next_power_of_two(((negative) ? arg * -1 : arg) + 1) >> 1) / 8;
1142  n = n * n + 2 * negative; // for switch case order
1143 
1144  switch (n)
1145  {
1146  case 0:
1147  {
1148  result[result.size() - 1] = 'C';
1149  result.append(reinterpret_cast<char const *>(&arg), 1);
1150  break;
1151  }
1152  case 1:
1153  {
1154  result[result.size() - 1] = 'S';
1155  result.append(reinterpret_cast<char const *>(&arg), 2);
1156  break;
1157  }
1158  case 2:
1159  {
1160  result[result.size() - 1] = 'c';
1161  int8_t tmp = static_cast<int8_t>(arg);
1162  result.append(reinterpret_cast<char const *>(&tmp), 1);
1163  break;
1164  }
1165  case 3:
1166  {
1167  result[result.size() - 1] = 's';
1168  int16_t tmp = static_cast<int16_t>(arg);
1169  result.append(reinterpret_cast<char const *>(&tmp), 2);
1170  break;
1171  }
1172  default:
1173  {
1174  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1175  break;
1176  }
1177  }
1178  }
1179  else if constexpr (std::same_as<T, std::string>)
1180  {
1181  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1182  }
1183  else if constexpr (!std::ranges::range<T>) // char, float
1184  {
1185  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1186  }
1187  else // std::vector of some arithmetic_type type
1188  {
1189  int32_t sz{static_cast<int32_t>(arg.size())};
1190  result.append(reinterpret_cast<char *>(&sz), 4);
1191  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() * sizeof(value_type_t<T>));
1192  }
1193  };
1194 
1195  for (auto & [tag, variant] : tag_dict)
1196  {
1197  result.push_back(static_cast<char>(tag / 256));
1198  result.push_back(static_cast<char>(tag % 256));
1199 
1200  result.push_back(detail::sam_tag_type_char[variant.index()]);
1201 
1202  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1203  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1204 
1205  std::visit(stream_variant_fn, variant);
1206  }
1207 
1208  return result;
1209 }
1210 
1211 } // namespace seqan3
seqan3::format_bam::file_extensions
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:75
seqan3::views::take_exactly_or_throw
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:91
seqan3::gap
The alphabet of a gap character '-'.
Definition: gap.hpp:36
seqan3::alignment_file_output_options
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:22
std::string::resize
T resize(T... args)
seqan3::remove_cvref_t
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed (type trait).
Definition: basic.hpp:35
misc.hpp
Provides helper data structures for the seqan3::alignment_file_output.
seqan3::field::seq
The "sequence", usually a range of nucleotides or amino acids.
bit_manipulation.hpp
Provides utility functions for bit twiddling.
seqan3::format_bam::operator=
format_bam & operator=(format_bam const &) noexcept=default
Defaulted.
take_until.hpp
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
std::span
std::string
sequence_container
A more refined container concept than seqan3::container.
std::string_view
tuple.hpp
Provides seqan3::tuple_like.
seqan3::field::offset
Sequence (SEQ) relative start position (0-based), unsigned value.
sam_tag_dictionary.hpp
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
std::pair
std::vector::reserve
T reserve(T... args)
seqan3::format_error
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:84
seqan3::reference_t
typename reference< t >::type reference_t
Shortcut for seqan3::reference (transformation_trait shortcut).
Definition: pre.hpp:77
vector
std::string::size
T size(T... args)
seqan3::format_bam::read_alignment_record
void read_alignment_record(stream_type &stream, alignment_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, alignment_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:285
seqan3::views::move
const auto move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
template_inspection.hpp
Provides seqan3::type_list and auxiliary type traits.
seqan3::sam_flag
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: misc.hpp:70
seqan3::to_rank
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:142
std::tuple
seqan3::alignment_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:23
seqan3::alignment_file_header::ref_ids
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:119
seqan3::field::bit_score
The bit score (statistical significance indicator), unsigned value.
seqan3::field::ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
seqan3::field::ref_offset
Sequence (REF_SEQ) relative start position (0-based), unsigned value.
seqan3::format_bam::format_bam
format_bam() noexcept=default
Defaulted.
std::string::clear
T clear(T... args)
concepts
The Concepts library.
input_format_concept.hpp
Provides seqan3::alignment_file_input_format and auxiliary classes.
std::tie
T tie(T... args)
std::vector::push_back
T push_back(T... args)
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
same_as
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type.
seqan3::value_type
Exposes the value_type of another type.
Definition: pre.hpp:41
seqan3::format_bam::~format_bam
~format_bam() noexcept=default
slice.hpp
Provides seqan3::views::slice.
seqan3::sam_dna16
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: sam_dna16.hpp:45
seqan3::alignment_file_header
Stores the header information of alignment files.
Definition: header.hpp:32
range.hpp
Provides various transformation traits used by the range module.
seqan3::field::mapq
The mapping quality of the SEQ alignment, usually a ohred-scaled score.
core_language.hpp
Provides concepts for core language types and relations that don't have concepts in C++20 (yet).
seqan3::value_type_t
typename value_type< t >::type value_type_t
Shortcut for seqan3::value_type (transformation_trait shortcut).
Definition: pre.hpp:48
seqan3::alignment_file_header::ref_dict
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:164
seqan3::cigar
The cigar semialphabet pairs a counter with a seqan3::cigar_op letter.
Definition: cigar.hpp:54
seqan3::ostreambuf_iterator
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:204
header.hpp
Provides the seqan3::alignment_file_header class.
std::array< uint8_t, 256 >
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
detail.hpp
Auxiliary functions for the alignment IO.
std::istreambuf_iterator
output_options.hpp
Provides seqan3::alignment_file_output_options.
seqan3::search_cfg::all
constexpr detail::search_mode_all all
Configuration element to receive all hits within the error bounds.
Definition: mode.hpp:43
seqan3::pack_traits::size
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:116
tuple_like
Whether a type behaves like a tuple.
std::swap
T swap(T... args)
std::min
T min(T... args)
std::ostringstream
std::variant::index
T index(T... args)
ranges
Adaptations of concepts from the Ranges TS.
seqan3::sam_tag_dictionary
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:324
std::remove_reference_t
seqan3::format_bam
The BAM format.
Definition: format_bam.hpp:59
take_exactly.hpp
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
seqan3::alignment_file_header::ref_id_info
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:161
alphabet
The generic alphabet concept that covers most data types used in ranges.
std::ranges::begin
T begin(T... args)
predicate.hpp
Provides character predicates for tokenisation.
seqan3::field::ref_id
The identifier of the (reference) sequence that SEQ was aligned to.
seqan3::views::slice
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:141
ignore_output_iterator.hpp
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
std::visit
T visit(T... args)
output_format_concept.hpp
Provides seqan3::alignment_file_output_format and auxiliary classes.
equality_comparable_with
Requires seqan3::detail::weakly_equality_comparable_witht<t1,t2>, but also that t1 and t2,...
sam_dna16.hpp
Provides seqan3::sam_dna16.
std::remove_cv_t
implicitly_convertible_to
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
seqan3::field::qual
The qualities, usually in phred-score notation.
std::optional
to_string.hpp
Auxiliary for pretty printing of exception messages.
std::ostringstream::str
T str(T... args)
seqan3::alphabet_base::assign_rank
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:165
seqan3::format_bam::write_alignment_record
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] alignment_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:597
seqan3::views::repeat_n
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:94
std::end
T end(T... args)
seqan3::pack_traits::transform
seqan3::type_list< trait_t< pack_t >... > transform
Apply a transformation trait to every type in the pack and return a seqan3::type_list of the results.
Definition: traits.hpp:307
seqan3::field::flag
The alignment flag (bit information), uint16_t value.
integral
The concept integral is satisfied if and only if T is an integral type.
misc.hpp
Provides various utility functions.
input_options.hpp
Provides seqan3::alignment_file_input_options.
forwarding_range
Specifies a range whose iterators may outlive the range and remain valid.
format_sam_base.hpp
Provides the seqan3::format_sam_base that can be inherited from.
misc.hpp
Provides various utility functions.
std::string::data
T data(T... args)
seqan3::pack_traits::count
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:134
std::make_unsigned_t
std::tuple_size_v
T tuple_size_v
seqan3::assign_unaligned
void assign_unaligned(aligned_seq_t &aligned_seq, unaligned_sequence_type &&unaligned_seq)
An implementation of seqan3::aligned_sequence::assign_unaligned_sequence for sequence containers.
Definition: aligned_sequence_concept.hpp:375
std::variant
seqan3::field::mate
The mate pair information given as a std::tuple of reference name, offset and template length.
string