SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/bit>
16 #include <iterator>
17 #include <seqan3/std/ranges>
18 #include <string>
19 #include <vector>
20 
33 
34 namespace seqan3
35 {
36 
47 class format_bam : private detail::format_sam_base
48 {
49 public:
53  // string_buffer is of type std::string and has some problems with pre-C++11 ABI
54  format_bam() = default;
55  format_bam(format_bam const &) = default;
56  format_bam & operator=(format_bam const &) = default;
57  format_bam(format_bam &&) = default;
58  format_bam & operator=(format_bam &&) = default;
59  ~format_bam() = default;
60 
62 
65  {
66  { "bam" }
67  };
68 
69 protected:
70  template <typename stream_type, // constraints checked by file
71  typename seq_legal_alph_type,
72  typename ref_seqs_type,
73  typename ref_ids_type,
74  typename seq_type,
75  typename id_type,
76  typename offset_type,
77  typename ref_seq_type,
78  typename ref_id_type,
79  typename ref_offset_type,
80  typename align_type,
81  typename cigar_type,
82  typename flag_type,
83  typename mapq_type,
84  typename qual_type,
85  typename mate_type,
86  typename tag_dict_type,
87  typename e_value_type,
88  typename bit_score_type>
89  void read_alignment_record(stream_type & stream,
90  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
91  ref_seqs_type & ref_seqs,
93  seq_type & seq,
94  qual_type & qual,
95  id_type & id,
96  offset_type & offset,
97  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
98  ref_id_type & ref_id,
99  ref_offset_type & ref_offset,
100  align_type & align,
101  cigar_type & cigar_vector,
102  flag_type & flag,
103  mapq_type & mapq,
104  mate_type & mate,
105  tag_dict_type & tag_dict,
106  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
107  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
108 
109  template <typename stream_type,
110  typename header_type,
111  typename seq_type,
112  typename id_type,
113  typename ref_seq_type,
114  typename ref_id_type,
115  typename align_type,
116  typename cigar_type,
117  typename qual_type,
118  typename mate_type,
119  typename tag_dict_type>
120  void write_alignment_record([[maybe_unused]] stream_type & stream,
121  [[maybe_unused]] sam_file_output_options const & options,
122  [[maybe_unused]] header_type && header,
123  [[maybe_unused]] seq_type && seq,
124  [[maybe_unused]] qual_type && qual,
125  [[maybe_unused]] id_type && id,
126  [[maybe_unused]] int32_t const offset,
127  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
128  [[maybe_unused]] ref_id_type && ref_id,
129  [[maybe_unused]] std::optional<int32_t> ref_offset,
130  [[maybe_unused]] align_type && align,
131  [[maybe_unused]] cigar_type && cigar_vector,
132  [[maybe_unused]] sam_flag const flag,
133  [[maybe_unused]] uint8_t const mapq,
134  [[maybe_unused]] mate_type && mate,
135  [[maybe_unused]] tag_dict_type && tag_dict,
136  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
137  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
138 
139 private:
141  bool header_was_read{false};
142 
144  std::string string_buffer{};
145 
147  struct alignment_record_core
148  { // naming corresponds to official SAM/BAM specifications
149  int32_t block_size;
150  int32_t refID;
151  int32_t pos;
152  uint32_t l_read_name:8;
153  uint32_t mapq:8;
154  uint32_t bin:16;
155  uint32_t n_cigar_op:16;
156  sam_flag flag;
157  int32_t l_seq;
158  int32_t next_refID;
159  int32_t next_pos;
160  int32_t tlen;
161  };
162 
164  static constexpr std::array<uint8_t, 256> char_to_sam_rank
165  {
166  [] () constexpr
167  {
169 
170  using index_t = std::make_unsigned_t<char>;
171 
172  // ret['M'] = 0; set anyway by initialization
173  ret[static_cast<index_t>('I')] = 1;
174  ret[static_cast<index_t>('D')] = 2;
175  ret[static_cast<index_t>('N')] = 3;
176  ret[static_cast<index_t>('S')] = 4;
177  ret[static_cast<index_t>('H')] = 5;
178  ret[static_cast<index_t>('P')] = 6;
179  ret[static_cast<index_t>('=')] = 7;
180  ret[static_cast<index_t>('X')] = 8;
181 
182  return ret;
183  }()
184  };
185 
187  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
188  {
189  --end;
190  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
191  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
192  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
193  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
194  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
195  return 0;
196  }
197 
198  using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
199 
206  template <typename stream_view_type, std::integral number_type>
207  void read_field(stream_view_type && stream_view, number_type & target)
208  {
209  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
210  }
211 
217  template <typename stream_view_type>
218  void read_field(stream_view_type && stream_view, float & target)
219  {
220  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
221  }
222 
233  template <typename stream_view_type, typename optional_value_type>
234  void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
235  {
236  optional_value_type tmp;
237  read_field(std::forward<stream_view_type>(stream_view), tmp);
238  target = tmp;
239  }
240 
241  template <typename stream_view_type, typename value_type>
242  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
243  stream_view_type && stream_view,
244  value_type const & SEQAN3_DOXYGEN_ONLY(value));
245 
246  template <typename stream_view_type>
247  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
248 
249  template <typename cigar_input_type>
250  auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
251 
252  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
253 };
254 
256 template <typename stream_type, // constraints checked by file
257  typename seq_legal_alph_type,
258  typename ref_seqs_type,
259  typename ref_ids_type,
260  typename seq_type,
261  typename id_type,
262  typename offset_type,
263  typename ref_seq_type,
264  typename ref_id_type,
265  typename ref_offset_type,
266  typename align_type,
267  typename cigar_type,
268  typename flag_type,
269  typename mapq_type,
270  typename qual_type,
271  typename mate_type,
272  typename tag_dict_type,
273  typename e_value_type,
274  typename bit_score_type>
275 inline void format_bam::read_alignment_record(stream_type & stream,
276  sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
277  ref_seqs_type & ref_seqs,
279  seq_type & seq,
280  qual_type & qual,
281  id_type & id,
282  offset_type & offset,
283  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
284  ref_id_type & ref_id,
285  ref_offset_type & ref_offset,
286  align_type & align,
287  cigar_type & cigar_vector,
288  flag_type & flag,
289  mapq_type & mapq,
290  mate_type & mate,
291  tag_dict_type & tag_dict,
292  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
293  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
294 {
295  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
296  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
297  "The ref_offset must be a specialisation of std::optional.");
298 
299  static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
300  "The type of field::mapq must be uint8_t.");
301 
302  static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
303  "The type of field::flag must be seqan3::sam_flag.");
304 
305  auto stream_view = seqan3::detail::istreambuf(stream);
306 
307  // these variables need to be stored to compute the ALIGNMENT
308  [[maybe_unused]] int32_t offset_tmp{};
309  [[maybe_unused]] int32_t soft_clipping_end{};
310  [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
311  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
312 
313  // Header
314  // -------------------------------------------------------------------------------------------------------------
315  if (!header_was_read)
316  {
317  // magic BAM string
318  if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
319  throw format_error{"File is not in BAM format."};
320 
321  int32_t l_text{}; // length of header text including \0 character
322  int32_t n_ref{}; // number of reference sequences
323  int32_t l_name{}; // 1 + length of reference name including \0 character
324  int32_t l_ref{}; // length of reference sequence
325 
326  read_field(stream_view, l_text);
327 
328  if (l_text > 0) // header text is present
329  read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
330 
331  read_field(stream_view, n_ref);
332 
333  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
334  {
335  read_field(stream_view, l_name);
336 
337  string_buffer.resize(l_name - 1);
338  std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
339  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
340 
341  read_field(stream_view, l_ref);
342 
343  if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
344  {
345  // If there was no header text, we parse reference sequences block as header information
346  if (l_text == 0)
347  {
348  auto & reference_ids = header.ref_ids();
349  // put the length of the reference sequence into ref_id_info
350  header.ref_id_info.emplace_back(l_ref, "");
351  // put the reference name into reference_ids
352  reference_ids.push_back(string_buffer);
353  // assign the reference name an ascending reference id (starts at index 0).
354  header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
355  continue;
356  }
357  }
358 
359  auto id_it = header.ref_dict.find(string_buffer);
360 
361  // sanity checks of reference information to existing header object:
362  if (id_it == header.ref_dict.end()) // [unlikely]
363  {
364  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
365  "' found in BAM file header (header.ref_ids():",
366  header.ref_ids(), ").")};
367  }
368  else if (id_it->second != ref_idx) // [unlikely]
369  {
370  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
371  " does not correspond to the position ", id_it->second,
372  " in the header (header.ref_ids():", header.ref_ids(), ").")};
373  }
374  else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
375  {
376  throw format_error{"Provided reference has unequal length as specified in the header."};
377  }
378  }
379 
380  header_was_read = true;
381 
382  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
383  return;
384  }
385 
386  // read alignment record into buffer
387  // -------------------------------------------------------------------------------------------------------------
388  alignment_record_core core;
389  std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
390 
391  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
392  {
393  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
394  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
395  }
396  else if (core.refID > -1) // not unmapped
397  {
398  ref_id = core.refID; // field::ref_id
399  }
400 
401  flag = core.flag; // field::flag
402  mapq = core.mapq; // field::mapq
403 
404  if (core.pos > -1) // [[likely]]
405  ref_offset = core.pos; // field::ref_offset
406 
407  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
408  {
409  if (core.next_refID > -1)
410  get<0>(mate) = core.next_refID;
411 
412  if (core.next_pos > -1) // [[likely]]
413  get<1>(mate) = core.next_pos;
414 
415  get<2>(mate) = core.tlen;
416  }
417 
418  // read id
419  // -------------------------------------------------------------------------------------------------------------
420  read_field(stream_view | detail::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
421  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
422 
423  // read cigar string
424  // -------------------------------------------------------------------------------------------------------------
425  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
426  {
427  std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
428  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
429  // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
430  }
431  else
432  {
433  detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
434  }
435 
436  offset = offset_tmp;
437 
438  // read sequence
439  // -------------------------------------------------------------------------------------------------------------
440  if (core.l_seq > 0) // sequence information is given
441  {
442  auto seq_stream = stream_view
443  | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
445  {
446  return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
447  dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
448  });
449 
450  if constexpr (detail::decays_to_ignore_v<seq_type>)
451  {
452  auto skip_sequence_bytes = [&] ()
453  {
454  detail::consume(seq_stream);
455  if (core.l_seq & 1)
456  std::ranges::next(std::ranges::begin(stream_view));
457  };
458 
459  if constexpr (!detail::decays_to_ignore_v<align_type>)
460  {
461  static_assert(sequence_container<std::remove_reference_t<decltype(get<1>(align))>>,
462  "If you want to read ALIGNMENT but not SEQ, the alignment"
463  " object must store a sequence container at the second (query) position.");
464 
465  if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
466  {
467  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
468  using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
469  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
470 
471  get<1>(align).reserve(seq_length);
472 
473  auto tmp_iter = std::ranges::begin(seq_stream);
474  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
475 
476  if (offset_tmp & 1)
477  {
478  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
479  ++tmp_iter;
480  --seq_length;
481  }
482 
483  for (size_t i = (seq_length / 2); i > 0; --i)
484  {
485  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
486  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
487  ++tmp_iter;
488  }
489 
490  if (seq_length & 1)
491  {
492  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
493  ++tmp_iter;
494  --soft_clipping_end;
495  }
496 
497  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
498  }
499  else
500  {
501  skip_sequence_bytes();
502  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
503  }
504  }
505  else
506  {
507  skip_sequence_bytes();
508  }
509  }
510  else
511  {
512  using alph_t = std::ranges::range_value_t<decltype(seq)>;
513  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
514 
515  for (auto [d1, d2] : seq_stream)
516  {
517  seq.push_back(from_dna16[to_rank(d1)]);
518  seq.push_back(from_dna16[to_rank(d2)]);
519  }
520 
521  if (core.l_seq & 1)
522  {
523  dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
524  seq.push_back(from_dna16[to_rank(d)]);
525  std::ranges::next(std::ranges::begin(stream_view));
526  }
527 
528  if constexpr (!detail::decays_to_ignore_v<align_type>)
529  {
530  assign_unaligned(get<1>(align),
531  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
532  std::ranges::distance(seq) - soft_clipping_end));
533  }
534  }
535  }
536 
537  // read qual string
538  // -------------------------------------------------------------------------------------------------------------
539  read_field(stream_view | detail::take_exactly_or_throw(core.l_seq)
540  | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
541 
542  // All remaining optional fields if any: SAM tags dictionary
543  // -------------------------------------------------------------------------------------------------------------
544  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
545  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
546  assert(remaining_bytes >= 0);
547  auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
548 
549  while (tags_view.size() > 0)
550  read_field(tags_view, tag_dict);
551 
552  // DONE READING - wrap up
553  // -------------------------------------------------------------------------------------------------------------
554  if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
555  {
556  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
557  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
558  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
559  if (core.l_seq != 0 && offset_tmp == core.l_seq)
560  {
561  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
562  { // maybe only throw in debug mode and otherwise return an empty alignment?
563  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
564  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
565  "stored in the optional field CG. You need to read in the field::tags and "
566  "field::seq in order to access this information.")};
567  }
568  else
569  {
570  auto it = tag_dict.find("CG"_tag);
571 
572  if (it == tag_dict.end())
573  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
574  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
575  "stored in the optional field CG but this tag is not present in the given ",
576  "record.")};
577 
578  auto cigar_view = std::views::all(std::get<std::string>(it->second));
579  std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
580  offset_tmp = soft_clipping_end = 0;
581  transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
582  tag_dict.erase(it); // remove redundant information
583 
584  if constexpr (!detail::decays_to_ignore_v<align_type>)
585  {
586  assign_unaligned(get<1>(align),
587  seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
588  std::ranges::distance(seq) - soft_clipping_end));
589  }
590  }
591  }
592  }
593 
594  // Alignment object construction
595  if constexpr (!detail::decays_to_ignore_v<align_type>)
596  construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
597 
598  if constexpr (!detail::decays_to_ignore_v<cigar_type>)
599  std::swap(cigar_vector, tmp_cigar_vector);
600 }
601 
603 template <typename stream_type,
604  typename header_type,
605  typename seq_type,
606  typename id_type,
607  typename ref_seq_type,
608  typename ref_id_type,
609  typename align_type,
610  typename cigar_type,
611  typename qual_type,
612  typename mate_type,
613  typename tag_dict_type>
614 inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
615  [[maybe_unused]] sam_file_output_options const & options,
616  [[maybe_unused]] header_type && header,
617  [[maybe_unused]] seq_type && seq,
618  [[maybe_unused]] qual_type && qual,
619  [[maybe_unused]] id_type && id,
620  [[maybe_unused]] int32_t const offset,
621  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
622  [[maybe_unused]] ref_id_type && ref_id,
623  [[maybe_unused]] std::optional<int32_t> ref_offset,
624  [[maybe_unused]] align_type && align,
625  [[maybe_unused]] cigar_type && cigar_vector,
626  [[maybe_unused]] sam_flag const flag,
627  [[maybe_unused]] uint8_t const mapq,
628  [[maybe_unused]] mate_type && mate,
629  [[maybe_unused]] tag_dict_type && tag_dict,
630  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
631  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
632 {
633  // ---------------------------------------------------------------------
634  // Type Requirements (as static asserts for user friendliness)
635  // ---------------------------------------------------------------------
636  static_assert((std::ranges::forward_range<seq_type> &&
637  alphabet<std::ranges::range_reference_t<seq_type>>),
638  "The seq object must be a std::ranges::forward_range over "
639  "letters that model seqan3::alphabet.");
640 
641  static_assert((std::ranges::forward_range<id_type> &&
642  alphabet<std::ranges::range_reference_t<id_type>>),
643  "The id object must be a std::ranges::forward_range over "
644  "letters that model seqan3::alphabet.");
645 
646  static_assert((std::ranges::forward_range<ref_seq_type> &&
647  alphabet<std::ranges::range_reference_t<ref_seq_type>>),
648  "The ref_seq object must be a std::ranges::forward_range "
649  "over letters that model seqan3::alphabet.");
650 
651  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
652  {
653  static_assert((std::ranges::forward_range<ref_id_type> ||
654  std::integral<std::remove_reference_t<ref_id_type>> ||
655  detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
656  "The ref_id object must be a std::ranges::forward_range "
657  "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
658  }
659 
661  "The align object must be a std::pair of two ranges whose "
662  "value_type is comparable to seqan3::gap");
663 
664  static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
665  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
666  std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
667  "The align object must be a std::pair of two ranges whose "
668  "value_type is comparable to seqan3::gap");
669 
670  static_assert((std::ranges::forward_range<qual_type> &&
671  alphabet<std::ranges::range_reference_t<qual_type>>),
672  "The qual object must be a std::ranges::forward_range "
673  "over letters that model seqan3::alphabet.");
674 
676  "The mate object must be a std::tuple of size 3 with "
677  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
678  "2) a std::integral or std::optional<std::integral>, and "
679  "3) a std::integral.");
680 
681  static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
682  std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
683  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
684  (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
685  detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
686  std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
687  "The mate object must be a std::tuple of size 3 with "
688  "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
689  "2) a std::integral or std::optional<std::integral>, and "
690  "3) a std::integral.");
691 
692  static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
693  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
694 
695  if constexpr (detail::decays_to_ignore_v<header_type>)
696  {
697  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
698  "You can either construct the output file with ref_ids and ref_seqs information and "
699  "the header will be created for you, or you can access the `header` member directly."};
700  }
701  else
702  {
703  // ---------------------------------------------------------------------
704  // logical Requirements
705  // ---------------------------------------------------------------------
706 
707  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
708  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
709 
710  detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
711 
712  // ---------------------------------------------------------------------
713  // Writing the BAM Header on first call
714  // ---------------------------------------------------------------------
715  if (!header_was_written)
716  {
717  stream << "BAM\1";
719  write_header(os, options, header); // write SAM header to temporary stream to query the size.
720  int32_t l_text{static_cast<int32_t>(os.str().size())};
721  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
722 
723  stream << os.str();
724 
725  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
726  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
727 
728  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
729  {
730  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
731  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
732  // write reference name:
733  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
734  stream_it = '\0';
735  // write reference sequence length:
736  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
737  }
738 
739  header_was_written = true;
740  }
741 
742  // ---------------------------------------------------------------------
743  // Writing the Record
744  // ---------------------------------------------------------------------
745  int32_t ref_length{};
746 
747  // if alignment is non-empty, replace cigar_vector.
748  // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
749  if (!std::ranges::empty(cigar_vector))
750  {
751  int32_t dummy_seq_length{};
752  for (auto & [count, operation] : cigar_vector)
753  detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
754  }
755  else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
756  {
757  ref_length = std::ranges::distance(get<1>(align));
758 
759  // compute possible distance from alignment end to sequence end
760  // which indicates soft clipping at the end.
761  // This should be replaced by a free count_gaps function for
762  // aligned sequences which is more efficient if possible.
763  int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
764 
765  for (auto chr : get<1>(align))
766  if (chr == gap{})
767  ++off_end;
768 
769  off_end -= ref_length;
770  cigar_vector = detail::get_cigar_vector(align, offset, off_end);
771  }
772 
773  if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
774  {
775  tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
776  cigar_vector.resize(2);
777  cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
778  cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
779  }
780 
781  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
782 
783  // Compute the value for the l_read_name field for the bam record.
784  // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
785  // the data type to store the value is uint8_t and 255 is the maximal size.
786  // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
787  // 2 bytes.
788  uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
789  read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
790 
791  alignment_record_core core
792  {
793  /* block_size */ 0, // will be initialised right after
794  /* refID */ -1, // will be initialised right after
795  /* pos */ ref_offset.value_or(-1),
796  /* l_read_name */ read_name_size,
797  /* mapq */ mapq,
798  /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
799  /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
800  /* flag */ flag,
801  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
802  /* next_refId */ -1, // will be initialised right after
803  /* next_pos */ get<1>(mate).value_or(-1),
804  /* tlen */ get<2>(mate)
805  };
806 
807  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
808  [[maybe_unused]] auto & id_target)
809  {
810  using id_t = std::remove_reference_t<decltype(id_source)>;
811 
812  if constexpr (!detail::decays_to_ignore_v<id_t>)
813  {
814  if constexpr (std::integral<id_t>)
815  {
816  id_target = id_source;
817  }
818  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
819  {
820  id_target = id_source.value_or(-1);
821  }
822  else
823  {
824  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
825  {
826  auto id_it = header.ref_dict.end();
827 
828  if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
829  std::ranges::sized_range<decltype(id_source)> &&
830  std::ranges::borrowed_range<decltype(id_source)>)
831  {
832  id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
833  std::ranges::size(id_source)});
834  }
835  else
836  {
837  using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
838 
839  static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
840  "The ref_id type is not convertible to the reference id information stored in the "
841  "reference dictionary of the header object.");
842 
843  id_it = header.ref_dict.find(id_source);
844  }
845 
846  if (id_it == header.ref_dict.end())
847  {
848  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
849  "not be found in BAM header ref_dict: ",
850  header.ref_dict, ".")};
851  }
852 
853  id_target = id_it->second;
854  }
855  }
856  }
857  };
858 
859  // initialise core.refID
860  check_and_assign_id_to(ref_id, core.refID);
861 
862  // initialise core.next_refID
863  check_and_assign_id_to(get<0>(mate), core.next_refID);
864 
865  // initialise core.block_size
866  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
867  core.l_read_name +
868  core.n_cigar_op * 4 + // each int32_t has 4 bytes
869  (core.l_seq + 1) / 2 + // bitcompressed seq
870  core.l_seq + // quality string
871  tag_dict_binary_str.size();
872 
873  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
874 
875  if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
876  stream_it = '*';
877  else
878  std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
879  stream_it = '\0';
880 
881  // write cigar
882  for (auto [cigar_count, op] : cigar_vector)
883  {
884  cigar_count = cigar_count << 4;
885  cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
886  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
887  }
888 
889  // write seq (bit-compressed: dna16sam characters go into one byte)
890  using alph_t = std::ranges::range_value_t<seq_type>;
891  constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
892 
893  auto sit = std::ranges::begin(seq);
894  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
895  {
896  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
897  ++sidx, ++sit;
898  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
899  stream_it = static_cast<char>(compressed_chr);
900  }
901 
902  if (core.l_seq & 1) // write one more
903  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
904 
905  // write qual
906  if (std::ranges::empty(qual))
907  {
908  auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
909  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
910  }
911  else
912  {
913  if (std::ranges::distance(qual) != core.l_seq)
914  throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
915  core.l_seq, ". Got quality with size ",
916  std::ranges::distance(qual), " instead.")};
917 
918  auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
919  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
920  }
921 
922  // write optional fields
923  stream << tag_dict_binary_str;
924  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
925 }
926 
928 template <typename stream_view_type, typename value_type>
929 inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
930  stream_view_type && stream_view,
931  value_type const & SEQAN3_DOXYGEN_ONLY(value))
932 {
933  int32_t count;
934  read_field(stream_view, count); // read length of vector
935  std::vector<value_type> tmp_vector;
936  tmp_vector.reserve(count);
937 
938  while (count > 0)
939  {
940  value_type tmp{};
941  read_field(stream_view, tmp);
942  tmp_vector.push_back(std::move(tmp));
943  --count;
944  }
945  variant = std::move(tmp_vector);
946 }
947 
965 template <typename stream_view_type>
966 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
967 {
968  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
969  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
970  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
971  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
972  by the length (int32_t) of the array, followed by the values.
973  */
974  auto it = std::ranges::begin(stream_view);
975  uint16_t tag = static_cast<uint16_t>(*it) << 8;
976  ++it; // skip char read before
977 
978  tag += static_cast<uint16_t>(*it);
979  ++it; // skip char read before
980 
981  char type_id = *it;
982  ++it; // skip char read before
983 
984  switch (type_id)
985  {
986  case 'A' : // char
987  {
988  target[tag] = *it;
989  ++it; // skip char that has been read
990  break;
991  }
992  // all integer sizes are possible
993  case 'c' : // int8_t
994  {
995  int8_t tmp;
996  read_field(stream_view, tmp);
997  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
998  break;
999  }
1000  case 'C' : // uint8_t
1001  {
1002  uint8_t tmp;
1003  read_field(stream_view, tmp);
1004  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1005  break;
1006  }
1007  case 's' : // int16_t
1008  {
1009  int16_t tmp;
1010  read_field(stream_view, tmp);
1011  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1012  break;
1013  }
1014  case 'S' : // uint16_t
1015  {
1016  uint16_t tmp;
1017  read_field(stream_view, tmp);
1018  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1019  break;
1020  }
1021  case 'i' : // int32_t
1022  {
1023  int32_t tmp;
1024  read_field(stream_view, tmp);
1025  target[tag] = std::move(tmp); // readable sam format only allows int32_t
1026  break;
1027  }
1028  case 'I' : // uint32_t
1029  {
1030  uint32_t tmp;
1031  read_field(stream_view, tmp);
1032  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1033  break;
1034  }
1035  case 'f' : // float
1036  {
1037  float tmp;
1038  read_field(stream_view, tmp);
1039  target[tag] = tmp;
1040  break;
1041  }
1042  case 'Z' : // string
1043  {
1044  string_buffer.clear();
1045  while (!is_char<'\0'>(*it))
1046  {
1047  string_buffer.push_back(*it);
1048  ++it;
1049  }
1050  ++it; // skip \0
1051  target[tag] = string_buffer;
1052  break;
1053  }
1054  case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1055  {
1056  std::vector<std::byte> byte_array;
1057  std::byte value;
1058  while (!is_char<'\0'>(*it))
1059  {
1060  string_buffer.clear();
1061  string_buffer.push_back(*it);
1062  ++it;
1063 
1064  if (*it == '\0')
1065  throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1066 
1067  string_buffer.push_back(*it);
1068  ++it;
1069  read_field(string_buffer, value);
1070  byte_array.push_back(value);
1071  }
1072  ++it; // skip \0
1073  target[tag] = byte_array;
1074  break;
1075  }
1076  case 'B' : // Array. Value type depends on second char [cCsSiIf]
1077  {
1078  char array_value_type_id = *it;
1079  ++it; // skip char read before
1080 
1081  switch (array_value_type_id)
1082  {
1083  case 'c' : // int8_t
1084  read_sam_dict_vector(target[tag], stream_view, int8_t{});
1085  break;
1086  case 'C' : // uint8_t
1087  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1088  break;
1089  case 's' : // int16_t
1090  read_sam_dict_vector(target[tag], stream_view, int16_t{});
1091  break;
1092  case 'S' : // uint16_t
1093  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1094  break;
1095  case 'i' : // int32_t
1096  read_sam_dict_vector(target[tag], stream_view, int32_t{});
1097  break;
1098  case 'I' : // uint32_t
1099  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1100  break;
1101  case 'f' : // float
1102  read_sam_dict_vector(target[tag], stream_view, float{});
1103  break;
1104  default:
1105  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1106  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1107  }
1108  break;
1109  }
1110  default:
1111  throw format_error{detail::to_string("The second character in the numerical id of a "
1112  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1113  }
1114 }
1115 
1130 template <typename cigar_input_type>
1131 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1132 {
1133  std::vector<cigar> operations{};
1134  char operation{'\0'};
1135  uint32_t count{};
1136  int32_t ref_length{}, seq_length{};
1137  uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1138  constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1139  constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1140 
1141  if (n_cigar_op == 0) // [[unlikely]]
1142  return std::tuple{operations, ref_length, seq_length};
1143 
1144  // parse the rest of the cigar
1145  // -------------------------------------------------------------------------------------------------------------
1146  while (n_cigar_op > 0) // until stream is not empty
1147  {
1148  std::ranges::copy_n(std::ranges::begin(cigar_input),
1149  sizeof(operation_and_count),
1150  reinterpret_cast<char*>(&operation_and_count));
1151  operation = cigar_mapping[operation_and_count & cigar_mask];
1152  count = operation_and_count >> 4;
1153 
1154  detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1155  operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1156  --n_cigar_op;
1157  }
1158 
1159  return std::tuple{operations, ref_length, seq_length};
1160 }
1161 
1165 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1166 {
1167  std::string result{};
1168 
1169  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
1170  {
1171  // T is either char, int32_t, float, std::string, or a std::vector<some int>
1172  using T = std::remove_cvref_t<decltype(arg)>;
1173 
1174  if constexpr (std::same_as<T, int32_t>)
1175  {
1176  // always choose the smallest possible representation [cCsSiI]
1177  size_t const absolute_arg = std::abs(arg);
1178  auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1179  bool const negative = arg < 0;
1180  n = n * n + 2 * negative; // for switch case order
1181 
1182  switch (n)
1183  {
1184  case 0:
1185  {
1186  result[result.size() - 1] = 'C';
1187  result.append(reinterpret_cast<char const *>(&arg), 1);
1188  break;
1189  }
1190  case 1:
1191  {
1192  result[result.size() - 1] = 'S';
1193  result.append(reinterpret_cast<char const *>(&arg), 2);
1194  break;
1195  }
1196  case 2:
1197  {
1198  result[result.size() - 1] = 'c';
1199  int8_t tmp = static_cast<int8_t>(arg);
1200  result.append(reinterpret_cast<char const *>(&tmp), 1);
1201  break;
1202  }
1203  case 3:
1204  {
1205  result[result.size() - 1] = 's';
1206  int16_t tmp = static_cast<int16_t>(arg);
1207  result.append(reinterpret_cast<char const *>(&tmp), 2);
1208  break;
1209  }
1210  default:
1211  {
1212  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1213  break;
1214  }
1215  }
1216  }
1217  else if constexpr (std::same_as<T, std::string>)
1218  {
1219  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1220  }
1221  else if constexpr (!std::ranges::range<T>) // char, float
1222  {
1223  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1224  }
1225  else // std::vector of some arithmetic_type type
1226  {
1227  int32_t sz{static_cast<int32_t>(arg.size())};
1228  result.append(reinterpret_cast<char *>(&sz), 4);
1229  result.append(reinterpret_cast<char const *>(arg.data()),
1230  arg.size() * sizeof(std::ranges::range_value_t<T>));
1231  }
1232  };
1233 
1234  for (auto & [tag, variant] : tag_dict)
1235  {
1236  result.push_back(static_cast<char>(tag / 256));
1237  result.push_back(static_cast<char>(tag % 256));
1238 
1239  result.push_back(detail::sam_tag_type_char[variant.index()]);
1240 
1241  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1242  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1243 
1244  std::visit(stream_variant_fn, variant);
1245  }
1246 
1247  return result;
1248 }
1249 
1250 } // namespace seqan3
T begin(T... args)
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:264
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:48
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:275
void write_alignment_record([[maybe_unused]] stream_type &stream, [[maybe_unused]] sam_file_output_options const &options, [[maybe_unused]] header_type &&header, [[maybe_unused]] seq_type &&seq, [[maybe_unused]] qual_type &&qual, [[maybe_unused]] id_type &&id, [[maybe_unused]] int32_t const offset, [[maybe_unused]] ref_seq_type &&ref_seq, [[maybe_unused]] ref_id_type &&ref_id, [[maybe_unused]] std::optional< int32_t > ref_offset, [[maybe_unused]] align_type &&align, [[maybe_unused]] cigar_type &&cigar_vector, [[maybe_unused]] sam_flag const flag, [[maybe_unused]] uint8_t const mapq, [[maybe_unused]] mate_type &&mate, [[maybe_unused]] tag_dict_type &&tag_dict, [[maybe_unused]] double e_value, [[maybe_unused]] double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:614
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:65
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:32
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:157
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:154
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:118
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:332
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:95
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
Adaptations of concepts from the Ranges TS.
T reserve(T... args)
T resize(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
Provides the seqan3::sam_file_header class.
Provides seqan3::sam_file_input_options.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
T size(T... args)
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:88
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:24
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T tuple_size_v
Provides seqan3::views::slice.
T visit(T... args)