SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <vector>
18 
19 #include <seqan3/alphabet/detail/convert.hpp>
43 #include <seqan3/std/concepts>
44 #include <seqan3/std/ranges>
45 
46 namespace seqan3
47 {
48 
59 struct format_bam
60 {
63  {
64  { "bam" }
65  };
66 };
67 
68 } // namespace seqan3
69 
70 namespace seqan3::detail
71 {
72 
74 struct alignment_record_core
75 { // naming corresponds to official SAM/BAM specifications
76  int32_t block_size;
77  int32_t refID;
78  int32_t pos;
79  uint32_t l_read_name:8;
80  uint32_t mapq:8;
81  uint32_t bin:16;
82  uint32_t n_cigar_op:16;
83  uint32_t flag:16;
84  int32_t l_seq;
85  int32_t next_refID;
86  int32_t next_pos;
87  int32_t tlen;
88 };
89 
92 template <>
93 class alignment_file_input_format<format_bam> : alignment_file_input_format<format_sam>
94 {
95 public:
97  using format_tag = format_bam;
98 
102  alignment_file_input_format() noexcept = default;
103  alignment_file_input_format(alignment_file_input_format const &) = delete;
106  alignment_file_input_format & operator=(alignment_file_input_format const &) = delete;
107  alignment_file_input_format(alignment_file_input_format &&) noexcept = default;
108  alignment_file_input_format & operator=(alignment_file_input_format &&) noexcept = default;
109  ~alignment_file_input_format() noexcept = default;
110 
113  template <typename stream_type, // constraints checked by file
114  typename seq_legal_alph_type,
115  typename ref_seqs_type,
116  typename ref_ids_type,
117  typename seq_type,
118  typename id_type,
119  typename offset_type,
120  typename ref_seq_type,
121  typename ref_id_type,
122  typename ref_offset_type,
123  typename align_type,
124  typename flag_type,
125  typename mapq_type,
126  typename qual_type,
127  typename mate_type,
128  typename tag_dict_type,
129  typename e_value_type,
130  typename bit_score_type>
131  void read(stream_type & stream,
132  alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
133  ref_seqs_type & ref_seqs,
134  alignment_file_header<ref_ids_type> & header,
135  seq_type & seq,
136  qual_type & qual,
137  id_type & id,
138  offset_type & offset,
139  ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
140  ref_id_type & ref_id,
141  ref_offset_type & ref_offset,
142  align_type & align,
143  flag_type & flag,
144  mapq_type & mapq,
145  mate_type & mate,
146  tag_dict_type & tag_dict,
147  e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
148  bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
149  {
150  static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
151  detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
152  "The ref_offset must be a specialisation of std::optional.");
153 
154  static_assert(detail::decays_to_ignore_v<mapq_type> || std::Same<mapq_type, uint8_t>,
155  "The type of field::MAPQ must be uint8_t.");
156 
157  static_assert(detail::decays_to_ignore_v<flag_type> || std::Same<flag_type, uint16_t>,
158  "The type of field::FLAG must be uint8_t.");
159 
161  auto stream_view = std::ranges::subrange<decltype(stream_buf_t{stream}), decltype(stream_buf_t{})>
162  {stream_buf_t{stream}, stream_buf_t{}};
163 
164  // these variables need to be stored to compute the ALIGNMENT
165  [[maybe_unused]] int32_t offset_tmp{};
166  [[maybe_unused]] int32_t soft_clipping_end{};
167  [[maybe_unused]] std::vector<std::pair<char, size_t>> cigar{};
168  [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
169 
170  // Header
171  // -------------------------------------------------------------------------------------------------------------
172  if (!header_was_read)
173  {
174  // magic BAM string
175  if (!std::ranges::equal(stream_view | view::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
176  throw format_error{"File is not in BAM format."};
177 
178  int32_t tmp32{};
179  read_field(stream_view, tmp32);
180 
181  if (tmp32 > 0) // header text is present
182  read_header(stream_view | view::take_exactly_or_throw(tmp32)
183  | view::take_until_and_consume(is_char<'\0'>),
184  header,
185  ref_seqs);
186 
187  int32_t n_ref;
188  read_field(stream_view, n_ref);
189 
190  for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
191  {
192  read_field(stream_view, tmp32); // l_name (length of reference name including \0 character)
193 
194  string_buffer.resize(tmp32 - 1);
195  std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.data()); // copy without \0 character
196  std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
197 
198  read_field(stream_view, tmp32); // l_ref (length of reference sequence)
199 
200  auto id_it = header.ref_dict.find(string_buffer);
201 
202  // sanity checks of reference information to existing header object:
203  if (id_it == header.ref_dict.end()) // [unlikely]
204  {
205  throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
206  "' found in BAM file header (header.ref_ids():",
207  header.ref_ids(), ").")};
208  }
209  else if (id_it->second != ref_idx) // [unlikely]
210  {
211  throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
212  " does not correspond to the position ", id_it->second,
213  " in the header (header.ref_ids():", header.ref_ids(), ").")};
214  }
215  else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32) // [unlikely]
216  {
217  throw format_error{"Provided reference has unequal length as specified in the header."};
218  }
219  }
220 
221  header_was_read = true;
222 
223  if (stream_buf_t{stream} == stream_buf_t{}) // no records follow
224  return;
225  }
226 
227  // read alignment record into buffer
228  // -------------------------------------------------------------------------------------------------------------
229  alignment_record_core core;
230  std::ranges::copy_n(stream_view.begin(), sizeof(core), reinterpret_cast<char *>(&core));
231 
232  if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
233  {
234  throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
235  "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
236  }
237  else if (core.refID > -1) // not unmapped
238  {
239  ref_id = core.refID; // field::REF_ID
240  }
241 
242  flag = core.flag; // field::FLAG
243  mapq = core.mapq; // field::MAPQ
244 
245  if (core.pos > -1) // [[likely]]
246  ref_offset = core.pos; // field::REF_OFFSET
247 
248  if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::MATE
249  {
250  if (core.next_refID > -1)
251  get<0>(mate) = core.next_refID;
252 
253  if (core.next_pos > -1) // [[likely]]
254  get<1>(mate) = core.next_pos;
255 
256  get<2>(mate) = core.tlen;
257  }
258 
259  // read id
260  // -------------------------------------------------------------------------------------------------------------
261  read_field(stream_view | view::take_exactly_or_throw(core.l_read_name - 1), id); // field::ID
262  std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
263 
264  // read cigar string
265  // -------------------------------------------------------------------------------------------------------------
266  if constexpr (!detail::decays_to_ignore_v<align_type>)
267  {
268  std::tie(cigar, ref_length, seq_length, offset_tmp, soft_clipping_end) =
269  detail::parse_binary_cigar(stream_view, core.n_cigar_op);
270  }
271  else
272  {
273  detail::consume(stream_view | view::take_exactly_or_throw(core.n_cigar_op * 4));
274  }
275 
276  offset = offset_tmp;
277 
278  // read sequence
279  // -------------------------------------------------------------------------------------------------------------
280  if (core.l_seq > 0) // sequence information is given
281  {
282  auto seq_stream = stream_view
283  | view::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
285  {
286  return {sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
287  sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
288  });
289 
290  if constexpr (detail::decays_to_ignore_v<seq_type>)
291  {
292  if constexpr (!detail::decays_to_ignore_v<align_type>)
293  {
294  static_assert(SequenceContainer<std::remove_reference_t<decltype(get<1>(align))>>,
295  "If you want to read ALIGNMENT but not SEQ, the alignment"
296  " object must store a sequence container at the second (query) position.");
297 
298  if (!cigar.empty()) // only parse alignment if cigar information was given
299  {
300  assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
301  using alph_t = value_type_t<decltype(get<1>(align))>;
302  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
303 
304  get<1>(align).reserve(seq_length);
305 
306  auto tmp_iter = std::ranges::begin(seq_stream);
307  std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
308 
309  if (offset_tmp & 1)
310  {
311  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
312  ++tmp_iter;
313  --seq_length;
314  }
315 
316  for (size_t i = (seq_length / 2); i > 0; --i)
317  {
318  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
319  get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
320  ++tmp_iter;
321  }
322 
323  if (seq_length & 1)
324  {
325  get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
326  ++tmp_iter;
327  --soft_clipping_end;
328  }
329 
330  std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
331  }
332  else
333  {
334  get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
335  }
336  }
337  else
338  {
339  detail::consume(seq_stream);
340  if (core.l_seq & 1)
342  }
343  }
344  else
345  {
346  using alph_t = value_type_t<decltype(seq)>;
347  constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
348 
349  for (auto [d1, d2] : seq_stream)
350  {
351  seq.push_back(from_dna16[to_rank(d1)]);
352  seq.push_back(from_dna16[to_rank(d2)]);
353  }
354 
355  if (core.l_seq & 1)
356  {
357  sam_dna16 d = sam_dna16{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
358  seq.push_back(from_dna16[to_rank(d)]);
360  }
361 
362  if constexpr (!detail::decays_to_ignore_v<align_type>)
363  {
364  assign_unaligned(get<1>(align),
365  seq | view::slice(static_cast<decltype(std::ranges::distance(seq))>(offset_tmp),
366  std::ranges::distance(seq) - soft_clipping_end));
367  }
368  }
369  }
370 
371  // read qual string
372  // -------------------------------------------------------------------------------------------------------------
373  read_field(stream_view | view::take_exactly_or_throw(core.l_seq)
374  | std::view::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
375 
376  // All remaining optional fields if any: SAM tags dictionary
377  // -------------------------------------------------------------------------------------------------------------
378  int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
379  core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
380  assert(remaining_bytes >= 0);
381  auto tags_view = stream_view | view::take_exactly_or_throw(remaining_bytes);
382 
383  while (tags_view.size() > 0)
384  read_field(tags_view, tag_dict);
385 
386  // DONE READING - wrap up
387  // -------------------------------------------------------------------------------------------------------------
388  if constexpr (!detail::decays_to_ignore_v<align_type>)
389  {
390  // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
391  // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
392  // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
393  if (core.l_seq != 0 && offset_tmp == core.l_seq)
394  {
395  if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
396  { // maybe only throw in debug mode and otherwise return an empty alignment?
397  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
398  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
399  "stored in the optional field CG. You need to read in the field::TAGS and "
400  "field::SEQ in order to access this information.")};
401  }
402  else
403  {
404  auto it = tag_dict.find("CG"_tag);
405 
406  if (it == tag_dict.end())
407  throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
408  "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
409  "stored in the optional field CG but this tag is not present in the given ",
410  "record.")};
411 
412  auto cigar_view = std::view::all(std::get<std::string>(it->second));
413  std::tie(cigar, ref_length, seq_length, offset_tmp, soft_clipping_end) = detail::parse_cigar(cigar_view);
414  assign_unaligned(get<1>(align),
415  seq | view::slice(static_cast<decltype(std::ranges::distance(seq))>(offset_tmp),
416  std::ranges::distance(seq) - soft_clipping_end));
417  tag_dict.erase(it); // remove redundant information
418  }
419  }
420 
421  // Alignment object construction
422  construct_alignment(align, cigar, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM format
423  }
424  }
425 
426 protected:
428 
430  bool header_was_read{false};
431 
433  std::string string_buffer{};
434 
435  // inherit read_field function from format_sam
436  using alignment_file_input_format<format_sam>::read_field;
437 
444  template <typename stream_view_type, std::Integral number_type>
445  void read_field(stream_view_type && stream_view, number_type & target)
446  {
447  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
448  }
449 
456  template <typename stream_view_type>
457  void read_field(stream_view_type && stream_view, float & target)
458  {
459  std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
460  }
461 
463  template <typename stream_view_type, typename value_type>
464  void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
465  stream_view_type && stream_view,
466  value_type const & SEQAN3_DOXYGEN_ONLY(value))
467  {
468  int32_t count;
469  read_field(stream_view, count); // read length of vector
470  std::vector<value_type> tmp_vector;
471  tmp_vector.reserve(count);
472 
473  while (count > 0)
474  {
475  value_type tmp{};
476  read_field(stream_view, tmp);
477  tmp_vector.push_back(std::move(tmp));
478  --count;
479  }
480  variant = std::move(tmp_vector);
481  }
482 
500  template <typename stream_view_type>
501  void read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
502  {
503  /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
504  name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
505  describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
506  VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
507  by the length (int32_t) of the array, followed by the values.
508  */
509  uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
510  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
511  tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
512  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
513  char type_id = static_cast<char>(*std::ranges::begin(stream_view));
514  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
515 
516  switch (type_id)
517  {
518  case 'A' : // char
519  {
520  target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
521  std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
522  break;
523  }
524  // all integer sizes are possible
525  case 'c' : // int8_t
526  {
527  int8_t tmp;
528  read_field(stream_view, tmp);
529  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
530  break;
531  }
532  case 'C' : // uint8_t
533  {
534  uint8_t tmp;
535  read_field(stream_view, tmp);
536  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
537  break;
538  }
539  case 's' : // int16_t
540  {
541  int16_t tmp;
542  read_field(stream_view, tmp);
543  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
544  break;
545  }
546  case 'S' : // uint16_t
547  {
548  uint16_t tmp;
549  read_field(stream_view, tmp);
550  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
551  break;
552  }
553  case 'i' : // int32_t
554  {
555  int32_t tmp;
556  read_field(stream_view, tmp);
557  target[tag] = std::move(tmp); // readable sam format only allows int32_t
558  break;
559  }
560  case 'I' : // uint32_t
561  {
562  uint32_t tmp;
563  read_field(stream_view, tmp);
564  target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
565  break;
566  }
567  case 'f' : // float
568  {
569  float tmp;
570  read_field(stream_view, tmp);
571  target[tag] = tmp;
572  break;
573  }
574  case 'Z' : // string
575  {
576  string_buffer.clear();
577  while (!is_char<'\0'>(*std::ranges::begin(stream_view)))
578  {
579  string_buffer.push_back(*std::ranges::begin(stream_view));
581  }
582  std::ranges::next(std::ranges::begin(stream_view)); // skip \0
583  target[tag] = string_buffer;
584  break;
585  }
586  case 'H' :
587  {
588  // TODO
589  break;
590  }
591  case 'B' : // Array. Value type depends on second char [cCsSiIf]
592  {
593  char array_value_type_id = *std::ranges::begin(stream_view);
594  std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
595 
596  switch (array_value_type_id)
597  {
598  case 'c' : // int8_t
599  {
600  read_sam_dict_vector(target[tag], stream_view, int8_t{});
601  break;
602  }
603  case 'C' : // uint8_t
604  {
605  read_sam_dict_vector(target[tag], stream_view, uint8_t{});
606  break;
607  }
608  case 's' : // int16_t
609  {
610  read_sam_dict_vector(target[tag], stream_view, int16_t{});
611  break;
612  }
613  case 'S' : // uint16_t
614  {
615  read_sam_dict_vector(target[tag], stream_view, uint16_t{});
616  break;
617  }
618  case 'i' : // int32_t
619  {
620  read_sam_dict_vector(target[tag], stream_view, int32_t{});
621  break;
622  }
623  case 'I' : // uint32_t
624  {
625  read_sam_dict_vector(target[tag], stream_view, uint32_t{});
626  break;
627  }
628  case 'f' : // float
629  {
630  read_sam_dict_vector(target[tag], stream_view, float{});
631  break;
632  }
633  default:
634  throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
635  "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
636  }
637  break;
638  }
639  default:
640  throw format_error{detail::to_string("The second character in the numerical id of a "
641  "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
642  }
643  }
644 };
645 
648 template <>
649 class alignment_file_output_format<format_bam> : alignment_file_output_format<format_sam>
650 {
651 public:
653  using format_tag = format_bam;
654 
658  alignment_file_output_format() noexcept = default;
659  alignment_file_output_format(alignment_file_output_format const &) = delete;
662  alignment_file_output_format & operator=(alignment_file_output_format const &) = delete;
663  alignment_file_output_format(alignment_file_output_format &&) noexcept = default;
664  alignment_file_output_format & operator=(alignment_file_output_format &&) noexcept = default;
665  ~alignment_file_output_format() noexcept = default;
666 
669  template <typename stream_type,
670  typename header_type,
671  typename seq_type,
672  typename id_type,
673  typename ref_seq_type,
674  typename ref_id_type,
675  typename align_type,
676  typename qual_type,
677  typename mate_type,
678  typename tag_dict_type>
679  void write([[maybe_unused]] stream_type & stream,
680  [[maybe_unused]] alignment_file_output_options const & options,
681  [[maybe_unused]] header_type && header,
682  [[maybe_unused]] seq_type && seq,
683  [[maybe_unused]] qual_type && qual,
684  [[maybe_unused]] id_type && id,
685  [[maybe_unused]] int32_t offset,
686  [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
687  [[maybe_unused]] ref_id_type && ref_id,
688  [[maybe_unused]] std::optional<int32_t> ref_offset,
689  [[maybe_unused]] align_type && align,
690  [[maybe_unused]] uint16_t flag,
691  [[maybe_unused]] uint8_t mapq,
692  [[maybe_unused]] mate_type && mate,
693  [[maybe_unused]] tag_dict_type && tag_dict,
694  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
695  [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
696  {
697  // ---------------------------------------------------------------------
698  // Type Requirements (as static asserts for user friendliness)
699  // ---------------------------------------------------------------------
700  static_assert((std::ranges::ForwardRange<seq_type> &&
701  Alphabet<reference_t<seq_type>>),
702  "The seq object must be a std::ranges::ForwardRange over "
703  "letters that model seqan3::Alphabet.");
704 
705  static_assert((std::ranges::ForwardRange<id_type> &&
706  Alphabet<reference_t<id_type>>),
707  "The id object must be a std::ranges::ForwardRange over "
708  "letters that model seqan3::Alphabet.");
709 
711  Alphabet<reference_t<ref_seq_type>>),
712  "The ref_seq object must be a std::ranges::ForwardRange "
713  "over letters that model seqan3::Alphabet.");
714 
715  if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
716  {
719  detail::is_type_specialisation_of_v<remove_cvref_t<ref_id_type>, std::optional>),
720  "The ref_id object must be a std::ranges::ForwardRange "
721  "over letters that model seqan3::Alphabet or an Integral or a std::optional<Integral>.");
722  }
723 
724  static_assert(TupleLike<remove_cvref_t<align_type>>,
725  "The align object must be a std::pair of two ranges whose "
726  "value_type is comparable to seqan3::gap");
727 
728  static_assert((std::tuple_size_v<remove_cvref_t<align_type>> == 2 &&
729  std::EqualityComparableWith<gap, reference_t<decltype(std::get<0>(align))>> &&
730  std::EqualityComparableWith<gap, reference_t<decltype(std::get<1>(align))>>),
731  "The align object must be a std::pair of two ranges whose "
732  "value_type is comparable to seqan3::gap");
733 
734  static_assert((std::ranges::ForwardRange<qual_type> &&
735  Alphabet<reference_t<qual_type>>),
736  "The qual object must be a std::ranges::ForwardRange "
737  "over letters that model seqan3::Alphabet.");
738 
739  static_assert(TupleLike<remove_cvref_t<mate_type>>,
740  "The mate object must be a std::tuple of size 3 with "
741  "1) a std::ranges::ForwardRange with a value_type modelling seqan3::Alphabet, "
742  "2) a std::Integral or std::optional<std::Integral>, and "
743  "3) a std::Integral.");
744 
745  static_assert(((std::ranges::ForwardRange<decltype(std::get<0>(mate))> ||
746  std::Integral<remove_cvref_t<decltype(std::get<0>(mate))>> ||
747  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
748  (std::Integral<remove_cvref_t<decltype(std::get<1>(mate))>> ||
749  detail::is_type_specialisation_of_v<remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
750  std::Integral<remove_cvref_t<decltype(std::get<2>(mate))>>),
751  "The mate object must be a std::tuple of size 3 with "
752  "1) a std::ranges::ForwardRange with a value_type modelling seqan3::Alphabet, "
753  "2) a std::Integral or std::optional<std::Integral>, and "
754  "3) a std::Integral.");
755 
756  static_assert(std::Same<remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
757  "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
758 
759  if constexpr (detail::decays_to_ignore_v<header_type>)
760  {
761  throw format_error{"BAM can only be written with a header but you did not provide enough information! "
762  "You can either construct the output file with ref_ids and ref_seqs information and "
763  "the header will be created for you, or you can access the `header` member directly."};
764  }
765  else
766  {
767  // ---------------------------------------------------------------------
768  // logical Requirements
769  // ---------------------------------------------------------------------
770 
771  if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
772  throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
773 
774  seqan3::ostreambuf_iterator stream_it{stream};
775 
776  // ---------------------------------------------------------------------
777  // Writing the Header on first call
778  // ---------------------------------------------------------------------
779  if (!written_header)
780  {
781  stream << "BAM\1";
783  write_header(os, options, header); // write header to temporary stream to query the size.
784  int32_t l_text{static_cast<int32_t>(os.str().size())};
785  std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
786 
787  stream << os.str();
788 
789  int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
790  std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
791 
792  for (int32_t ridx = 0; ridx < n_ref; ++ridx)
793  {
794  int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
795  std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
796  // write reference name:
797  std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
798  stream_it = '\0';
799  // write reference sequence length:
800  std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
801  }
802 
803  written_header = true;
804  }
805 
806  // ---------------------------------------------------------------------
807  // Writing the Record
808  // ---------------------------------------------------------------------
809 
810  // compute possible distance from alignment end to sequence end
811  // which indicates soft clipping at the end.
812  // This should be replaced by a free count_gaps function for
813  // aligned sequences which is more efficient if possible.
814  auto off_end{std::ranges::distance(seq) - offset};
815 
816  for (auto chr : get<1>(align))
817  if (chr == gap{})
818  ++off_end;
819 
820  off_end -= std::ranges::distance(get<1>(align));
821 
822  std::vector<std::pair<char, size_t>> cigar = detail::get_cigar_vector(align, offset, off_end);
823 
824  if (cigar.size() > 65535) // cannot be represented with 16bits, must be written into the sam tag CG
825  {
826  tag_dict["CG"_tag] = detail::get_cigar_string(align, offset, off_end);
827  cigar.resize(2);
828  cigar[0] = {'S', std::ranges::distance(seq)};
829  cigar[1] = {'N', std::ranges::distance(get<1>(align))};
830  }
831 
832  std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
833 
834  alignment_record_core core
835  {
836  /* block_size */ 0, // will be initialised right after
837  /* refID */ -1, // will be initialised right after
838  /* pos */ ref_offset.value_or(-1),
839  /* l_read_name */ static_cast<uint8_t>(std::ranges::distance(id) + 1),
840  /* mapq */ mapq,
841  /* bin */ reg2bin(ref_offset.value_or(-1), std::ranges::distance(get<1>(align))),
842  /* n_cigar_op */ static_cast<uint16_t>(cigar.size()),
843  /* flag */ flag,
844  /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
845  /* next_refId */ -1, // will be initialised right after
846  /* next_pos */ get<1>(mate).value_or(-1),
847  /* tlen */ get<2>(mate)
848  };
849 
850  auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
851  [[maybe_unused]] auto & id_target)
852  {
854 
855  if constexpr (!detail::decays_to_ignore_v<id_t>)
856  {
857  if constexpr (std::Integral<id_t>)
858  {
859  id_target = id_source;
860  }
861  else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
862  {
863  id_target = id_source.value_or(-1);
864  }
865  else
866  {
867  if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
868  {
869  auto id_it = header.ref_dict.find(id_source);
870 
871  if (id_it == header.ref_dict.end())
872  {
873  throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
874  "not be found in BAM header ref_dict: ",
875  header.ref_dict, ".")};
876  }
877  id_target = id_it->second;
878  }
879  }
880  }
881  };
882 
883  // initialise core.refID
884  check_and_assign_id_to(ref_id, core.refID);
885 
886  // initialise core.next_refID
887  check_and_assign_id_to(get<0>(mate), core.next_refID);
888 
889  // initialise core.block_size
890  core.block_size = sizeof(core) - 4/*block_size excluded*/ +
891  core.l_read_name +
892  core.n_cigar_op * 4 + // each int32_t has 4 bytes
893  (core.l_seq + 1) / 2 + // bitcompressed seq
894  core.l_seq + // quality string
895  tag_dict_binary_str.size();
896 
897  std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
898 
899  std::ranges::copy_n(std::ranges::begin(id), std::ranges::distance(id), stream_it); // write read id
900  stream_it = '\0';
901 
902  // write cigar
903  for (auto [cigar_op, cigar_count] : cigar)
904  {
905  cigar_count = cigar_count << 4;
906  cigar_count |= static_cast<int32_t>(char_to_sam_rank[cigar_op]);
907  std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
908  }
909 
910  // write seq (bit-compressed: sam_dna16 characters go into one byte)
911  using alph_t = value_type_t<seq_type>;
912  constexpr auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
913 
914  auto sit = std::ranges::begin(seq);
915  for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
916  {
917  uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
918  ++sidx, ++sit;
919  compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
920  stream_it = static_cast<char>(compressed_chr);
921  }
922 
923  if (core.l_seq & 1) // write one more
924  stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
925 
926  // write qual
927  if (std::ranges::empty(qual))
928  {
929  auto v = view::repeat_n(static_cast<char>(255), core.l_seq);
930  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
931  }
932  else
933  {
934  assert(static_cast<int32_t>(std::ranges::distance(qual)) == core.l_seq);
935  auto v = qual | std::view::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
936  std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
937  }
938 
939  // write optional fields
940  stream << tag_dict_binary_str;
941  } // if constexpr (!detail::decays_to_ignore_v<header_type>)
942  }
943 
945  static constexpr std::array<uint8_t, 256> char_to_sam_rank
946  {
947  [] () constexpr
948  {
950 
951  using index_t = std::make_unsigned_t<char>;
952 
953  // ret['M'] = 0; set anyway by initialization
954  ret[static_cast<index_t>('I')] = 1;
955  ret[static_cast<index_t>('D')] = 2;
956  ret[static_cast<index_t>('N')] = 3;
957  ret[static_cast<index_t>('S')] = 4;
958  ret[static_cast<index_t>('H')] = 5;
959  ret[static_cast<index_t>('P')] = 6;
960  ret[static_cast<index_t>('=')] = 7;
961  ret[static_cast<index_t>('X')] = 8;
962 
963  return ret;
964  }()
965  };
966 
970  static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict)
971  {
972  std::string result{};
973 
974  auto stream_variant_fn = [&result] (auto && arg) // helper to print an std::variant
975  {
976  // T is either char, int32_t, float, std::string, or a std::vector<some int>
977  using T = remove_cvref_t<decltype(arg)>;
978 
979  if constexpr (std::Same<T, int32_t>)
980  {
981  // always choose the smallest possible representation [cCsSiI]
982  bool negative = arg < 0;
983  auto n = __builtin_ctz(detail::next_power_of_two(((negative) ? arg * -1 : arg) + 1) >> 1) / 8;
984  n = n * n + 2 * negative; // for switch case order
985 
986  switch (n)
987  {
988  case 0:
989  {
990  result[result.size() - 1] = 'C';
991  result.append(reinterpret_cast<char const *>(&arg), 1);
992  break;
993  }
994  case 1:
995  {
996  result[result.size() - 1] = 'S';
997  result.append(reinterpret_cast<char const *>(&arg), 2);
998  break;
999  }
1000  case 2:
1001  {
1002  result[result.size() - 1] = 'c';
1003  int8_t tmp = static_cast<int8_t>(arg);
1004  result.append(reinterpret_cast<char const *>(&tmp), 1);
1005  break;
1006  }
1007  case 3:
1008  {
1009  result[result.size() - 1] = 's';
1010  int16_t tmp = static_cast<int16_t>(arg);
1011  result.append(reinterpret_cast<char const *>(&tmp), 2);
1012  break;
1013  }
1014  default:
1015  {
1016  result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1017  break;
1018  }
1019  }
1020  }
1021  else if constexpr (std::Same<T, std::string>)
1022  {
1023  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1024  }
1025  else if constexpr (!std::ranges::Range<T>) // char, float
1026  {
1027  result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1028  }
1029  else // std::vector of some Arithmetic_type type
1030  {
1031  int32_t sz{static_cast<int32_t>(arg.size())};
1032  result.append(reinterpret_cast<char *>(&sz), 4);
1033  result.append(reinterpret_cast<char const *>(arg.data()), arg.size() * sizeof(value_type_t<T>));
1034  }
1035  };
1036 
1037  for (auto & [tag, variant] : tag_dict)
1038  {
1039  result.push_back(static_cast<char>(tag / 256));
1040  result.push_back(static_cast<char>(tag % 256));
1041 
1042  result.push_back(detail::sam_tag_type_char[variant.index()]);
1043 
1044  if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1045  result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1046 
1047  std::visit(stream_variant_fn, variant);
1048  }
1049 
1050  return result;
1051  }
1052 
1054  static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
1055  {
1056  --end;
1057  if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
1058  if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
1059  if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
1060  if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
1061  if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
1062  return 0;
1063  }
1064 };
1065 
1066 } // namespace seqan3::detail
Defines the requirements of a type that allows iteration over its elements by providing a begin itera...
::ranges::equal equal
Alias for ranges::equal. Determines if two sets of elements are the same.
Definition: algorithm:54
The BAM format.
Definition: format_bam.hpp:59
Provides concepts for core language types and relations that don&#39;t have concepts in C++20 (yet)...
::ranges::distance distance
Alias for ranges::distance. Returns the number of hops from first to last.
Definition: iterator:321
T visit(T... args)
void assign_unaligned(aligned_seq_t &aligned_seq, unaligned_sequence_type &&unaligned_seq)
An implementation of seqan3::AlignedSequence::assign_unaligned_sequence for sequence containers...
Definition: aligned_sequence_concept.hpp:345
::ranges::next next
Alias for ranges::next. Returns the nth successor of the given iterator.
Definition: iterator:331
::ranges::subrange< it_t, sen_t, k > subrange
Create a view from a pair of iterator and sentinel.
Definition: ranges:339
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
T tie(T... args)
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T index(T... args)
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:103
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:56
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Provides seqan3::type_list and auxiliary type traits.
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed (type trait).
Definition: basic.hpp:35
::ranges::copy_n copy_n
Alias for ranges::copy_n. Copies a range of exactly n elements to a new location. ...
Definition: algorithm:49
SeqAn specific customisations in the standard namespace.
constexpr auto all
A view adaptor that behaves like std::view:all, but type erases contiguous ranges.
Definition: view_all.hpp:160
The main SeqAn3 namespace.
Auxiliary for pretty printing of exception messages.
Auxiliary functions for the alignment IO.
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:144
Provides seqan3::alignment_file_output_options.
T resize(T... args)
T min(T... args)
T push_back(T... args)
Provides seqan3::AlignmentFileInputFormat and auxiliary classes.
Provides seqan3::sam_dna16.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides seqan3::TupleLike.
Provides the seqan3::alignment_file_header class.
Provides various utility functions.
Provides various utility functions.
The Concepts library.
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::advance advance
Alias for ranges::advance. Advances the iterator by the given distance.
Definition: iterator:316
Provides seqan3::alignment_file_input_options.
T str(T... args)
T align(T... args)
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
T tuple_size_v
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:63
T count(T... args)
Provides the seqan3::format_sam tag and the seqan3::alignment_file_input_format and seqan3::alignment...
Definition: aligned_sequence_concept.hpp:35
T size(T... args)
Provides character predicates for tokenisation.
Requires std::detail::WeaklyEqualityComparableWitht<t1,t2>, but also that t1 and t2, as well as their common_reference_t satisfy std::EqualityComparable.
Provides seqan3::AlignmentFileOutputFormat and auxiliary classes.
Provides seqan3::view::slice.
Provides utility functions for bit twiddling.
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:194
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:97
Provides various transformation traits used by the range module.
typename reference< t >::type reference_t
Shortcut for seqan3::reference (TransformationTrait shortcut).
Definition: pre.hpp:77
Specifies requirements of a Range type for which begin returns a type that models std::ForwardIterato...
Static reflection for arbitrary types.
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
auto constexpr take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:627
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179
constexpr auto transform
A range adaptor that takes a invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.
The concept Integral is satisfied if and only if T is an integral type.
T reserve(T... args)
auto constexpr take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:94