SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
49class format_bam : private detail::format_sam_base
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67
68protected:
69 template <typename stream_type, // constraints checked by file
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
74 typename seq_type,
75 typename id_type,
76 typename offset_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename align_type,
81 typename cigar_type,
82 typename flag_type,
83 typename mapq_type,
84 typename qual_type,
85 typename mate_type,
86 typename tag_dict_type,
87 typename e_value_type,
88 typename bit_score_type>
89 void read_alignment_record(stream_type & stream,
90 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
91 ref_seqs_type & ref_seqs,
93 stream_pos_type & position_buffer,
94 seq_type & seq,
95 qual_type & qual,
96 id_type & id,
97 offset_type & offset,
98 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99 ref_id_type & ref_id,
100 ref_offset_type & ref_offset,
101 align_type & align,
102 cigar_type & cigar_vector,
103 flag_type & flag,
104 mapq_type & mapq,
105 mate_type & mate,
106 tag_dict_type & tag_dict,
107 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
108 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
109
110 template <typename stream_type,
111 typename header_type,
112 typename seq_type,
113 typename id_type,
114 typename ref_seq_type,
115 typename ref_id_type,
116 typename align_type,
117 typename cigar_type,
118 typename qual_type,
119 typename mate_type,
120 typename tag_dict_type>
121 void write_alignment_record([[maybe_unused]] stream_type & stream,
122 [[maybe_unused]] sam_file_output_options const & options,
123 [[maybe_unused]] header_type && header,
124 [[maybe_unused]] seq_type && seq,
125 [[maybe_unused]] qual_type && qual,
126 [[maybe_unused]] id_type && id,
127 [[maybe_unused]] int32_t const offset,
128 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
129 [[maybe_unused]] ref_id_type && ref_id,
130 [[maybe_unused]] std::optional<int32_t> ref_offset,
131 [[maybe_unused]] align_type && align,
132 [[maybe_unused]] cigar_type && cigar_vector,
133 [[maybe_unused]] sam_flag const flag,
134 [[maybe_unused]] uint8_t const mapq,
135 [[maybe_unused]] mate_type && mate,
136 [[maybe_unused]] tag_dict_type && tag_dict,
137 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
138 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
139
140private:
142 bool header_was_read{false};
143
145 std::string string_buffer{};
146
148 struct alignment_record_core
149 { // naming corresponds to official SAM/BAM specifications
150 int32_t block_size;
151 int32_t refID;
152 int32_t pos;
153 uint32_t l_read_name : 8;
154 uint32_t mapq : 8;
155 uint32_t bin : 16;
156 uint32_t n_cigar_op : 16;
157 sam_flag flag;
158 int32_t l_seq;
159 int32_t next_refID;
160 int32_t next_pos;
161 int32_t tlen;
162 };
163
165 static constexpr std::array<uint8_t, 256> char_to_sam_rank{[]() constexpr {std::array<uint8_t, 256> ret{};
166
167 using index_t = std::make_unsigned_t<char>;
168
169 // ret['M'] = 0; set anyway by initialization
170 ret[static_cast<index_t>('I')] = 1;
171 ret[static_cast<index_t>('D')] = 2;
172 ret[static_cast<index_t>('N')] = 3;
173 ret[static_cast<index_t>('S')] = 4;
174 ret[static_cast<index_t>('H')] = 5;
175 ret[static_cast<index_t>('P')] = 6;
176 ret[static_cast<index_t>('=')] = 7;
177 ret[static_cast<index_t>('X')] = 8;
178
179 return ret;
180}()
181}; // namespace seqan3
182
184static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
185{
186 --end;
187 if (beg >> 14 == end >> 14)
188 return ((1 << 15) - 1) / 7 + (beg >> 14);
189 if (beg >> 17 == end >> 17)
190 return ((1 << 12) - 1) / 7 + (beg >> 17);
191 if (beg >> 20 == end >> 20)
192 return ((1 << 9) - 1) / 7 + (beg >> 20);
193 if (beg >> 23 == end >> 23)
194 return ((1 << 6) - 1) / 7 + (beg >> 23);
195 if (beg >> 26 == end >> 26)
196 return ((1 << 3) - 1) / 7 + (beg >> 26);
197 return 0;
198}
199
206template <typename stream_view_type, std::integral number_type>
207void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
208{
209 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
210}
211
217template <typename stream_view_type>
218void read_float_byte_field(stream_view_type && stream_view, float & target)
219{
220 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
221}
222
223template <typename stream_view_type, typename value_type>
224void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
225 stream_view_type && stream_view,
226 value_type const & SEQAN3_DOXYGEN_ONLY(value));
227
228template <typename stream_view_type>
229void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
230
231template <typename cigar_input_type>
232auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
233
234static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
235}
236;
237
239template <typename stream_type, // constraints checked by file
240 typename seq_legal_alph_type,
241 typename ref_seqs_type,
242 typename ref_ids_type,
243 typename stream_pos_type,
244 typename seq_type,
245 typename id_type,
246 typename offset_type,
247 typename ref_seq_type,
248 typename ref_id_type,
249 typename ref_offset_type,
250 typename align_type,
251 typename cigar_type,
252 typename flag_type,
253 typename mapq_type,
254 typename qual_type,
255 typename mate_type,
256 typename tag_dict_type,
257 typename e_value_type,
258 typename bit_score_type>
259inline void
261 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
262 ref_seqs_type & ref_seqs,
264 stream_pos_type & position_buffer,
265 seq_type & seq,
266 qual_type & qual,
267 id_type & id,
268 offset_type & offset,
269 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
270 ref_id_type & ref_id,
271 ref_offset_type & ref_offset,
272 align_type & align,
273 cigar_type & cigar_vector,
274 flag_type & flag,
275 mapq_type & mapq,
276 mate_type & mate,
277 tag_dict_type & tag_dict,
278 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
279 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
280{
281 static_assert(detail::decays_to_ignore_v<ref_offset_type>
282 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
283 "The ref_offset must be a specialisation of std::optional.");
284
285 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
286 "The type of field::mapq must be uint8_t.");
287
288 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
289 "The type of field::flag must be seqan3::sam_flag.");
290
291 auto stream_view = seqan3::detail::istreambuf(stream);
292
293 // these variables need to be stored to compute the ALIGNMENT
294 [[maybe_unused]] int32_t offset_tmp{};
295 [[maybe_unused]] int32_t soft_clipping_end{};
296 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
297 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
298
299 // Header
300 // -------------------------------------------------------------------------------------------------------------
301 if (!header_was_read)
302 {
303 // magic BAM string
304 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
305 throw format_error{"File is not in BAM format."};
306
307 int32_t l_text{}; // length of header text including \0 character
308 int32_t n_ref{}; // number of reference sequences
309 int32_t l_name{}; // 1 + length of reference name including \0 character
310 int32_t l_ref{}; // length of reference sequence
311
312 read_integral_byte_field(stream_view, l_text);
313
314 if (l_text > 0) // header text is present
315 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
316
317 read_integral_byte_field(stream_view, n_ref);
318
319 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
320 {
321 read_integral_byte_field(stream_view, l_name);
322
323 string_buffer.resize(l_name - 1);
325 l_name - 1,
326 string_buffer.data()); // copy without \0 character
327 ++std::ranges::begin(stream_view); // skip \0 character
328
329 read_integral_byte_field(stream_view, l_ref);
330
331 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
332 {
333 // If there was no header text, we parse reference sequences block as header information
334 if (l_text == 0)
335 {
336 auto & reference_ids = header.ref_ids();
337 // put the length of the reference sequence into ref_id_info
338 header.ref_id_info.emplace_back(l_ref, "");
339 // put the reference name into reference_ids
340 reference_ids.push_back(string_buffer);
341 // assign the reference name an ascending reference id (starts at index 0).
342 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
343 continue;
344 }
345 }
346
347 auto id_it = header.ref_dict.find(string_buffer);
348
349 // sanity checks of reference information to existing header object:
350 if (id_it == header.ref_dict.end()) // [unlikely]
351 {
352 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
353 + "' found in BAM file header (header.ref_ids():",
354 header.ref_ids(),
355 ").")};
356 }
357 else if (id_it->second != ref_idx) // [unlikely]
358 {
359 throw format_error{detail::to_string("Reference id '",
360 string_buffer,
361 "' at position ",
362 ref_idx,
363 " does not correspond to the position ",
364 id_it->second,
365 " in the header (header.ref_ids():",
366 header.ref_ids(),
367 ").")};
368 }
369 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
370 {
371 throw format_error{"Provided reference has unequal length as specified in the header."};
372 }
373 }
374
375 header_was_read = true;
376
377 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
378 return;
379 }
380
381 // read alignment record into buffer
382 // -------------------------------------------------------------------------------------------------------------
383 position_buffer = stream.tellg();
384 alignment_record_core core;
385 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
386
387 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
388 {
389 throw format_error{detail::to_string("Reference id index '",
390 core.refID,
391 "' is not in range of ",
392 "header.ref_ids(), which has size ",
393 header.ref_ids().size(),
394 ".")};
395 }
396 else if (core.refID > -1) // not unmapped
397 {
398 ref_id = core.refID; // field::ref_id
399 }
400
401 flag = core.flag; // field::flag
402 mapq = core.mapq; // field::mapq
403
404 if (core.pos > -1) // [[likely]]
405 ref_offset = core.pos; // field::ref_offset
406
407 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
408 {
409 if (core.next_refID > -1)
410 get<0>(mate) = core.next_refID;
411
412 if (core.next_pos > -1) // [[likely]]
413 get<1>(mate) = core.next_pos;
414
415 get<2>(mate) = core.tlen;
416 }
417
418 // read id
419 // -------------------------------------------------------------------------------------------------------------
420 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
421 if constexpr (!detail::decays_to_ignore_v<id_type>)
422 read_forward_range_field(id_view, id); // field::id
423 else
424 detail::consume(id_view);
425 ++std::ranges::begin(stream_view); // skip '\0'
426
427 // read cigar string
428 // -------------------------------------------------------------------------------------------------------------
429 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
430 {
431 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
432 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
433 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
434 }
435 else
436 {
437 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
438 }
439
440 offset = offset_tmp;
441
442 // read sequence
443 // -------------------------------------------------------------------------------------------------------------
444 if (core.l_seq > 0) // sequence information is given
445 {
446 auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
449 {
450 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
451 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
452 });
453
454 if constexpr (detail::decays_to_ignore_v<seq_type>)
455 {
456 auto skip_sequence_bytes = [&]()
457 {
458 detail::consume(seq_stream);
459 if (core.l_seq & 1)
460 ++std::ranges::begin(stream_view);
461 };
462
463 if constexpr (!detail::decays_to_ignore_v<align_type>)
464 {
466 "If you want to read ALIGNMENT but not SEQ, the alignment"
467 " object must store a sequence container at the second (query) position.");
468
469 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
470 {
471 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
472 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
473 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
474
475 get<1>(align).reserve(seq_length);
476
477 auto tmp_iter = std::ranges::begin(seq_stream);
478 std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
479
480 if (offset_tmp & 1)
481 {
482 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
483 ++tmp_iter;
484 --seq_length;
485 }
486
487 for (size_t i = (seq_length / 2); i > 0; --i)
488 {
489 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
490 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
491 ++tmp_iter;
492 }
493
494 if (seq_length & 1)
495 {
496 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
497 ++tmp_iter;
498 --soft_clipping_end;
499 }
500
501 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
502 }
503 else
504 {
505 skip_sequence_bytes();
506 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
507 }
508 }
509 else
510 {
511 skip_sequence_bytes();
512 }
513 }
514 else
515 {
516 using alph_t = std::ranges::range_value_t<decltype(seq)>;
517 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
518
519 for (auto [d1, d2] : seq_stream)
520 {
521 seq.push_back(from_dna16[to_rank(d1)]);
522 seq.push_back(from_dna16[to_rank(d2)]);
523 }
524
525 if (core.l_seq & 1)
526 {
527 dna16sam d =
528 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
529 seq.push_back(from_dna16[to_rank(d)]);
530 ++std::ranges::begin(stream_view);
531 }
532
533 if constexpr (!detail::decays_to_ignore_v<align_type>)
534 {
535 assign_unaligned(get<1>(align),
536 seq
537 | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
538 std::ranges::distance(seq) - soft_clipping_end));
539 }
540 }
541 }
542
543 // read qual string
544 // -------------------------------------------------------------------------------------------------------------
545 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
547 [](char chr)
548 {
549 return static_cast<char>(chr + 33);
550 });
551 if constexpr (!detail::decays_to_ignore_v<qual_type>)
552 read_forward_range_field(qual_view, qual);
553 else
554 detail::consume(qual_view);
555
556 // All remaining optional fields if any: SAM tags dictionary
557 // -------------------------------------------------------------------------------------------------------------
558 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
559 - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
560 assert(remaining_bytes >= 0);
561 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
562
563 while (tags_view.size() > 0)
564 {
565 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
566 read_sam_dict_field(tags_view, tag_dict);
567 else
568 detail::consume(tags_view);
569 }
570
571 // DONE READING - wrap up
572 // -------------------------------------------------------------------------------------------------------------
573 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
574 {
575 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
576 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
577 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
578 if (core.l_seq != 0 && offset_tmp == core.l_seq)
579 {
580 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
581 { // maybe only throw in debug mode and otherwise return an empty alignment?
582 throw format_error{
583 detail::to_string("The cigar string '",
584 offset_tmp,
585 "S",
586 ref_length,
587 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
588 "stored in the optional field CG. You need to read in the field::tags and "
589 "field::seq in order to access this information.")};
590 }
591 else
592 {
593 auto it = tag_dict.find("CG"_tag);
594
595 if (it == tag_dict.end())
596 throw format_error{detail::to_string(
597 "The cigar string '",
598 offset_tmp,
599 "S",
600 ref_length,
601 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
602 "stored in the optional field CG but this tag is not present in the given ",
603 "record.")};
604
605 auto cigar_view = std::views::all(std::get<std::string>(it->second));
606 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
607 offset_tmp = soft_clipping_end = 0;
608 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
609 tag_dict.erase(it); // remove redundant information
610
611 if constexpr (!detail::decays_to_ignore_v<align_type>)
612 {
613 assign_unaligned(
614 get<1>(align),
615 seq
616 | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
617 std::ranges::distance(seq) - soft_clipping_end));
618 }
619 }
620 }
621 }
622
623 // Alignment object construction
624 if constexpr (!detail::decays_to_ignore_v<align_type>)
625 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
626
627 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
628 std::swap(cigar_vector, tmp_cigar_vector);
629}
630
632template <typename stream_type,
633 typename header_type,
634 typename seq_type,
635 typename id_type,
636 typename ref_seq_type,
637 typename ref_id_type,
638 typename align_type,
639 typename cigar_type,
640 typename qual_type,
641 typename mate_type,
642 typename tag_dict_type>
643inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
644 [[maybe_unused]] sam_file_output_options const & options,
645 [[maybe_unused]] header_type && header,
646 [[maybe_unused]] seq_type && seq,
647 [[maybe_unused]] qual_type && qual,
648 [[maybe_unused]] id_type && id,
649 [[maybe_unused]] int32_t const offset,
650 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
651 [[maybe_unused]] ref_id_type && ref_id,
652 [[maybe_unused]] std::optional<int32_t> ref_offset,
653 [[maybe_unused]] align_type && align,
654 [[maybe_unused]] cigar_type && cigar_vector,
655 [[maybe_unused]] sam_flag const flag,
656 [[maybe_unused]] uint8_t const mapq,
657 [[maybe_unused]] mate_type && mate,
658 [[maybe_unused]] tag_dict_type && tag_dict,
659 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
660 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
661{
662 // ---------------------------------------------------------------------
663 // Type Requirements (as static asserts for user friendliness)
664 // ---------------------------------------------------------------------
665 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
666 "The seq object must be a std::ranges::forward_range over "
667 "letters that model seqan3::alphabet.");
668
669 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
670 "The id object must be a std::ranges::forward_range over "
671 "letters that model seqan3::alphabet.");
672
673 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
674 "The ref_seq object must be a std::ranges::forward_range "
675 "over letters that model seqan3::alphabet.");
676
677 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
678 {
679 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
680 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
681 "The ref_id object must be a std::ranges::forward_range "
682 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
683 }
684
686 "The align object must be a std::pair of two ranges whose "
687 "value_type is comparable to seqan3::gap");
688
689 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
690 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
691 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
692 "The align object must be a std::pair of two ranges whose "
693 "value_type is comparable to seqan3::gap");
694
695 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
696 "The qual object must be a std::ranges::forward_range "
697 "over letters that model seqan3::alphabet.");
698
700 "The mate object must be a std::tuple of size 3 with "
701 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
702 "2) a std::integral or std::optional<std::integral>, and "
703 "3) a std::integral.");
704
705 static_assert(
706 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
707 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
708 || detail::is_type_specialisation_of_v<
709 std::remove_cvref_t<decltype(std::get<0>(mate))>,
710 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
711 || detail::is_type_specialisation_of_v<
712 std::remove_cvref_t<decltype(std::get<1>(mate))>,
713 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
714 "The mate object must be a std::tuple of size 3 with "
715 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
716 "2) a std::integral or std::optional<std::integral>, and "
717 "3) a std::integral.");
718
719 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
720 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
721
722 if constexpr (detail::decays_to_ignore_v<header_type>)
723 {
724 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
725 "You can either construct the output file with ref_ids and ref_seqs information and "
726 "the header will be created for you, or you can access the `header` member directly."};
727 }
728 else
729 {
730 // ---------------------------------------------------------------------
731 // logical Requirements
732 // ---------------------------------------------------------------------
733
734 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
735 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
736
737 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
738
739 // ---------------------------------------------------------------------
740 // Writing the BAM Header on first call
741 // ---------------------------------------------------------------------
742 if (!header_was_written)
743 {
744 stream << "BAM\1";
746 write_header(os, options, header); // write SAM header to temporary stream to query the size.
747 int32_t l_text{static_cast<int32_t>(os.str().size())};
748 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
749
750 stream << os.str();
751
752 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
753 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
754
755 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
756 {
757 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
758 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
759 // write reference name:
760 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
761 stream_it = '\0';
762 // write reference sequence length:
763 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
764 }
765
766 header_was_written = true;
767 }
768
769 // ---------------------------------------------------------------------
770 // Writing the Record
771 // ---------------------------------------------------------------------
772 int32_t ref_length{};
773
774 // if alignment is non-empty, replace cigar_vector.
775 // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
776 if (!std::ranges::empty(cigar_vector))
777 {
778 int32_t dummy_seq_length{};
779 for (auto & [count, operation] : cigar_vector)
780 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
781 }
782 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
783 {
784 ref_length = std::ranges::distance(get<1>(align));
785
786 // compute possible distance from alignment end to sequence end
787 // which indicates soft clipping at the end.
788 // This should be replaced by a free count_gaps function for
789 // aligned sequences which is more efficient if possible.
790 int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
791
792 for (auto chr : get<1>(align))
793 if (chr == gap{})
794 ++off_end;
795
796 off_end -= ref_length;
797 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
798 }
799
800 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
801 {
802 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
803 cigar_vector.resize(2);
804 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
805 cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
806 }
807
808 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
809
810 // Compute the value for the l_read_name field for the bam record.
811 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
812 // the data type to store the value is uint8_t and 255 is the maximal size.
813 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
814 // 2 bytes.
815 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
816 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
817
818 alignment_record_core core{/* block_size */ 0, // will be initialised right after
819 /* refID */ -1, // will be initialised right after
820 /* pos */ ref_offset.value_or(-1),
821 /* l_read_name */ read_name_size,
822 /* mapq */ mapq,
823 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
824 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
825 /* flag */ flag,
826 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
827 /* next_refId */ -1, // will be initialised right after
828 /* next_pos */ get<1>(mate).value_or(-1),
829 /* tlen */ get<2>(mate)};
830
831 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
832 {
833 using id_t = std::remove_reference_t<decltype(id_source)>;
834
835 if constexpr (!detail::decays_to_ignore_v<id_t>)
836 {
837 if constexpr (std::integral<id_t>)
838 {
839 id_target = id_source;
840 }
841 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
842 {
843 id_target = id_source.value_or(-1);
844 }
845 else
846 {
847 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
848 {
849 auto id_it = header.ref_dict.end();
850
851 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
852 && std::ranges::sized_range<decltype(id_source)>
853 && std::ranges::borrowed_range<decltype(id_source)>)
854 {
855 id_it = header.ref_dict.find(
856 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
857 }
858 else
859 {
860 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
861
862 static_assert(
863 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
864 "The ref_id type is not convertible to the reference id information stored in the "
865 "reference dictionary of the header object.");
866
867 id_it = header.ref_dict.find(id_source);
868 }
869
870 if (id_it == header.ref_dict.end())
871 {
872 throw format_error{detail::to_string("Unknown reference name '",
873 id_source,
874 "' could "
875 "not be found in BAM header ref_dict: ",
876 header.ref_dict,
877 ".")};
878 }
879
880 id_target = id_it->second;
881 }
882 }
883 }
884 };
885
886 // initialise core.refID
887 check_and_assign_id_to(ref_id, core.refID);
888
889 // initialise core.next_refID
890 check_and_assign_id_to(get<0>(mate), core.next_refID);
891
892 // initialise core.block_size
893 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
894 + // each int32_t has 4 bytes
895 (core.l_seq + 1) / 2 + // bitcompressed seq
896 core.l_seq + // quality string
897 tag_dict_binary_str.size();
898
899 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
900
901 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
902 stream_it = '*';
903 else
904 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
905 stream_it = '\0';
906
907 // write cigar
908 for (auto [cigar_count, op] : cigar_vector)
909 {
910 cigar_count = cigar_count << 4;
911 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
912 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
913 }
914
915 // write seq (bit-compressed: dna16sam characters go into one byte)
916 using alph_t = std::ranges::range_value_t<seq_type>;
917 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
918
919 auto sit = std::ranges::begin(seq);
920 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
921 {
922 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
923 ++sidx, ++sit;
924 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
925 stream_it = static_cast<char>(compressed_chr);
926 }
927
928 if (core.l_seq & 1) // write one more
929 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
930
931 // write qual
932 if (std::ranges::empty(qual))
933 {
934 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
935 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
936 }
937 else
938 {
939 if (std::ranges::distance(qual) != core.l_seq)
940 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
941 core.l_seq,
942 ". Got quality with size ",
943 std::ranges::distance(qual),
944 " instead.")};
945
946 auto v = qual
948 [](auto chr)
949 {
950 return static_cast<char>(to_rank(chr));
951 });
952 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
953 }
954
955 // write optional fields
956 stream << tag_dict_binary_str;
957 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
958}
959
961template <typename stream_view_type, typename value_type>
962inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
963 stream_view_type && stream_view,
964 value_type const & SEQAN3_DOXYGEN_ONLY(value))
965{
966 int32_t count;
967 read_integral_byte_field(stream_view, count); // read length of vector
968 std::vector<value_type> tmp_vector;
969 tmp_vector.reserve(count);
970
971 while (count > 0)
972 {
973 value_type tmp{};
974 if constexpr (std::integral<value_type>)
975 {
976 read_integral_byte_field(stream_view, tmp);
977 }
978 else if constexpr (std::same_as<value_type, float>)
979 {
980 read_float_byte_field(stream_view, tmp);
981 }
982 else
983 {
984 constexpr bool always_false = std::is_same_v<value_type, void>;
985 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
986 }
987 tmp_vector.push_back(std::move(tmp));
988 --count;
989 }
990 variant = std::move(tmp_vector);
991}
992
1010template <typename stream_view_type>
1011inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1012{
1013 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
1014 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1015 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
1016 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
1017 by the length (int32_t) of the array, followed by the values.
1018 */
1019 auto it = std::ranges::begin(stream_view);
1020 uint16_t tag = static_cast<uint16_t>(*it) << 8;
1021 ++it; // skip char read before
1022
1023 tag += static_cast<uint16_t>(*it);
1024 ++it; // skip char read before
1025
1026 char type_id = *it;
1027 ++it; // skip char read before
1028
1029 switch (type_id)
1030 {
1031 case 'A': // char
1032 {
1033 target[tag] = *it;
1034 ++it; // skip char that has been read
1035 break;
1036 }
1037 // all integer sizes are possible
1038 case 'c': // int8_t
1039 {
1040 int8_t tmp;
1041 read_integral_byte_field(stream_view, tmp);
1042 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1043 break;
1044 }
1045 case 'C': // uint8_t
1046 {
1047 uint8_t tmp;
1048 read_integral_byte_field(stream_view, tmp);
1049 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1050 break;
1051 }
1052 case 's': // int16_t
1053 {
1054 int16_t tmp;
1055 read_integral_byte_field(stream_view, tmp);
1056 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1057 break;
1058 }
1059 case 'S': // uint16_t
1060 {
1061 uint16_t tmp;
1062 read_integral_byte_field(stream_view, tmp);
1063 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1064 break;
1065 }
1066 case 'i': // int32_t
1067 {
1068 int32_t tmp;
1069 read_integral_byte_field(stream_view, tmp);
1070 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1071 break;
1072 }
1073 case 'I': // uint32_t
1074 {
1075 uint32_t tmp;
1076 read_integral_byte_field(stream_view, tmp);
1077 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1078 break;
1079 }
1080 case 'f': // float
1081 {
1082 float tmp;
1083 read_float_byte_field(stream_view, tmp);
1084 target[tag] = tmp;
1085 break;
1086 }
1087 case 'Z': // string
1088 {
1089 string_buffer.clear();
1090 while (!is_char<'\0'>(*it))
1091 {
1092 string_buffer.push_back(*it);
1093 ++it;
1094 }
1095 ++it; // skip \0
1096 target[tag] = string_buffer;
1097 break;
1098 }
1099 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1100 {
1101 std::vector<std::byte> byte_array;
1102 std::byte value;
1103 while (!is_char<'\0'>(*it))
1104 {
1105 string_buffer.clear();
1106 string_buffer.push_back(*it);
1107 ++it;
1108
1109 if (*it == '\0')
1110 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1111
1112 string_buffer.push_back(*it);
1113 ++it;
1114 read_byte_field(string_buffer, value);
1115 byte_array.push_back(value);
1116 }
1117 ++it; // skip \0
1118 target[tag] = byte_array;
1119 break;
1120 }
1121 case 'B': // Array. Value type depends on second char [cCsSiIf]
1122 {
1123 char array_value_type_id = *it;
1124 ++it; // skip char read before
1125
1126 switch (array_value_type_id)
1127 {
1128 case 'c': // int8_t
1129 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1130 break;
1131 case 'C': // uint8_t
1132 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1133 break;
1134 case 's': // int16_t
1135 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1136 break;
1137 case 'S': // uint16_t
1138 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1139 break;
1140 case 'i': // int32_t
1141 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1142 break;
1143 case 'I': // uint32_t
1144 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1145 break;
1146 case 'f': // float
1147 read_sam_dict_vector(target[tag], stream_view, float{});
1148 break;
1149 default:
1150 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1151 "must be one of [cCsSiIf] but '",
1152 array_value_type_id,
1153 "' was given.")};
1154 }
1155 break;
1156 }
1157 default:
1158 throw format_error{detail::to_string("The second character in the numerical id of a "
1159 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1160 type_id,
1161 "' was given.")};
1162 }
1163}
1164
1179template <typename cigar_input_type>
1180inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1181{
1182 std::vector<cigar> operations{};
1183 char operation{'\0'};
1184 uint32_t count{};
1185 int32_t ref_length{}, seq_length{};
1186 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1187 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1188 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1189
1190 if (n_cigar_op == 0) // [[unlikely]]
1191 return std::tuple{operations, ref_length, seq_length};
1192
1193 // parse the rest of the cigar
1194 // -------------------------------------------------------------------------------------------------------------
1195 while (n_cigar_op > 0) // until stream is not empty
1196 {
1198 sizeof(operation_and_count),
1199 reinterpret_cast<char *>(&operation_and_count));
1200 operation = cigar_mapping[operation_and_count & cigar_mask];
1201 count = operation_and_count >> 4;
1202
1203 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1204 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1205 --n_cigar_op;
1206 }
1207
1208 return std::tuple{operations, ref_length, seq_length};
1209}
1210
1214inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1215{
1216 std::string result{};
1217
1218 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1219 {
1220 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1221 using T = std::remove_cvref_t<decltype(arg)>;
1222
1223 if constexpr (std::same_as<T, int32_t>)
1224 {
1225 // always choose the smallest possible representation [cCsSiI]
1226 size_t const absolute_arg = std::abs(arg);
1227 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1228 bool const negative = arg < 0;
1229 n = n * n + 2 * negative; // for switch case order
1230
1231 switch (n)
1232 {
1233 case 0:
1234 {
1235 result[result.size() - 1] = 'C';
1236 result.append(reinterpret_cast<char const *>(&arg), 1);
1237 break;
1238 }
1239 case 1:
1240 {
1241 result[result.size() - 1] = 'S';
1242 result.append(reinterpret_cast<char const *>(&arg), 2);
1243 break;
1244 }
1245 case 2:
1246 {
1247 result[result.size() - 1] = 'c';
1248 int8_t tmp = static_cast<int8_t>(arg);
1249 result.append(reinterpret_cast<char const *>(&tmp), 1);
1250 break;
1251 }
1252 case 3:
1253 {
1254 result[result.size() - 1] = 's';
1255 int16_t tmp = static_cast<int16_t>(arg);
1256 result.append(reinterpret_cast<char const *>(&tmp), 2);
1257 break;
1258 }
1259 default:
1260 {
1261 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1262 break;
1263 }
1264 }
1265 }
1266 else if constexpr (std::same_as<T, std::string>)
1267 {
1268 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1269 }
1270 else if constexpr (!std::ranges::range<T>) // char, float
1271 {
1272 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1273 }
1274 else // std::vector of some arithmetic_type type
1275 {
1276 int32_t sz{static_cast<int32_t>(arg.size())};
1277 result.append(reinterpret_cast<char *>(&sz), 4);
1278 result.append(reinterpret_cast<char const *>(arg.data()),
1279 arg.size() * sizeof(std::ranges::range_value_t<T>));
1280 }
1281 };
1282
1283 for (auto & [tag, variant] : tag_dict)
1284 {
1285 result.push_back(static_cast<char>(tag / 256));
1286 result.push_back(static_cast<char>(tag % 256));
1287
1288 result.push_back(detail::sam_tag_type_char[variant.index()]);
1289
1290 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1291 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1292
1293 std::visit(stream_variant_fn, variant);
1294 }
1295
1296 return result;
1297}
1298
1299} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:98
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:643
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:260
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)