SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/bit>
16#include <iterator>
17#include <seqan3/std/ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
49class format_bam : private detail::format_sam_base
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67 {
68 { "bam" }
69 };
70
71protected:
72 template <typename stream_type, // constraints checked by file
73 typename seq_legal_alph_type,
74 typename ref_seqs_type,
75 typename ref_ids_type,
76 typename seq_type,
77 typename id_type,
78 typename offset_type,
79 typename ref_seq_type,
80 typename ref_id_type,
81 typename ref_offset_type,
82 typename align_type,
83 typename cigar_type,
84 typename flag_type,
85 typename mapq_type,
86 typename qual_type,
87 typename mate_type,
88 typename tag_dict_type,
89 typename e_value_type,
90 typename bit_score_type>
91 void read_alignment_record(stream_type & stream,
92 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
93 ref_seqs_type & ref_seqs,
95 seq_type & seq,
96 qual_type & qual,
97 id_type & id,
98 offset_type & offset,
99 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
100 ref_id_type & ref_id,
101 ref_offset_type & ref_offset,
102 align_type & align,
103 cigar_type & cigar_vector,
104 flag_type & flag,
105 mapq_type & mapq,
106 mate_type & mate,
107 tag_dict_type & tag_dict,
108 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
109 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
110
111 template <typename stream_type,
112 typename header_type,
113 typename seq_type,
114 typename id_type,
115 typename ref_seq_type,
116 typename ref_id_type,
117 typename align_type,
118 typename cigar_type,
119 typename qual_type,
120 typename mate_type,
121 typename tag_dict_type>
122 void write_alignment_record([[maybe_unused]] stream_type & stream,
123 [[maybe_unused]] sam_file_output_options const & options,
124 [[maybe_unused]] header_type && header,
125 [[maybe_unused]] seq_type && seq,
126 [[maybe_unused]] qual_type && qual,
127 [[maybe_unused]] id_type && id,
128 [[maybe_unused]] int32_t const offset,
129 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
130 [[maybe_unused]] ref_id_type && ref_id,
131 [[maybe_unused]] std::optional<int32_t> ref_offset,
132 [[maybe_unused]] align_type && align,
133 [[maybe_unused]] cigar_type && cigar_vector,
134 [[maybe_unused]] sam_flag const flag,
135 [[maybe_unused]] uint8_t const mapq,
136 [[maybe_unused]] mate_type && mate,
137 [[maybe_unused]] tag_dict_type && tag_dict,
138 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
139 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
140
141private:
143 bool header_was_read{false};
144
146 std::string string_buffer{};
147
149 struct alignment_record_core
150 { // naming corresponds to official SAM/BAM specifications
151 int32_t block_size;
152 int32_t refID;
153 int32_t pos;
154 uint32_t l_read_name:8;
155 uint32_t mapq:8;
156 uint32_t bin:16;
157 uint32_t n_cigar_op:16;
158 sam_flag flag;
159 int32_t l_seq;
160 int32_t next_refID;
161 int32_t next_pos;
162 int32_t tlen;
163 };
164
166 static constexpr std::array<uint8_t, 256> char_to_sam_rank
167 {
168 [] () constexpr
169 {
171
172 using index_t = std::make_unsigned_t<char>;
173
174 // ret['M'] = 0; set anyway by initialization
175 ret[static_cast<index_t>('I')] = 1;
176 ret[static_cast<index_t>('D')] = 2;
177 ret[static_cast<index_t>('N')] = 3;
178 ret[static_cast<index_t>('S')] = 4;
179 ret[static_cast<index_t>('H')] = 5;
180 ret[static_cast<index_t>('P')] = 6;
181 ret[static_cast<index_t>('=')] = 7;
182 ret[static_cast<index_t>('X')] = 8;
183
184 return ret;
185 }()
186 };
187
189 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190 {
191 --end;
192 if (beg >> 14 == end >> 14) return ((1 << 15) - 1) / 7 + (beg >> 14);
193 if (beg >> 17 == end >> 17) return ((1 << 12) - 1) / 7 + (beg >> 17);
194 if (beg >> 20 == end >> 20) return ((1 << 9) - 1) / 7 + (beg >> 20);
195 if (beg >> 23 == end >> 23) return ((1 << 6) - 1) / 7 + (beg >> 23);
196 if (beg >> 26 == end >> 26) return ((1 << 3) - 1) / 7 + (beg >> 26);
197 return 0;
198 }
199
200 using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
201
208 template <typename stream_view_type, std::integral number_type>
209 void read_field(stream_view_type && stream_view, number_type & target)
210 {
211 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
212 }
213
219 template <typename stream_view_type>
220 void read_field(stream_view_type && stream_view, float & target)
221 {
222 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
223 }
224
235 template <typename stream_view_type, typename optional_value_type>
236 void read_field(stream_view_type && stream_view, std::optional<optional_value_type> & target)
237 {
238 optional_value_type tmp;
239 read_field(std::forward<stream_view_type>(stream_view), tmp);
240 target = tmp;
241 }
242
243 template <typename stream_view_type, typename value_type>
244 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
245 stream_view_type && stream_view,
246 value_type const & SEQAN3_DOXYGEN_ONLY(value));
247
248 template <typename stream_view_type>
249 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
250
251 template <typename cigar_input_type>
252 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
253
254 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
255};
256
258template <typename stream_type, // constraints checked by file
259 typename seq_legal_alph_type,
260 typename ref_seqs_type,
261 typename ref_ids_type,
262 typename seq_type,
263 typename id_type,
264 typename offset_type,
265 typename ref_seq_type,
266 typename ref_id_type,
267 typename ref_offset_type,
268 typename align_type,
269 typename cigar_type,
270 typename flag_type,
271 typename mapq_type,
272 typename qual_type,
273 typename mate_type,
274 typename tag_dict_type,
275 typename e_value_type,
276 typename bit_score_type>
277inline void format_bam::read_alignment_record(stream_type & stream,
278 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
279 ref_seqs_type & ref_seqs,
281 seq_type & seq,
282 qual_type & qual,
283 id_type & id,
284 offset_type & offset,
285 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
286 ref_id_type & ref_id,
287 ref_offset_type & ref_offset,
288 align_type & align,
289 cigar_type & cigar_vector,
290 flag_type & flag,
291 mapq_type & mapq,
292 mate_type & mate,
293 tag_dict_type & tag_dict,
294 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
295 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
296{
297 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
298 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
299 "The ref_offset must be a specialisation of std::optional.");
300
301 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
302 "The type of field::mapq must be uint8_t.");
303
304 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
305 "The type of field::flag must be seqan3::sam_flag.");
306
307 auto stream_view = seqan3::detail::istreambuf(stream);
308
309 // these variables need to be stored to compute the ALIGNMENT
310 [[maybe_unused]] int32_t offset_tmp{};
311 [[maybe_unused]] int32_t soft_clipping_end{};
312 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
313 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
314
315 // Header
316 // -------------------------------------------------------------------------------------------------------------
317 if (!header_was_read)
318 {
319 // magic BAM string
320 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4), std::string_view{"BAM\1"}))
321 throw format_error{"File is not in BAM format."};
322
323 int32_t l_text{}; // length of header text including \0 character
324 int32_t n_ref{}; // number of reference sequences
325 int32_t l_name{}; // 1 + length of reference name including \0 character
326 int32_t l_ref{}; // length of reference sequence
327
328 read_field(stream_view, l_text);
329
330 if (l_text > 0) // header text is present
331 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
332
333 read_field(stream_view, n_ref);
334
335 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
336 {
337 read_field(stream_view, l_name);
338
339 string_buffer.resize(l_name - 1);
340 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.data()); // copy without \0 character
341 std::ranges::next(std::ranges::begin(stream_view)); // skip \0 character
342
343 read_field(stream_view, l_ref);
344
345 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
346 {
347 // If there was no header text, we parse reference sequences block as header information
348 if (l_text == 0)
349 {
350 auto & reference_ids = header.ref_ids();
351 // put the length of the reference sequence into ref_id_info
352 header.ref_id_info.emplace_back(l_ref, "");
353 // put the reference name into reference_ids
354 reference_ids.push_back(string_buffer);
355 // assign the reference name an ascending reference id (starts at index 0).
356 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
357 continue;
358 }
359 }
360
361 auto id_it = header.ref_dict.find(string_buffer);
362
363 // sanity checks of reference information to existing header object:
364 if (id_it == header.ref_dict.end()) // [unlikely]
365 {
366 throw format_error{detail::to_string("Unknown reference name '" + string_buffer +
367 "' found in BAM file header (header.ref_ids():",
368 header.ref_ids(), ").")};
369 }
370 else if (id_it->second != ref_idx) // [unlikely]
371 {
372 throw format_error{detail::to_string("Reference id '", string_buffer, "' at position ", ref_idx,
373 " does not correspond to the position ", id_it->second,
374 " in the header (header.ref_ids():", header.ref_ids(), ").")};
375 }
376 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
377 {
378 throw format_error{"Provided reference has unequal length as specified in the header."};
379 }
380 }
381
382 header_was_read = true;
383
384 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
385 return;
386 }
387
388 // read alignment record into buffer
389 // -------------------------------------------------------------------------------------------------------------
390 alignment_record_core core;
391 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
392
393 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
394 {
395 throw format_error{detail::to_string("Reference id index '", core.refID, "' is not in range of ",
396 "header.ref_ids(), which has size ", header.ref_ids().size(), ".")};
397 }
398 else if (core.refID > -1) // not unmapped
399 {
400 ref_id = core.refID; // field::ref_id
401 }
402
403 flag = core.flag; // field::flag
404 mapq = core.mapq; // field::mapq
405
406 if (core.pos > -1) // [[likely]]
407 ref_offset = core.pos; // field::ref_offset
408
409 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
410 {
411 if (core.next_refID > -1)
412 get<0>(mate) = core.next_refID;
413
414 if (core.next_pos > -1) // [[likely]]
415 get<1>(mate) = core.next_pos;
416
417 get<2>(mate) = core.tlen;
418 }
419
420 // read id
421 // -------------------------------------------------------------------------------------------------------------
422 read_field(stream_view | detail::take_exactly_or_throw(core.l_read_name - 1), id); // field::id
423 std::ranges::next(std::ranges::begin(stream_view)); // skip '\0'
424
425 // read cigar string
426 // -------------------------------------------------------------------------------------------------------------
427 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
428 {
429 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
430 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
431 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
432 }
433 else
434 {
435 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
436 }
437
438 offset = offset_tmp;
439
440 // read sequence
441 // -------------------------------------------------------------------------------------------------------------
442 if (core.l_seq > 0) // sequence information is given
443 {
444 auto seq_stream = stream_view
445 | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
447 {
448 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
449 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
450 });
451
452 if constexpr (detail::decays_to_ignore_v<seq_type>)
453 {
454 auto skip_sequence_bytes = [&] ()
455 {
456 detail::consume(seq_stream);
457 if (core.l_seq & 1)
458 std::ranges::next(std::ranges::begin(stream_view));
459 };
460
461 if constexpr (!detail::decays_to_ignore_v<align_type>)
462 {
464 "If you want to read ALIGNMENT but not SEQ, the alignment"
465 " object must store a sequence container at the second (query) position.");
466
467 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
468 {
469 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end)); // sanity check
470 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
471 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
472
473 get<1>(align).reserve(seq_length);
474
475 auto tmp_iter = std::ranges::begin(seq_stream);
476 std::ranges::advance(tmp_iter, offset_tmp / 2); // skip soft clipped bases at the beginning
477
478 if (offset_tmp & 1)
479 {
480 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
481 ++tmp_iter;
482 --seq_length;
483 }
484
485 for (size_t i = (seq_length / 2); i > 0; --i)
486 {
487 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
488 get<1>(align).push_back(from_dna16[to_rank(get<1>(*tmp_iter))]);
489 ++tmp_iter;
490 }
491
492 if (seq_length & 1)
493 {
494 get<1>(align).push_back(from_dna16[to_rank(get<0>(*tmp_iter))]);
495 ++tmp_iter;
496 --soft_clipping_end;
497 }
498
499 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
500 }
501 else
502 {
503 skip_sequence_bytes();
504 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // assign empty container
505 }
506 }
507 else
508 {
509 skip_sequence_bytes();
510 }
511 }
512 else
513 {
514 using alph_t = std::ranges::range_value_t<decltype(seq)>;
515 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
516
517 for (auto [d1, d2] : seq_stream)
518 {
519 seq.push_back(from_dna16[to_rank(d1)]);
520 seq.push_back(from_dna16[to_rank(d2)]);
521 }
522
523 if (core.l_seq & 1)
524 {
525 dna16sam d = dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
526 seq.push_back(from_dna16[to_rank(d)]);
527 std::ranges::next(std::ranges::begin(stream_view));
528 }
529
530 if constexpr (!detail::decays_to_ignore_v<align_type>)
531 {
532 assign_unaligned(get<1>(align),
533 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
534 std::ranges::distance(seq) - soft_clipping_end));
535 }
536 }
537 }
538
539 // read qual string
540 // -------------------------------------------------------------------------------------------------------------
541 read_field(stream_view | detail::take_exactly_or_throw(core.l_seq)
542 | std::views::transform([] (char chr) { return static_cast<char>(chr + 33); }), qual);
543
544 // All remaining optional fields if any: SAM tags dictionary
545 // -------------------------------------------------------------------------------------------------------------
546 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4/*block_size excluded*/) -
547 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
548 assert(remaining_bytes >= 0);
549 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
550
551 while (tags_view.size() > 0)
552 read_field(tags_view, tag_dict);
553
554 // DONE READING - wrap up
555 // -------------------------------------------------------------------------------------------------------------
556 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
557 {
558 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
559 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
560 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
561 if (core.l_seq != 0 && offset_tmp == core.l_seq)
562 {
563 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
564 { // maybe only throw in debug mode and otherwise return an empty alignment?
565 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
566 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
567 "stored in the optional field CG. You need to read in the field::tags and "
568 "field::seq in order to access this information.")};
569 }
570 else
571 {
572 auto it = tag_dict.find("CG"_tag);
573
574 if (it == tag_dict.end())
575 throw format_error{detail::to_string("The cigar string '", offset_tmp, "S", ref_length,
576 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
577 "stored in the optional field CG but this tag is not present in the given ",
578 "record.")};
579
580 auto cigar_view = std::views::all(std::get<std::string>(it->second));
581 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
582 offset_tmp = soft_clipping_end = 0;
583 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
584 tag_dict.erase(it); // remove redundant information
585
586 if constexpr (!detail::decays_to_ignore_v<align_type>)
587 {
588 assign_unaligned(get<1>(align),
589 seq | views::slice(static_cast<std::ranges::range_difference_t<seq_type>>(offset_tmp),
590 std::ranges::distance(seq) - soft_clipping_end));
591 }
592 }
593 }
594 }
595
596 // Alignment object construction
597 if constexpr (!detail::decays_to_ignore_v<align_type>)
598 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length); // inherited from SAM
599
600 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
601 std::swap(cigar_vector, tmp_cigar_vector);
602}
603
605template <typename stream_type,
606 typename header_type,
607 typename seq_type,
608 typename id_type,
609 typename ref_seq_type,
610 typename ref_id_type,
611 typename align_type,
612 typename cigar_type,
613 typename qual_type,
614 typename mate_type,
615 typename tag_dict_type>
616inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
617 [[maybe_unused]] sam_file_output_options const & options,
618 [[maybe_unused]] header_type && header,
619 [[maybe_unused]] seq_type && seq,
620 [[maybe_unused]] qual_type && qual,
621 [[maybe_unused]] id_type && id,
622 [[maybe_unused]] int32_t const offset,
623 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
624 [[maybe_unused]] ref_id_type && ref_id,
625 [[maybe_unused]] std::optional<int32_t> ref_offset,
626 [[maybe_unused]] align_type && align,
627 [[maybe_unused]] cigar_type && cigar_vector,
628 [[maybe_unused]] sam_flag const flag,
629 [[maybe_unused]] uint8_t const mapq,
630 [[maybe_unused]] mate_type && mate,
631 [[maybe_unused]] tag_dict_type && tag_dict,
632 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
633 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
634{
635 // ---------------------------------------------------------------------
636 // Type Requirements (as static asserts for user friendliness)
637 // ---------------------------------------------------------------------
638 static_assert((std::ranges::forward_range<seq_type> &&
640 "The seq object must be a std::ranges::forward_range over "
641 "letters that model seqan3::alphabet.");
642
643 static_assert((std::ranges::forward_range<id_type> &&
645 "The id object must be a std::ranges::forward_range over "
646 "letters that model seqan3::alphabet.");
647
648 static_assert((std::ranges::forward_range<ref_seq_type> &&
650 "The ref_seq object must be a std::ranges::forward_range "
651 "over letters that model seqan3::alphabet.");
652
653 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
654 {
655 static_assert((std::ranges::forward_range<ref_id_type> ||
656 std::integral<std::remove_reference_t<ref_id_type>> ||
657 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
658 "The ref_id object must be a std::ranges::forward_range "
659 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
660 }
661
663 "The align object must be a std::pair of two ranges whose "
664 "value_type is comparable to seqan3::gap");
665
666 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
667 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
668 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
669 "The align object must be a std::pair of two ranges whose "
670 "value_type is comparable to seqan3::gap");
671
672 static_assert((std::ranges::forward_range<qual_type> &&
674 "The qual object must be a std::ranges::forward_range "
675 "over letters that model seqan3::alphabet.");
676
678 "The mate object must be a std::tuple of size 3 with "
679 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
680 "2) a std::integral or std::optional<std::integral>, and "
681 "3) a std::integral.");
682
683 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
684 std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
685 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
686 (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
687 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
688 std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
689 "The mate object must be a std::tuple of size 3 with "
690 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
691 "2) a std::integral or std::optional<std::integral>, and "
692 "3) a std::integral.");
693
694 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
695 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
696
697 if constexpr (detail::decays_to_ignore_v<header_type>)
698 {
699 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
700 "You can either construct the output file with ref_ids and ref_seqs information and "
701 "the header will be created for you, or you can access the `header` member directly."};
702 }
703 else
704 {
705 // ---------------------------------------------------------------------
706 // logical Requirements
707 // ---------------------------------------------------------------------
708
709 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
710 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
711
712 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
713
714 // ---------------------------------------------------------------------
715 // Writing the BAM Header on first call
716 // ---------------------------------------------------------------------
717 if (!header_was_written)
718 {
719 stream << "BAM\1";
721 write_header(os, options, header); // write SAM header to temporary stream to query the size.
722 int32_t l_text{static_cast<int32_t>(os.str().size())};
723 std::ranges::copy_n(reinterpret_cast<char *>(&l_text), 4, stream_it); // write read id
724
725 stream << os.str();
726
727 int32_t n_ref{static_cast<int32_t>(header.ref_ids().size())};
728 std::ranges::copy_n(reinterpret_cast<char *>(&n_ref), 4, stream_it); // write read id
729
730 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
731 {
732 int32_t l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
733 std::ranges::copy_n(reinterpret_cast<char *>(&l_name), 4, stream_it); // write l_name
734 // write reference name:
735 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
736 stream_it = '\0';
737 // write reference sequence length:
738 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
739 }
740
741 header_was_written = true;
742 }
743
744 // ---------------------------------------------------------------------
745 // Writing the Record
746 // ---------------------------------------------------------------------
747 int32_t ref_length{};
748
749 // if alignment is non-empty, replace cigar_vector.
750 // else, compute the ref_length from given cigar_vector which is needed to fill field `bin`.
751 if (!std::ranges::empty(cigar_vector))
752 {
753 int32_t dummy_seq_length{};
754 for (auto & [count, operation] : cigar_vector)
755 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
756 }
757 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
758 {
759 ref_length = std::ranges::distance(get<1>(align));
760
761 // compute possible distance from alignment end to sequence end
762 // which indicates soft clipping at the end.
763 // This should be replaced by a free count_gaps function for
764 // aligned sequences which is more efficient if possible.
765 int32_t off_end{static_cast<int32_t>(std::ranges::distance(seq)) - offset};
766
767 for (auto chr : get<1>(align))
768 if (chr == gap{})
769 ++off_end;
770
771 off_end -= ref_length;
772 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
773 }
774
775 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
776 {
777 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
778 cigar_vector.resize(2);
779 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
780 cigar_vector[1] = cigar{static_cast<uint32_t>(std::ranges::distance(get<1>(align))), 'N'_cigar_operation};
781 }
782
783 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
784
785 // Compute the value for the l_read_name field for the bam record.
786 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
787 // the data type to store the value is uint8_t and 255 is the maximal size.
788 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
789 // 2 bytes.
790 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
791 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
792
793 alignment_record_core core
794 {
795 /* block_size */ 0, // will be initialised right after
796 /* refID */ -1, // will be initialised right after
797 /* pos */ ref_offset.value_or(-1),
798 /* l_read_name */ read_name_size,
799 /* mapq */ mapq,
800 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
801 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
802 /* flag */ flag,
803 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
804 /* next_refId */ -1, // will be initialised right after
805 /* next_pos */ get<1>(mate).value_or(-1),
806 /* tlen */ get<2>(mate)
807 };
808
809 auto check_and_assign_id_to = [&header] ([[maybe_unused]] auto & id_source,
810 [[maybe_unused]] auto & id_target)
811 {
812 using id_t = std::remove_reference_t<decltype(id_source)>;
813
814 if constexpr (!detail::decays_to_ignore_v<id_t>)
815 {
816 if constexpr (std::integral<id_t>)
817 {
818 id_target = id_source;
819 }
820 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
821 {
822 id_target = id_source.value_or(-1);
823 }
824 else
825 {
826 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
827 {
828 auto id_it = header.ref_dict.end();
829
830 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
831 std::ranges::sized_range<decltype(id_source)> &&
832 std::ranges::borrowed_range<decltype(id_source)>)
833 {
834 id_it = header.ref_dict.find(std::span{std::ranges::data(id_source),
835 std::ranges::size(id_source)});
836 }
837 else
838 {
839 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
840
841 static_assert(implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
842 "The ref_id type is not convertible to the reference id information stored in the "
843 "reference dictionary of the header object.");
844
845 id_it = header.ref_dict.find(id_source);
846 }
847
848 if (id_it == header.ref_dict.end())
849 {
850 throw format_error{detail::to_string("Unknown reference name '", id_source, "' could "
851 "not be found in BAM header ref_dict: ",
852 header.ref_dict, ".")};
853 }
854
855 id_target = id_it->second;
856 }
857 }
858 }
859 };
860
861 // initialise core.refID
862 check_and_assign_id_to(ref_id, core.refID);
863
864 // initialise core.next_refID
865 check_and_assign_id_to(get<0>(mate), core.next_refID);
866
867 // initialise core.block_size
868 core.block_size = sizeof(core) - 4/*block_size excluded*/ +
869 core.l_read_name +
870 core.n_cigar_op * 4 + // each int32_t has 4 bytes
871 (core.l_seq + 1) / 2 + // bitcompressed seq
872 core.l_seq + // quality string
873 tag_dict_binary_str.size();
874
875 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
876
877 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
878 stream_it = '*';
879 else
880 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
881 stream_it = '\0';
882
883 // write cigar
884 for (auto [cigar_count, op] : cigar_vector)
885 {
886 cigar_count = cigar_count << 4;
887 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
888 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
889 }
890
891 // write seq (bit-compressed: dna16sam characters go into one byte)
892 using alph_t = std::ranges::range_value_t<seq_type>;
893 constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
894
895 auto sit = std::ranges::begin(seq);
896 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
897 {
898 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
899 ++sidx, ++sit;
900 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
901 stream_it = static_cast<char>(compressed_chr);
902 }
903
904 if (core.l_seq & 1) // write one more
905 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
906
907 // write qual
908 if (std::ranges::empty(qual))
909 {
910 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
911 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
912 }
913 else
914 {
915 if (std::ranges::distance(qual) != core.l_seq)
916 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
917 core.l_seq, ". Got quality with size ",
918 std::ranges::distance(qual), " instead.")};
919
920 auto v = qual | std::views::transform([] (auto chr) { return static_cast<char>(to_rank(chr)); });
921 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
922 }
923
924 // write optional fields
925 stream << tag_dict_binary_str;
926 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
927}
928
930template <typename stream_view_type, typename value_type>
931inline void format_bam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
932 stream_view_type && stream_view,
933 value_type const & SEQAN3_DOXYGEN_ONLY(value))
934{
935 int32_t count;
936 read_field(stream_view, count); // read length of vector
937 std::vector<value_type> tmp_vector;
938 tmp_vector.reserve(count);
939
940 while (count > 0)
941 {
942 value_type tmp{};
943 read_field(stream_view, tmp);
944 tmp_vector.push_back(std::move(tmp));
945 --count;
946 }
947 variant = std::move(tmp_vector);
948}
949
967template <typename stream_view_type>
968inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
969{
970 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
971 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
972 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
973 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
974 by the length (int32_t) of the array, followed by the values.
975 */
976 auto it = std::ranges::begin(stream_view);
977 uint16_t tag = static_cast<uint16_t>(*it) << 8;
978 ++it; // skip char read before
979
980 tag += static_cast<uint16_t>(*it);
981 ++it; // skip char read before
982
983 char type_id = *it;
984 ++it; // skip char read before
985
986 switch (type_id)
987 {
988 case 'A' : // char
989 {
990 target[tag] = *it;
991 ++it; // skip char that has been read
992 break;
993 }
994 // all integer sizes are possible
995 case 'c' : // int8_t
996 {
997 int8_t tmp;
998 read_field(stream_view, tmp);
999 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1000 break;
1001 }
1002 case 'C' : // uint8_t
1003 {
1004 uint8_t tmp;
1005 read_field(stream_view, tmp);
1006 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1007 break;
1008 }
1009 case 's' : // int16_t
1010 {
1011 int16_t tmp;
1012 read_field(stream_view, tmp);
1013 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1014 break;
1015 }
1016 case 'S' : // uint16_t
1017 {
1018 uint16_t tmp;
1019 read_field(stream_view, tmp);
1020 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1021 break;
1022 }
1023 case 'i' : // int32_t
1024 {
1025 int32_t tmp;
1026 read_field(stream_view, tmp);
1027 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1028 break;
1029 }
1030 case 'I' : // uint32_t
1031 {
1032 uint32_t tmp;
1033 read_field(stream_view, tmp);
1034 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1035 break;
1036 }
1037 case 'f' : // float
1038 {
1039 float tmp;
1040 read_field(stream_view, tmp);
1041 target[tag] = tmp;
1042 break;
1043 }
1044 case 'Z' : // string
1045 {
1046 string_buffer.clear();
1047 while (!is_char<'\0'>(*it))
1048 {
1049 string_buffer.push_back(*it);
1050 ++it;
1051 }
1052 ++it; // skip \0
1053 target[tag] = string_buffer;
1054 break;
1055 }
1056 case 'H' : // byte array, represented as null-terminated string; specification requires even number of bytes
1057 {
1058 std::vector<std::byte> byte_array;
1059 std::byte value;
1060 while (!is_char<'\0'>(*it))
1061 {
1062 string_buffer.clear();
1063 string_buffer.push_back(*it);
1064 ++it;
1065
1066 if (*it == '\0')
1067 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1068
1069 string_buffer.push_back(*it);
1070 ++it;
1071 read_field(string_buffer, value);
1072 byte_array.push_back(value);
1073 }
1074 ++it; // skip \0
1075 target[tag] = byte_array;
1076 break;
1077 }
1078 case 'B' : // Array. Value type depends on second char [cCsSiIf]
1079 {
1080 char array_value_type_id = *it;
1081 ++it; // skip char read before
1082
1083 switch (array_value_type_id)
1084 {
1085 case 'c' : // int8_t
1086 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1087 break;
1088 case 'C' : // uint8_t
1089 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1090 break;
1091 case 's' : // int16_t
1092 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1093 break;
1094 case 'S' : // uint16_t
1095 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1096 break;
1097 case 'i' : // int32_t
1098 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1099 break;
1100 case 'I' : // uint32_t
1101 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1102 break;
1103 case 'f' : // float
1104 read_sam_dict_vector(target[tag], stream_view, float{});
1105 break;
1106 default:
1107 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1108 "must be one of [cCsSiIf] but '", array_value_type_id, "' was given.")};
1109 }
1110 break;
1111 }
1112 default:
1113 throw format_error{detail::to_string("The second character in the numerical id of a "
1114 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id, "' was given.")};
1115 }
1116}
1117
1132template <typename cigar_input_type>
1133inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1134{
1135 std::vector<cigar> operations{};
1136 char operation{'\0'};
1137 uint32_t count{};
1138 int32_t ref_length{}, seq_length{};
1139 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1140 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1141 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1142
1143 if (n_cigar_op == 0) // [[unlikely]]
1144 return std::tuple{operations, ref_length, seq_length};
1145
1146 // parse the rest of the cigar
1147 // -------------------------------------------------------------------------------------------------------------
1148 while (n_cigar_op > 0) // until stream is not empty
1149 {
1150 std::ranges::copy_n(std::ranges::begin(cigar_input),
1151 sizeof(operation_and_count),
1152 reinterpret_cast<char*>(&operation_and_count));
1153 operation = cigar_mapping[operation_and_count & cigar_mask];
1154 count = operation_and_count >> 4;
1155
1156 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1157 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1158 --n_cigar_op;
1159 }
1160
1161 return std::tuple{operations, ref_length, seq_length};
1162}
1163
1167inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary const & tag_dict)
1168{
1169 std::string result{};
1170
1171 auto stream_variant_fn = [&result] (auto && arg) // helper to print a std::variant
1172 {
1173 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1174 using T = std::remove_cvref_t<decltype(arg)>;
1175
1176 if constexpr (std::same_as<T, int32_t>)
1177 {
1178 // always choose the smallest possible representation [cCsSiI]
1179 size_t const absolute_arg = std::abs(arg);
1180 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1181 bool const negative = arg < 0;
1182 n = n * n + 2 * negative; // for switch case order
1183
1184 switch (n)
1185 {
1186 case 0:
1187 {
1188 result[result.size() - 1] = 'C';
1189 result.append(reinterpret_cast<char const *>(&arg), 1);
1190 break;
1191 }
1192 case 1:
1193 {
1194 result[result.size() - 1] = 'S';
1195 result.append(reinterpret_cast<char const *>(&arg), 2);
1196 break;
1197 }
1198 case 2:
1199 {
1200 result[result.size() - 1] = 'c';
1201 int8_t tmp = static_cast<int8_t>(arg);
1202 result.append(reinterpret_cast<char const *>(&tmp), 1);
1203 break;
1204 }
1205 case 3:
1206 {
1207 result[result.size() - 1] = 's';
1208 int16_t tmp = static_cast<int16_t>(arg);
1209 result.append(reinterpret_cast<char const *>(&tmp), 2);
1210 break;
1211 }
1212 default:
1213 {
1214 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1215 break;
1216 }
1217 }
1218 }
1219 else if constexpr (std::same_as<T, std::string>)
1220 {
1221 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1/*+ null character*/);
1222 }
1223 else if constexpr (!std::ranges::range<T>) // char, float
1224 {
1225 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1226 }
1227 else // std::vector of some arithmetic_type type
1228 {
1229 int32_t sz{static_cast<int32_t>(arg.size())};
1230 result.append(reinterpret_cast<char *>(&sz), 4);
1231 result.append(reinterpret_cast<char const *>(arg.data()),
1232 arg.size() * sizeof(std::ranges::range_value_t<T>));
1233 }
1234 };
1235
1236 for (auto & [tag, variant] : tag_dict)
1237 {
1238 result.push_back(static_cast<char>(tag / 256));
1239 result.push_back(static_cast<char>(tag % 256));
1240
1241 result.push_back(detail::sam_tag_type_char[variant.index()]);
1242
1243 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1244 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1245
1246 std::visit(stream_variant_fn, variant);
1247 }
1248
1249 return result;
1250}
1251
1252} // namespace seqan3
T begin(T... args)
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The BAM format.
Definition: format_bam.hpp:50
format_bam()=default
Defaulted.
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:616
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:277
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
format_bam(format_bam &&)=default
Defaulted.
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:67
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:35
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:160
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:121
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:157
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
T clear(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:162
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
Provides seqan3::debug_stream and related types.
T push_back(T... args)
The <ranges> header from C++20's standard library.
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)