SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_bam.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <bit>
13#include <cstring>
14#include <iterator>
15#include <ranges>
16#include <string>
17#include <vector>
18
32
33namespace seqan3
34{
35
49{
50public:
54 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
55 format_bam() = default;
56 format_bam(format_bam const &) = default;
57 format_bam & operator=(format_bam const &) = default;
58 format_bam(format_bam &&) = default;
60 ~format_bam() = default;
61
63
66
67protected:
68 template <typename stream_type, // constraints checked by file
69 typename seq_legal_alph_type,
70 typename ref_seqs_type,
71 typename ref_ids_type,
72 typename stream_pos_type,
73 typename seq_type,
74 typename id_type,
75 typename ref_seq_type,
76 typename ref_id_type,
77 typename ref_offset_type,
78 typename cigar_type,
79 typename flag_type,
80 typename mapq_type,
81 typename qual_type,
82 typename mate_type,
83 typename tag_dict_type,
84 typename e_value_type,
85 typename bit_score_type>
86 void read_alignment_record(stream_type & stream,
88 ref_seqs_type & ref_seqs,
90 stream_pos_type & position_buffer,
91 seq_type & seq,
92 qual_type & qual,
93 id_type & id,
94 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
95 ref_id_type & ref_id,
96 ref_offset_type & ref_offset,
97 cigar_type & cigar_vector,
98 flag_type & flag,
99 mapq_type & mapq,
100 mate_type & mate,
101 tag_dict_type & tag_dict,
102 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
103 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
104
105 template <typename stream_type,
106 typename header_type,
107 typename seq_type,
108 typename id_type,
109 typename ref_seq_type,
110 typename ref_id_type,
111 typename cigar_type,
112 typename qual_type,
113 typename mate_type,
114 typename tag_dict_type>
115 void write_alignment_record([[maybe_unused]] stream_type & stream,
116 [[maybe_unused]] sam_file_output_options const & options,
117 [[maybe_unused]] header_type && header,
118 [[maybe_unused]] seq_type && seq,
119 [[maybe_unused]] qual_type && qual,
120 [[maybe_unused]] id_type && id,
121 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
122 [[maybe_unused]] ref_id_type && ref_id,
123 [[maybe_unused]] std::optional<int32_t> ref_offset,
124 [[maybe_unused]] cigar_type && cigar_vector,
125 [[maybe_unused]] sam_flag const flag,
126 [[maybe_unused]] uint8_t const mapq,
127 [[maybe_unused]] mate_type && mate,
128 [[maybe_unused]] tag_dict_type && tag_dict,
129 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
130 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
131
133 template <typename stream_t, typename header_type>
134 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
135
136private:
138 bool header_was_read{false};
139
142
145 { // naming corresponds to official SAM/BAM specifications
146 int32_t block_size;
147 int32_t refID;
148 int32_t pos;
149 uint32_t l_read_name : 8;
150 uint32_t mapq : 8;
151 uint32_t bin : 16;
152 uint32_t n_cigar_op : 16;
154 int32_t l_seq;
155 int32_t next_refID;
156 int32_t next_pos;
157 int32_t tlen;
158 };
159
160 static_assert(sizeof(alignment_record_core) == 36);
161
162 // clang-format off
165 {
166 []() constexpr {
168
169 using index_t = std::make_unsigned_t<char>;
170
171 // ret['M'] = 0; set anyway by initialization
172 ret[static_cast<index_t>('I')] = 1;
173 ret[static_cast<index_t>('D')] = 2;
174 ret[static_cast<index_t>('N')] = 3;
175 ret[static_cast<index_t>('S')] = 4;
176 ret[static_cast<index_t>('H')] = 5;
177 ret[static_cast<index_t>('P')] = 6;
178 ret[static_cast<index_t>('=')] = 7;
179 ret[static_cast<index_t>('X')] = 8;
180
181 return ret;
182 }()
183 };
184 // clang-format on
185
187 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
188 {
189 --end;
190 if (beg >> 14 == end >> 14)
191 return ((1 << 15) - 1) / 7 + (beg >> 14);
192 if (beg >> 17 == end >> 17)
193 return ((1 << 12) - 1) / 7 + (beg >> 17);
194 if (beg >> 20 == end >> 20)
195 return ((1 << 9) - 1) / 7 + (beg >> 20);
196 if (beg >> 23 == end >> 23)
197 return ((1 << 6) - 1) / 7 + (beg >> 23);
198 if (beg >> 26 == end >> 26)
199 return ((1 << 3) - 1) / 7 + (beg >> 26);
200 return 0;
201 }
202
209 template <typename stream_view_type, std::integral number_type>
210 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
211 {
212 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
213 }
214
216 template <std::integral number_type>
217 void read_integral_byte_field(std::string_view const str, number_type & target)
218 {
219 std::memcpy(&target, str.data(), sizeof(target));
220 }
221
227 template <typename stream_view_type>
228 void read_float_byte_field(stream_view_type && stream_view, float & target)
229 {
230 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
231 }
232
233 template <typename value_type>
235 std::string_view const str,
236 value_type const & SEQAN3_DOXYGEN_ONLY(value));
237
238 void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary & target);
239
241
242 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
243};
244
246template <typename stream_type, // constraints checked by file
247 typename seq_legal_alph_type,
248 typename ref_seqs_type,
249 typename ref_ids_type,
250 typename stream_pos_type,
251 typename seq_type,
252 typename id_type,
253 typename ref_seq_type,
254 typename ref_id_type,
255 typename ref_offset_type,
256 typename cigar_type,
257 typename flag_type,
258 typename mapq_type,
259 typename qual_type,
260 typename mate_type,
261 typename tag_dict_type,
262 typename e_value_type,
263 typename bit_score_type>
264inline void format_bam::read_alignment_record(stream_type & stream,
266 ref_seqs_type & ref_seqs,
268 stream_pos_type & position_buffer,
269 seq_type & seq,
270 qual_type & qual,
271 id_type & id,
272 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
273 ref_id_type & ref_id,
274 ref_offset_type & ref_offset,
275 cigar_type & cigar_vector,
276 flag_type & flag,
277 mapq_type & mapq,
278 mate_type & mate,
279 tag_dict_type & tag_dict,
280 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
281 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
282{
283 static_assert(detail::decays_to_ignore_v<ref_offset_type>
284 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
285 "The ref_offset must be a specialisation of std::optional.");
286
287 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
288 "The type of field::mapq must be uint8_t.");
289
290 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
291 "The type of field::flag must be seqan3::sam_flag.");
292
293 auto stream_view = seqan3::detail::istreambuf(stream);
294
295 // Header
296 // -------------------------------------------------------------------------------------------------------------
297 if (!header_was_read)
298 {
299 // magic BAM string
301 throw format_error{"File is not in BAM format."};
302
303 int32_t l_text{}; // length of header text including \0 character
304 int32_t n_ref{}; // number of reference sequences
305 int32_t l_name{}; // 1 + length of reference name including \0 character
306 int32_t l_ref{}; // length of reference sequence
307
308 read_integral_byte_field(stream_view, l_text);
309
310 if (l_text > 0) // header text is present
311 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs, options);
312
313 read_integral_byte_field(stream_view, n_ref);
314
315 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
316 {
317 read_integral_byte_field(stream_view, l_name);
318
319 string_buffer.resize(l_name - 1);
321 l_name - 1,
322 string_buffer.data()); // copy without \0 character
323 ++std::ranges::begin(stream_view); // skip \0 character
324
325 read_integral_byte_field(stream_view, l_ref);
326
327 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
328 {
329 // If there was no header text, we parse reference sequences block as header information
330 if (l_text == 0)
331 {
332 auto & reference_ids = header.ref_ids();
333 // put the length of the reference sequence into ref_id_info
334 header.ref_id_info.emplace_back(l_ref, "");
335 // put the reference name into reference_ids
336 reference_ids.push_back(string_buffer);
337 // assign the reference name an ascending reference id (starts at index 0).
338 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
339 continue;
340 }
341 }
342
343 auto id_it = header.ref_dict.find(string_buffer);
344
345 // sanity checks of reference information to existing header object:
346 if (id_it == header.ref_dict.end()) // [unlikely]
347 {
348 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
349 + "' found in BAM file header (header.ref_ids():",
350 header.ref_ids(),
351 ").")};
352 }
353 else if (id_it->second != ref_idx) // [unlikely]
354 {
355 throw format_error{detail::to_string("Reference id '",
357 "' at position ",
358 ref_idx,
359 " does not correspond to the position ",
360 id_it->second,
361 " in the header (header.ref_ids():",
362 header.ref_ids(),
363 ").")};
364 }
365 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
366 {
367 throw format_error{"Provided reference has unequal length as specified in the header."};
368 }
369 }
370
371 header_was_read = true;
372
373 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
374 return;
375 }
376
377 // read alignment record into buffer
378 // -------------------------------------------------------------------------------------------------------------
379 position_buffer = stream.tellg();
380
381 auto stream_it = detail::fast_istreambuf_iterator{*stream.rdbuf()};
382
384 std::string_view const core_str = stream_it.cache_bytes(sizeof(core));
385 std::ranges::copy(core_str, reinterpret_cast<char *>(&core));
386
387 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
388 {
389 throw format_error{detail::to_string("Reference id index '",
390 core.refID,
391 "' is not in range of ",
392 "header.ref_ids(), which has size ",
393 header.ref_ids().size(),
394 ".")};
395 }
396 else if (core.refID > -1) // not unmapped
397 {
398 ref_id = core.refID; // field::ref_id
399 }
400
401 flag = core.flag; // field::flag
402 mapq = static_cast<uint8_t>(core.mapq); // field::mapq
403
404 if (core.pos > -1) // [[likely]]
405 ref_offset = core.pos; // field::ref_offset
406
407 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
408 {
409 if (core.next_refID > -1)
410 get<0>(mate) = core.next_refID;
411
412 if (core.next_pos > -1) // [[likely]]
413 get<1>(mate) = core.next_pos;
414
415 get<2>(mate) = core.tlen;
416 }
417
418 // read id
419 // -------------------------------------------------------------------------------------------------------------
420 std::string_view record_str = stream_it.cache_bytes(core.block_size - (sizeof(alignment_record_core) - 4));
421 size_t considered_bytes{0};
422
423 if constexpr (!detail::decays_to_ignore_v<id_type>)
424 read_forward_range_field(record_str.substr(0, core.l_read_name - 1), id);
425
426 considered_bytes += core.l_read_name;
427
428 // read cigar string
429 // -------------------------------------------------------------------------------------------------------------
430 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
431 cigar_vector = parse_binary_cigar(record_str.substr(considered_bytes, core.n_cigar_op * 4));
432
433 considered_bytes += core.n_cigar_op * 4;
434
435 // read sequence
436 // -------------------------------------------------------------------------------------------------------------
437 if constexpr (!detail::decays_to_ignore_v<seq_type>)
438 {
439 size_t const number_of_bytes = (core.l_seq + 1) / 2;
440 std::string_view const seq_str = record_str.substr(considered_bytes, number_of_bytes);
441
442 seq.resize(
443 core.l_seq
444 + 1 /* reserve one more in case size is uneven. will be corrected */); // TODO: .resize() is not generic
445
446 using alph_t = std::ranges::range_value_t<decltype(seq)>;
447 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
448
449 // 1 byte encodes two sequence characters
450 for (size_t i = 0, j = 0; i < number_of_bytes; ++i, j += 2)
451 {
452 seq[j] = from_dna16[to_rank(dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(seq_str[i]) >> 4)))];
453 seq[j + 1] =
454 from_dna16[to_rank(dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(seq_str[i]) & 0x0f)))];
455 }
456
457 seq.resize(core.l_seq); // remove extra letter
458 }
459
460 considered_bytes += (core.l_seq + 1) / 2;
461
462 // read qual string
463 // -------------------------------------------------------------------------------------------------------------
464 if constexpr (!detail::decays_to_ignore_v<qual_type>)
465 {
466 std::string_view const qual_str = record_str.substr(considered_bytes, core.l_seq);
467 qual.resize(core.l_seq); // TODO: this is not generic
468
469 for (int32_t i = 0; i < core.l_seq; ++i)
470 qual[i] = assign_char_to(static_cast<char>(qual_str[i] + 33), std::ranges::range_value_t<qual_type>{});
471 }
472
473 considered_bytes += core.l_seq;
474
475 // All remaining optional fields if any: SAM tags dictionary
476 // -------------------------------------------------------------------------------------------------------------
477 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
478 read_sam_dict(record_str.substr(considered_bytes), tag_dict);
479
480 // DONE READING - wrap up
481 // -------------------------------------------------------------------------------------------------------------
482 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
483 {
484 int32_t const sc_front = soft_clipping_at_front(cigar_vector);
485
486 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
487 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
488 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
489 if (core.l_seq != 0 && sc_front == core.l_seq)
490 {
491 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
492 { // maybe only throw in debug mode and otherwise return an empty alignment?
493 throw format_error{
494 detail::to_string("The cigar string '",
495 detail::get_cigar_string(cigar_vector),
496 "' suggests that the cigar string exceeded 65535 elements and was therefore ",
497 "stored in the optional field CG. You need to read in the field::tags and "
498 "field::seq in order to access this information.")};
499 }
500 else
501 {
502 auto it = tag_dict.find("CG"_tag);
503
504 if (it == tag_dict.end())
505 throw format_error{
506 detail::to_string("The cigar string '",
507 detail::get_cigar_string(cigar_vector),
508 "' suggests that the cigar string exceeded 65535 elements and was therefore ",
509 "stored in the optional field CG but this tag is not present in the given ",
510 "record.")};
511
512 cigar_vector = detail::parse_cigar(std::get<std::string>(it->second));
513 tag_dict.erase(it); // remove redundant information
514 }
515 }
516 }
517}
518
520template <typename stream_type,
521 typename header_type,
522 typename seq_type,
523 typename id_type,
524 typename ref_seq_type,
525 typename ref_id_type,
526 typename cigar_type,
527 typename qual_type,
528 typename mate_type,
529 typename tag_dict_type>
530inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
531 [[maybe_unused]] sam_file_output_options const & options,
532 [[maybe_unused]] header_type && header,
533 [[maybe_unused]] seq_type && seq,
534 [[maybe_unused]] qual_type && qual,
535 [[maybe_unused]] id_type && id,
536 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
537 [[maybe_unused]] ref_id_type && ref_id,
538 [[maybe_unused]] std::optional<int32_t> ref_offset,
539 [[maybe_unused]] cigar_type && cigar_vector,
540 [[maybe_unused]] sam_flag const flag,
541 [[maybe_unused]] uint8_t const mapq,
542 [[maybe_unused]] mate_type && mate,
543 [[maybe_unused]] tag_dict_type && tag_dict,
544 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
545 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
546{
547 // ---------------------------------------------------------------------
548 // Type Requirements (as static asserts for user friendliness)
549 // ---------------------------------------------------------------------
550 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
551 "The seq object must be a std::ranges::forward_range over "
552 "letters that model seqan3::alphabet.");
553
554 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
555 "The id object must be a std::ranges::forward_range over "
556 "letters that model seqan3::alphabet.");
557
558 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
559 "The ref_seq object must be a std::ranges::forward_range "
560 "over letters that model seqan3::alphabet.");
561
562 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
563 {
564 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
565 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
566 "The ref_id object must be a std::ranges::forward_range "
567 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
568 }
569
570 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
571 "The qual object must be a std::ranges::forward_range "
572 "over letters that model seqan3::alphabet.");
573
575 "The mate object must be a std::tuple of size 3 with "
576 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
577 "2) a std::integral or std::optional<std::integral>, and "
578 "3) a std::integral.");
579
580 static_assert(
581 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
582 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
583 || detail::is_type_specialisation_of_v<
584 std::remove_cvref_t<decltype(std::get<0>(mate))>,
585 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
586 || detail::is_type_specialisation_of_v<
587 std::remove_cvref_t<decltype(std::get<1>(mate))>,
588 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
589 "The mate object must be a std::tuple of size 3 with "
590 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
591 "2) a std::integral or std::optional<std::integral>, and "
592 "3) a std::integral.");
593
594 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
595 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
596
597 if constexpr (detail::decays_to_ignore_v<header_type>)
598 {
599 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
600 "You can either construct the output file with reference names and reference length "
601 "information and the header will be created for you, or you can access the `header` member "
602 "directly."};
603 }
604 else
605 {
606 // ---------------------------------------------------------------------
607 // logical Requirements
608 // ---------------------------------------------------------------------
609
610 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
611 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
612
613 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
614
615 // ---------------------------------------------------------------------
616 // Writing the BAM Header on first call
617 // ---------------------------------------------------------------------
619 {
620 write_header(stream, options, header);
621 header_was_written = true;
622 }
623
624 // ---------------------------------------------------------------------
625 // Writing the Record
626 // ---------------------------------------------------------------------
627 int32_t ref_length{};
628
629 // Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
630 if (!std::ranges::empty(cigar_vector))
631 {
632 int32_t dummy_seq_length{};
633 for (auto & [count, operation] : cigar_vector)
634 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
635 }
636
637 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
638 {
639 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
640 cigar_vector.resize(2);
641 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
642 cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
643 }
644
645 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
646
647 // Compute the value for the l_read_name field for the bam record.
648 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
649 // the data type to store the value is uint8_t and 255 is the maximal size.
650 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
651 // 2 bytes.
652 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
653 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
654
655 alignment_record_core core{/* block_size */ 0, // will be initialised right after
656 /* refID */ -1, // will be initialised right after
657 /* pos */ ref_offset.value_or(-1),
658 /* l_read_name */ read_name_size,
659 /* mapq */ mapq,
660 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
661 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
662 /* flag */ flag,
663 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
664 /* next_refId */ -1, // will be initialised right after
665 /* next_pos */ get<1>(mate).value_or(-1),
666 /* tlen */ get<2>(mate)};
667
668 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
669 {
670 using id_t = std::remove_reference_t<decltype(id_source)>;
671
672 if constexpr (!detail::decays_to_ignore_v<id_t>)
673 {
674 if constexpr (std::integral<id_t>)
675 {
676 id_target = id_source;
677 }
678 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
679 {
680 id_target = id_source.value_or(-1);
681 }
682 else
683 {
684 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
685 {
686 auto id_it = header.ref_dict.end();
687
688 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
689 && std::ranges::sized_range<decltype(id_source)>
690 && std::ranges::borrowed_range<decltype(id_source)>)
691 {
692 id_it = header.ref_dict.find(
693 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
694 }
695 else
696 {
697 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
698
699 static_assert(
700 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
701 "The ref_id type is not convertible to the reference id information stored in the "
702 "reference dictionary of the header object.");
703
704 id_it = header.ref_dict.find(id_source);
705 }
706
707 if (id_it == header.ref_dict.end())
708 {
709 throw format_error{detail::to_string("Unknown reference name '",
710 id_source,
711 "' could "
712 "not be found in BAM header ref_dict: ",
713 header.ref_dict,
714 ".")};
715 }
716
717 id_target = id_it->second;
718 }
719 }
720 }
721 };
722
723 // initialise core.refID
724 check_and_assign_id_to(ref_id, core.refID);
725
726 // initialise core.next_refID
727 check_and_assign_id_to(get<0>(mate), core.next_refID);
728
729 // initialise core.block_size
730 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
731 + // each int32_t has 4 bytes
732 (core.l_seq + 1) / 2 + // bitcompressed seq
733 core.l_seq + // quality string
734 tag_dict_binary_str.size();
735
736 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
737
738 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
739 stream_it = '*';
740 else
741 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
742 stream_it = '\0';
743
744 // write cigar
745 for (auto [cigar_count, op] : cigar_vector)
746 {
747 cigar_count = cigar_count << 4;
748 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
749 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
750 }
751
752 // write seq (bit-compressed: dna16sam characters go into one byte)
753 using alph_t = std::ranges::range_value_t<seq_type>;
754 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
755
756 auto sit = std::ranges::begin(seq);
757 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
758 {
759 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
760 ++sidx, ++sit;
761 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
762 stream_it = static_cast<char>(compressed_chr);
763 }
764
765 if (core.l_seq & 1) // write one more
766 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
767
768 // write qual
769 if (std::ranges::empty(qual))
770 {
771 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
772 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
773 }
774 else
775 {
776 if (std::ranges::distance(qual) != core.l_seq)
777 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
778 core.l_seq,
779 ". Got quality with size ",
780 std::ranges::distance(qual),
781 " instead.")};
782
783 auto v = qual
784 | std::views::transform(
785 [](auto chr)
786 {
787 return static_cast<char>(to_rank(chr));
788 });
789 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
790 }
791
792 // write optional fields
793 stream << tag_dict_binary_str;
794 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
795}
796
798template <typename stream_t, typename header_type>
799inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
800{
801 if constexpr (detail::decays_to_ignore_v<header_type>)
802 {
803 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
804 "You can either construct the output file with reference names and reference length "
805 "information and the header will be created for you, or you can access the `header` member "
806 "directly."};
807 }
808 else
809 {
810 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
811
812 std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator
813
814 // write SAM header to temporary stream first to query its size.
816 detail::format_sam_base::write_header(os, options, header);
817#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI
818 int32_t const l_text{static_cast<int32_t>(os.str().size())};
819#else
820 int32_t const l_text{static_cast<int32_t>(os.view().size())};
821#endif
822 std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length
823
824#if SEQAN3_WORKAROUND_GCC_NO_CXX11_ABI
825 auto header_view = os.str();
826#else
827 auto header_view = os.view();
828#endif
829 std::ranges::copy(header_view, stream_it);
830
831 assert(header.ref_ids().size() < (1ull << 32));
832 int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
833 std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references
834
835 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
836 {
837 assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
838 int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
839 std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
840 // write reference name:
841 std::ranges::copy(header.ref_ids()[ridx], stream_it);
842 stream_it = '\0'; // ++ is not necessary for ostream_iterator
843 // write reference sequence length:
844 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
845 }
846 }
847}
848
867template <typename value_type>
869 std::string_view const str,
870 value_type const & SEQAN3_DOXYGEN_ONLY(value))
871{
872 auto it = str.begin();
873
874 // Read vector size from string_view and advance `it`.
875 int32_t const vector_size = [&]()
876 {
877 int32_t size{};
879 it += sizeof(size);
880 return size;
881 }();
882
883 int32_t bytes_left{vector_size};
884
885 std::vector<value_type> tmp_vector;
886 tmp_vector.reserve(vector_size);
887
888 value_type tmp{};
889
890 while (bytes_left > 0)
891 {
892 if constexpr (std::integral<value_type>)
894 else if constexpr (std::same_as<value_type, float>)
896 else
897 static_assert(std::is_same_v<value_type, void>, "format_bam::read_sam_dict_vector: unsupported value_type");
898
899 it += sizeof(tmp);
900 tmp_vector.push_back(std::move(tmp));
901 --bytes_left;
902 }
903
904 variant = std::move(tmp_vector);
905
906 return vector_size;
907}
908
925{
926 /* Every BAM tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
927 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
928 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
929 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
930 by the length (int32_t) of the array, followed by the values.
931 */
932 auto it = tag_str.begin();
933
934 // Deduces int_t from passed argument.
935 auto parse_integer_into_target = [&]<std::integral int_t>(uint16_t const tag, int_t)
936 {
937 int_t tmp{};
938 read_integral_byte_field(std::string_view{it, tag_str.end()}, tmp);
939 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
940 it += sizeof(tmp);
941 };
942
943 // Deduces array_value_t from passed argument.
944 auto parse_array_into_target = [&]<arithmetic array_value_t>(uint16_t const tag, array_value_t)
945 {
946 int32_t const count = read_sam_dict_vector(target[tag], std::string_view{it, tag_str.end()}, array_value_t{});
947 it += sizeof(int32_t) /*length is stored within the vector*/ + sizeof(array_value_t) * count;
948 };
949
950 // Read uint16_t from string_view and advance `it`.
951 auto parse_tag = [&]()
952 {
953 uint16_t tag = static_cast<uint16_t>(*it) << 8;
954 ++it; // skip char read before
955 tag |= static_cast<uint16_t>(*it);
956 ++it; // skip char read before
957 return tag;
958 };
959
960 while (it != tag_str.end())
961 {
962 uint16_t const tag = parse_tag();
963
964 char const type_id{*it};
965 ++it; // skip char read before
966
967 switch (type_id)
968 {
969 case 'A': // char
970 {
971 target[tag] = *it;
972 ++it; // skip char that has been read
973 break;
974 }
975 // all integer sizes are possible
976 case 'c': // int8_t
977 {
978 parse_integer_into_target(tag, int8_t{});
979 break;
980 }
981 case 'C': // uint8_t
982 {
983 parse_integer_into_target(tag, uint8_t{});
984 break;
985 }
986 case 's': // int16_t
987 {
988 parse_integer_into_target(tag, int16_t{});
989 break;
990 }
991 case 'S': // uint16_t
992 {
993 parse_integer_into_target(tag, uint16_t{});
994 break;
995 }
996 case 'i': // int32_t
997 {
998 parse_integer_into_target(tag, int32_t{});
999 break;
1000 }
1001 case 'I': // uint32_t
1002 {
1003 parse_integer_into_target(tag, uint32_t{});
1004 break;
1005 }
1006 case 'f': // float
1007 {
1008 float tmp{};
1009 read_float_byte_field(std::string_view{it, tag_str.end()}, tmp);
1010 target[tag] = tmp;
1011 it += sizeof(float);
1012 break;
1013 }
1014 case 'Z': // string
1015 {
1016 std::string const v{static_cast<char const *>(it)}; // parses until '\0'
1017 it += v.size() + 1;
1018 target[tag] = std::move(v);
1019 break;
1020 }
1021 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1022 {
1023 std::string_view const str{static_cast<char const *>(it)}; // parses until '\0'
1024
1025 std::vector<std::byte> tmp_vector{};
1026 // std::from_chars cannot directly parse into a std::byte
1027 uint8_t dummy_byte{};
1028
1029 if (str.size() % 2 != 0)
1030 throw format_error{"[CORRUPTED BAM FILE] Hexadecimal tag must have even number of digits."};
1031
1032 // H encodes bytes in a hexadecimal format. Two hex values are stored for each byte as characters.
1033 // E.g., '1' and 'A' need one byte each and are read as `\x1A`, which is 27 in decimal.
1034 for (auto hex_begin = str.begin(), hex_end = str.begin() + 2; hex_begin != str.end();
1035 hex_begin += 2, hex_end += 2)
1036 {
1037 auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);
1038
1039 if (res.ec == std::errc::invalid_argument)
1040 throw format_error{std::string("[CORRUPTED BAM FILE] The string '")
1041 + std::string(hex_begin, hex_end) + "' could not be cast into type uint8_t."};
1042
1043 if (res.ec == std::errc::result_out_of_range)
1044 throw format_error{std::string("[CORRUPTED BAM FILE] Casting '") + std::string(str)
1045 + "' into type uint8_t would cause an overflow."};
1046
1047 tmp_vector.push_back(std::byte{dummy_byte});
1048 }
1049
1050 target[tag] = std::move(tmp_vector);
1051
1052 it += str.size() + 1;
1053
1054 break;
1055 }
1056 case 'B': // Array. Value type depends on second char [cCsSiIf]
1057 {
1058 char array_value_type_id = *it;
1059 ++it; // skip char read before
1060
1061 switch (array_value_type_id)
1062 {
1063 case 'c': // int8_t
1064 parse_array_into_target(tag, int8_t{});
1065 break;
1066 case 'C': // uint8_t
1067 parse_array_into_target(tag, uint8_t{});
1068 break;
1069 case 's': // int16_t
1070 parse_array_into_target(tag, int16_t{});
1071 break;
1072 case 'S': // uint16_t
1073 parse_array_into_target(tag, uint16_t{});
1074 break;
1075 case 'i': // int32_t
1076 parse_array_into_target(tag, int32_t{});
1077 break;
1078 case 'I': // uint32_t
1079 parse_array_into_target(tag, uint32_t{});
1080 break;
1081 case 'f': // float
1082 parse_array_into_target(tag, float{});
1083 break;
1084 default:
1085 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1086 "must be one of [cCsSiIf] but '",
1087 array_value_type_id,
1088 "' was given.")};
1089 }
1090 break;
1091 }
1092 default:
1093 throw format_error{detail::to_string("The second character in the numerical id of a "
1094 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1095 type_id,
1096 "' was given.")};
1097 }
1098 }
1099}
1100
1107{
1108 // The cigar operation is encoded in 4 bits.
1109 constexpr std::array<char, 16>
1110 cigar_operation_mapping{'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', '*', '*', '*', '*', '*', '*', '*'};
1111 // The rightmost 4 bits encode the operation, the other bits encode the count.
1112 constexpr uint32_t cigar_operation_mask = 0x0f; // rightmost 4 bits are set to one
1113
1114 std::vector<cigar> cigar_vector{};
1115 char operation{'\0'};
1116 uint32_t count{};
1117 uint32_t operation_and_count{}; // In BAM, operation and count values are stored within one 32 bit integer.
1118
1119 assert(cigar_str.size() % 4 == 0); // One cigar letter is stored in 4 bytes (uint32_t).
1120
1121 for (auto it = cigar_str.begin(); it != cigar_str.end(); it += sizeof(operation_and_count))
1122 {
1123 std::memcpy(&operation_and_count, it, sizeof(operation_and_count));
1124 operation = cigar_operation_mapping[operation_and_count & cigar_operation_mask];
1125 count = operation_and_count >> 4;
1126
1127 cigar_vector.emplace_back(count, seqan3::assign_char_strictly_to(operation, cigar::operation{}));
1128 }
1129
1130 return cigar_vector;
1131}
1132
1137{
1138 std::string result{};
1139
1140 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1141 {
1142 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1143 using T = std::remove_cvref_t<decltype(arg)>;
1144
1145 if constexpr (std::same_as<T, int32_t>)
1146 {
1147 // always choose the smallest possible representation [cCsSiI]
1148 size_t const absolute_arg = std::abs(arg);
1149 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1150 bool const negative = arg < 0;
1151 n = n * n + 2 * negative; // for switch case order
1152
1153 switch (n)
1154 {
1155 case 0:
1156 {
1157 result[result.size() - 1] = 'C';
1158 result.append(reinterpret_cast<char const *>(&arg), 1);
1159 break;
1160 }
1161 case 1:
1162 {
1163 result[result.size() - 1] = 'S';
1164 result.append(reinterpret_cast<char const *>(&arg), 2);
1165 break;
1166 }
1167 case 2:
1168 {
1169 result[result.size() - 1] = 'c';
1170 int8_t tmp = static_cast<int8_t>(arg);
1171 result.append(reinterpret_cast<char const *>(&tmp), 1);
1172 break;
1173 }
1174 case 3:
1175 {
1176 result[result.size() - 1] = 's';
1177 int16_t tmp = static_cast<int16_t>(arg);
1178 result.append(reinterpret_cast<char const *>(&tmp), 2);
1179 break;
1180 }
1181 default:
1182 {
1183 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1184 break;
1185 }
1186 }
1187 }
1188 else if constexpr (std::same_as<T, std::string>)
1189 {
1190 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1191 }
1192 else if constexpr (!std::ranges::range<T>) // char, float
1193 {
1194 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1195 }
1196 else // std::vector of some arithmetic_type type
1197 {
1198 int32_t sz{static_cast<int32_t>(arg.size())};
1199 result.append(reinterpret_cast<char *>(&sz), 4);
1200 result.append(reinterpret_cast<char const *>(arg.data()),
1201 arg.size() * sizeof(std::ranges::range_value_t<T>));
1202 }
1203 };
1204
1205 for (auto & [tag, variant] : tag_dict)
1206 {
1207 result.push_back(static_cast<char>(tag / 256));
1208 result.push_back(static_cast<char>(tag % 256));
1209
1210 result.push_back(detail::sam_tag_type_char[variant.index()]);
1211
1212 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1213 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1214
1215 std::visit(stream_variant_fn, variant);
1216 }
1217
1218 return result;
1219}
1220
1221} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition alphabet_base.hpp:184
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition alphabet/cigar/cigar.hpp:57
Functionally the same as std::istreambuf_iterator, but faster.
Definition fast_istreambuf_iterator.hpp:37
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition fast_ostreambuf_iterator.hpp:37
The alignment base format.
Definition format_sam_base.hpp:43
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition format_sam_base.hpp:621
int32_t soft_clipping_at_front(std::vector< cigar > const &cigar_vector) const
Returns the soft clipping value at the front of the cigar_vector or 0 if none present.
Definition format_sam_base.hpp:147
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition format_sam_base.hpp:189
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition format_sam_base.hpp:64
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &, sam_file_input_options< seq_legal_alph_type > const &options)
Reads the SAM header.
Definition format_sam_base.hpp:279
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition dna16sam.hpp:45
The actual implementation of seqan3::cigar::operation for documentation purposes only.
Definition cigar_operation.hpp:45
The BAM format.
Definition format_bam.hpp:49
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition format_bam.hpp:264
format_bam()=default
Defaulted.
int32_t read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, std::string_view const str, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition format_bam.hpp:868
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition format_bam.hpp:530
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition format_bam.hpp:799
std::vector< cigar > parse_binary_cigar(std::string_view const cigar_str) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition format_bam.hpp:1106
format_bam(format_bam &&)=default
Defaulted.
void read_integral_byte_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition format_bam.hpp:210
void read_integral_byte_field(std::string_view const str, number_type &target)
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition format_bam.hpp:217
void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition format_bam.hpp:924
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
void read_float_byte_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition format_bam.hpp:228
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition format_bam.hpp:1136
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition format_bam.hpp:141
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition format_bam.hpp:187
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition format_bam.hpp:138
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition format_bam.hpp:165
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition format_bam.hpp:65
Stores the header information of SAM/BAM files.
Definition header.hpp:46
ref_ids_type & ref_ids()
The range of reference ids.
Definition header.hpp:140
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition header.hpp:179
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition header.hpp:176
The SAM tag dictionary class that stores all optional SAM fields.
Definition sam_tag_dictionary.hpp:327
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T end(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:521
constexpr auto assign_char_strictly_to
Assign a character to an alphabet object, throw if the character is not valid.
Definition alphabet/concept.hpp:731
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition alphabet/concept.hpp:152
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition sam_flag.hpp:73
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition io/sam_file/detail/cigar.hpp:118
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition sam_tag_dictionary.hpp:42
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition io/sam_file/detail/cigar.hpp:48
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition sam_tag_dictionary.hpp:39
constexpr std::vector< cigar > parse_cigar(std::string_view const cigar_str)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition io/sam_file/detail/cigar.hpp:87
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition take_exactly_view.hpp:587
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition istreambuf_view.hpp:104
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ bit_score
The bit score (statistical significance indicator), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ id
The identifier, usually a string.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:60
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition repeat_n.hpp:88
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
A type that satisfies std::is_arithmetic_v<t>.
Checks whether from can be implicityly converted to to.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
T memcpy(T... args)
T min(T... args)
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition to_string.hpp:26
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition format_bam.hpp:145
uint32_t bin
The bin number.
Definition format_bam.hpp:151
sam_flag flag
The flag value (uint16_t enum).
Definition format_bam.hpp:153
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition format_bam.hpp:152
int32_t next_refID
The reference id of the mate.
Definition format_bam.hpp:155
int32_t l_seq
The number of bases of the read sequence.
Definition format_bam.hpp:154
int32_t refID
The reference id the read was mapped to.
Definition format_bam.hpp:147
int32_t pos
The begin position of the alignment.
Definition format_bam.hpp:148
int32_t block_size
The size in bytes of the whole BAM record.
Definition format_bam.hpp:146
uint32_t l_read_name
The length of the read name including the \0 character.
Definition format_bam.hpp:149
int32_t tlen
The template length of the read and its mate.
Definition format_bam.hpp:157
uint32_t mapq
The mapping quality.
Definition format_bam.hpp:150
int32_t next_pos
The begin position of the mate alignment.
Definition format_bam.hpp:156
Thrown if information given to output format didn't match expectations.
Definition io/exception.hpp:88
The options type defines various option members that influence the behaviour of all or some formats.
Definition sam_file/input_options.hpp:26
The options type defines various option members that influence the behavior of all or some formats.
Definition sam_file/output_options.hpp:23
T substr(T... args)
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::debug_stream and related types.
T visit(T... args)
Hide me