70 template <
typename stream_type,
71 typename seq_legal_alph_type,
72 typename ref_seqs_type,
73 typename ref_ids_type,
77 typename ref_seq_type,
79 typename ref_offset_type,
86 typename tag_dict_type,
87 typename e_value_type,
88 typename bit_score_type>
91 ref_seqs_type & ref_seqs,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
99 ref_offset_type & ref_offset,
101 cigar_type & cigar_vector,
105 tag_dict_type & tag_dict,
106 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
107 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
109 template <
typename stream_type,
110 typename header_type,
113 typename ref_seq_type,
114 typename ref_id_type,
119 typename tag_dict_type>
122 [[maybe_unused]] header_type && header,
123 [[maybe_unused]] seq_type && seq,
124 [[maybe_unused]] qual_type && qual,
125 [[maybe_unused]] id_type &&
id,
126 [[maybe_unused]] int32_t
const offset,
127 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
128 [[maybe_unused]] ref_id_type && ref_id,
130 [[maybe_unused]] align_type && align,
131 [[maybe_unused]] cigar_type && cigar_vector,
132 [[maybe_unused]]
sam_flag const flag,
133 [[maybe_unused]] uint8_t
const mapq,
134 [[maybe_unused]] mate_type && mate,
135 [[maybe_unused]] tag_dict_type && tag_dict,
136 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
137 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
141 bool header_was_read{
false};
147 struct alignment_record_core
152 uint32_t l_read_name:8;
155 uint32_t n_cigar_op:16;
173 ret[
static_cast<index_t
>(
'I')] = 1;
174 ret[
static_cast<index_t
>(
'D')] = 2;
175 ret[
static_cast<index_t
>(
'N')] = 3;
176 ret[
static_cast<index_t
>(
'S')] = 4;
177 ret[
static_cast<index_t
>(
'H')] = 5;
178 ret[
static_cast<index_t
>(
'P')] = 6;
179 ret[
static_cast<index_t
>(
'=')] = 7;
180 ret[
static_cast<index_t
>(
'X')] = 8;
187 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
191 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
192 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
193 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
194 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
198 using format_sam_base::read_field;
206 template <
typename stream_view_type, std::
integral number_type>
207 void read_field(stream_view_type && stream_view, number_type & target)
209 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
217 template <
typename stream_view_type>
218 void read_field(stream_view_type && stream_view,
float & target)
220 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
233 template <
typename stream_view_type,
typename optional_value_type>
236 optional_value_type tmp;
237 read_field(std::forward<stream_view_type>(stream_view), tmp);
241 template <
typename stream_view_type,
typename value_type>
243 stream_view_type && stream_view,
244 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
246 template <
typename stream_view_type>
247 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
249 template <
typename cigar_input_type>
250 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
252 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
256 template <
typename stream_type,
257 typename seq_legal_alph_type,
258 typename ref_seqs_type,
259 typename ref_ids_type,
262 typename offset_type,
263 typename ref_seq_type,
264 typename ref_id_type,
265 typename ref_offset_type,
272 typename tag_dict_type,
273 typename e_value_type,
274 typename bit_score_type>
277 ref_seqs_type & ref_seqs,
282 offset_type & offset,
283 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
284 ref_id_type & ref_id,
285 ref_offset_type & ref_offset,
287 cigar_type & cigar_vector,
291 tag_dict_type & tag_dict,
292 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
293 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
295 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
296 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
297 "The ref_offset must be a specialisation of std::optional.");
299 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
300 "The type of field::mapq must be uint8_t.");
302 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
303 "The type of field::flag must be seqan3::sam_flag.");
305 auto stream_view = seqan3::detail::istreambuf(stream);
308 [[maybe_unused]] int32_t offset_tmp{};
309 [[maybe_unused]] int32_t soft_clipping_end{};
311 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
315 if (!header_was_read)
318 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4),
std::string_view{
"BAM\1"}))
326 read_field(stream_view, l_text);
329 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
331 read_field(stream_view, n_ref);
333 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
335 read_field(stream_view, l_name);
337 string_buffer.
resize(l_name - 1);
338 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.
data());
339 std::ranges::next(std::ranges::begin(stream_view));
341 read_field(stream_view, l_ref);
343 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
348 auto & reference_ids = header.
ref_ids();
352 reference_ids.push_back(string_buffer);
354 header.
ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
359 auto id_it = header.
ref_dict.find(string_buffer);
364 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
365 "' found in BAM file header (header.ref_ids():",
368 else if (id_it->second != ref_idx)
370 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
371 " does not correspond to the position ", id_it->second,
372 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
374 else if (std::get<0>(header.
ref_id_info[id_it->second]) != l_ref)
376 throw format_error{
"Provided reference has unequal length as specified in the header."};
380 header_was_read =
true;
382 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
388 alignment_record_core core;
389 std::ranges::copy(stream_view | detail::take_exactly_or_throw(
sizeof(core)),
reinterpret_cast<char *
>(&core));
391 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
393 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
394 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
396 else if (core.refID > -1)
407 if constexpr (!detail::decays_to_ignore_v<mate_type>)
409 if (core.next_refID > -1)
410 get<0>(
mate) = core.next_refID;
412 if (core.next_pos > -1)
413 get<1>(
mate) = core.next_pos;
415 get<2>(
mate) = core.tlen;
420 read_field(stream_view | detail::take_exactly_or_throw(core.l_read_name - 1),
id);
421 std::ranges::next(std::ranges::begin(stream_view));
425 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
427 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
428 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
433 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
442 auto seq_stream = stream_view
443 | detail::take_exactly_or_throw(core.l_seq / 2)
450 if constexpr (detail::decays_to_ignore_v<seq_type>)
452 auto skip_sequence_bytes = [&] ()
454 detail::consume(seq_stream);
456 std::ranges::next(std::ranges::begin(stream_view));
459 if constexpr (!detail::decays_to_ignore_v<align_type>)
462 "If you want to read ALIGNMENT but not SEQ, the alignment"
463 " object must store a sequence container at the second (query) position.");
465 if (!tmp_cigar_vector.empty())
467 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
468 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
469 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
471 get<1>(align).reserve(seq_length);
473 auto tmp_iter = std::ranges::begin(seq_stream);
474 std::ranges::advance(tmp_iter, offset_tmp / 2);
478 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
483 for (
size_t i = (seq_length / 2); i > 0; --i)
485 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
486 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
492 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
497 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
501 skip_sequence_bytes();
507 skip_sequence_bytes();
512 using alph_t = std::ranges::range_value_t<decltype(
seq)>;
513 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
515 for (
auto [d1, d2] : seq_stream)
525 std::ranges::next(std::ranges::begin(stream_view));
528 if constexpr (!detail::decays_to_ignore_v<align_type>)
530 assign_unaligned(get<1>(align),
531 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
532 std::ranges::distance(
seq) - soft_clipping_end));
539 read_field(stream_view | detail::take_exactly_or_throw(core.l_seq)
544 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
545 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
546 assert(remaining_bytes >= 0);
547 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
549 while (tags_view.size() > 0)
550 read_field(tags_view, tag_dict);
554 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
559 if (core.l_seq != 0 && offset_tmp == core.l_seq)
561 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
563 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
564 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
565 "stored in the optional field CG. You need to read in the field::tags and "
566 "field::seq in order to access this information.")};
570 auto it = tag_dict.find(
"CG"_tag);
572 if (it == tag_dict.end())
573 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
574 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
575 "stored in the optional field CG but this tag is not present in the given ",
578 auto cigar_view = std::views::all(std::get<std::string>(it->second));
579 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
580 offset_tmp = soft_clipping_end = 0;
581 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
584 if constexpr (!detail::decays_to_ignore_v<align_type>)
586 assign_unaligned(get<1>(align),
587 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
588 std::ranges::distance(
seq) - soft_clipping_end));
595 if constexpr (!detail::decays_to_ignore_v<align_type>)
596 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
598 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
599 std::swap(cigar_vector, tmp_cigar_vector);
603 template <
typename stream_type,
604 typename header_type,
607 typename ref_seq_type,
608 typename ref_id_type,
613 typename tag_dict_type>
616 [[maybe_unused]] header_type && header,
617 [[maybe_unused]] seq_type && seq,
618 [[maybe_unused]] qual_type && qual,
619 [[maybe_unused]] id_type &&
id,
620 [[maybe_unused]] int32_t
const offset,
621 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
622 [[maybe_unused]] ref_id_type && ref_id,
624 [[maybe_unused]] align_type && align,
625 [[maybe_unused]] cigar_type && cigar_vector,
626 [[maybe_unused]]
sam_flag const flag,
627 [[maybe_unused]] uint8_t
const mapq,
628 [[maybe_unused]] mate_type && mate,
629 [[maybe_unused]] tag_dict_type && tag_dict,
630 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
631 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
636 static_assert((std::ranges::forward_range<seq_type> &&
637 alphabet<std::ranges::range_reference_t<seq_type>>),
638 "The seq object must be a std::ranges::forward_range over "
639 "letters that model seqan3::alphabet.");
641 static_assert((std::ranges::forward_range<id_type> &&
642 alphabet<std::ranges::range_reference_t<id_type>>),
643 "The id object must be a std::ranges::forward_range over "
644 "letters that model seqan3::alphabet.");
646 static_assert((std::ranges::forward_range<ref_seq_type> &&
647 alphabet<std::ranges::range_reference_t<ref_seq_type>>),
648 "The ref_seq object must be a std::ranges::forward_range "
649 "over letters that model seqan3::alphabet.");
651 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
653 static_assert((std::ranges::forward_range<ref_id_type> ||
656 "The ref_id object must be a std::ranges::forward_range "
657 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
661 "The align object must be a std::pair of two ranges whose "
662 "value_type is comparable to seqan3::gap");
665 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
666 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
667 "The align object must be a std::pair of two ranges whose "
668 "value_type is comparable to seqan3::gap");
670 static_assert((std::ranges::forward_range<qual_type> &&
671 alphabet<std::ranges::range_reference_t<qual_type>>),
672 "The qual object must be a std::ranges::forward_range "
673 "over letters that model seqan3::alphabet.");
676 "The mate object must be a std::tuple of size 3 with "
677 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
678 "2) a std::integral or std::optional<std::integral>, and "
679 "3) a std::integral.");
681 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
687 "The mate object must be a std::tuple of size 3 with "
688 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
689 "2) a std::integral or std::optional<std::integral>, and "
690 "3) a std::integral.");
693 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
695 if constexpr (detail::decays_to_ignore_v<header_type>)
697 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
698 "You can either construct the output file with ref_ids and ref_seqs information and "
699 "the header will be created for you, or you can access the `header` member directly."};
710 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
715 if (!header_was_written)
719 write_header(os, options, header);
720 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
721 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
725 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
726 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
728 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
730 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
731 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
733 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
736 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
739 header_was_written =
true;
745 int32_t ref_length{};
749 if (!std::ranges::empty(cigar_vector))
751 int32_t dummy_seq_length{};
752 for (
auto & [
count, operation] : cigar_vector)
753 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
755 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
757 ref_length = std::ranges::distance(get<1>(align));
763 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) -
offset};
765 for (
auto chr : get<1>(align))
769 off_end -= ref_length;
770 cigar_vector = detail::get_cigar_vector(align,
offset, off_end);
773 if (cigar_vector.size() >= (1 << 16))
775 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
776 cigar_vector.resize(2);
777 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
778 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_operation};
781 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
788 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
789 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
791 alignment_record_core core
799 static_cast<uint16_t
>(cigar_vector.size()),
801 static_cast<int32_t
>(std::ranges::distance(
seq)),
803 get<1>(
mate).value_or(-1),
807 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
808 [[maybe_unused]]
auto & id_target)
812 if constexpr (!detail::decays_to_ignore_v<id_t>)
814 if constexpr (std::integral<id_t>)
816 id_target = id_source;
818 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
820 id_target = id_source.value_or(-1);
824 if (!std::ranges::empty(id_source))
828 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
829 std::ranges::sized_range<decltype(id_source)> &&
830 std::ranges::borrowed_range<decltype(id_source)>)
840 "The ref_id type is not convertible to the reference id information stored in the "
841 "reference dictionary of the header object.");
843 id_it = header.
ref_dict.find(id_source);
848 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
849 "not be found in BAM header ref_dict: ",
853 id_target = id_it->second;
860 check_and_assign_id_to(
ref_id, core.refID);
863 check_and_assign_id_to(get<0>(
mate), core.next_refID);
866 core.block_size =
sizeof(core) - 4 +
868 core.n_cigar_op * 4 +
869 (core.l_seq + 1) / 2 +
871 tag_dict_binary_str.
size();
873 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
875 if (std::ranges::empty(
id))
878 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
882 for (
auto [cigar_count, op] : cigar_vector)
884 cigar_count = cigar_count << 4;
885 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
886 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
890 using alph_t = std::ranges::range_value_t<seq_type>;
891 constexpr
auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
893 auto sit = std::ranges::begin(
seq);
894 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
899 stream_it =
static_cast<char>(compressed_chr);
903 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
906 if (std::ranges::empty(
qual))
909 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
913 if (std::ranges::distance(
qual) != core.l_seq)
914 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
915 core.l_seq,
". Got quality with size ",
916 std::ranges::distance(
qual),
" instead.")};
919 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
923 stream << tag_dict_binary_str;
928 template <
typename stream_view_type,
typename value_type>
930 stream_view_type && stream_view,
931 value_type const & SEQAN3_DOXYGEN_ONLY(value))
934 read_field(stream_view,
count);
941 read_field(stream_view, tmp);
965 template <
typename stream_view_type>
966 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
975 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
978 tag +=
static_cast<uint16_t
>(*it);
996 read_field(stream_view, tmp);
997 target[tag] =
static_cast<int32_t
>(tmp);
1003 read_field(stream_view, tmp);
1004 target[tag] =
static_cast<int32_t
>(tmp);
1010 read_field(stream_view, tmp);
1011 target[tag] =
static_cast<int32_t
>(tmp);
1017 read_field(stream_view, tmp);
1018 target[tag] =
static_cast<int32_t
>(tmp);
1024 read_field(stream_view, tmp);
1031 read_field(stream_view, tmp);
1032 target[tag] =
static_cast<int32_t
>(tmp);
1038 read_field(stream_view, tmp);
1044 string_buffer.
clear();
1045 while (!is_char<'\0'>(*it))
1051 target[tag] = string_buffer;
1058 while (!is_char<'\0'>(*it))
1060 string_buffer.
clear();
1065 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1069 read_field(string_buffer, value);
1073 target[tag] = byte_array;
1078 char array_value_type_id = *it;
1081 switch (array_value_type_id)
1084 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1087 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1090 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1093 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1096 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1099 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1102 read_sam_dict_vector(target[tag], stream_view,
float{});
1105 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1106 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1111 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1112 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1130 template <
typename cigar_input_type>
1131 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1134 char operation{
'\0'};
1136 int32_t ref_length{}, seq_length{};
1137 uint32_t operation_and_count{};
1138 constexpr
char const * cigar_mapping =
"MIDNSHP=X*******";
1139 constexpr uint32_t cigar_mask = 0x0f;
1141 if (n_cigar_op == 0)
1142 return std::tuple{operations, ref_length, seq_length};
1146 while (n_cigar_op > 0)
1148 std::ranges::copy_n(std::ranges::begin(cigar_input),
1149 sizeof(operation_and_count),
1150 reinterpret_cast<char*
>(&operation_and_count));
1151 operation = cigar_mapping[operation_and_count & cigar_mask];
1152 count = operation_and_count >> 4;
1154 detail::update_alignment_lengths(ref_length, seq_length, operation,
count);
1159 return std::tuple{operations, ref_length, seq_length};
1165 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1169 auto stream_variant_fn = [&result] (
auto && arg)
1174 if constexpr (std::same_as<T, int32_t>)
1177 size_t const absolute_arg = std::abs(arg);
1179 bool const negative = arg < 0;
1180 n = n * n + 2 * negative;
1186 result[result.size() - 1] =
'C';
1187 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1192 result[result.size() - 1] =
'S';
1193 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1198 result[result.size() - 1] =
'c';
1199 int8_t tmp =
static_cast<int8_t
>(arg);
1200 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1205 result[result.size() - 1] =
's';
1206 int16_t tmp =
static_cast<int16_t
>(arg);
1207 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1212 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1217 else if constexpr (std::same_as<T, std::string>)
1219 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1221 else if constexpr (!std::ranges::range<T>)
1223 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1227 int32_t sz{
static_cast<int32_t
>(arg.size())};
1228 result.append(
reinterpret_cast<char *
>(&sz), 4);
1229 result.append(
reinterpret_cast<char const *
>(arg.data()),
1230 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1234 for (
auto & [tag, variant] : tag_dict)
1236 result.push_back(
static_cast<char>(tag / 256));
1237 result.push_back(
static_cast<char>(tag % 256));
1239 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1241 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1242 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
Provides the C++20 <bit> header if it is not already available.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:211
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:264
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=').
Definition: dna16sam.hpp:48
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:332
Provides seqan3::dna16sam.
T emplace_back(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:133
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:199
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:73
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:95
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
Adaptations of concepts from the Ranges TS.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides helper data structures for the seqan3::sam_file_output.
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:23
Exposes the value_type of another type.
Definition: pre.hpp:58
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::views::slice.