19 #include <seqan3/alphabet/detail/convert.hpp>
80 template <
typename stream_type,
81 typename seq_legal_alph_type,
82 typename ref_seqs_type,
83 typename ref_ids_type,
87 typename ref_seq_type,
89 typename ref_offset_type,
96 typename tag_dict_type,
97 typename e_value_type,
98 typename bit_score_type>
101 ref_seqs_type & ref_seqs,
106 offset_type & offset,
107 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
108 ref_id_type & ref_id,
109 ref_offset_type & ref_offset,
111 cigar_type & cigar_vector,
115 tag_dict_type & tag_dict,
116 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
117 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
119 template <
typename stream_type,
120 typename header_type,
123 typename ref_seq_type,
124 typename ref_id_type,
129 typename tag_dict_type>
132 [[maybe_unused]] header_type && header,
133 [[maybe_unused]] seq_type &&
seq,
134 [[maybe_unused]] qual_type && qual,
135 [[maybe_unused]] id_type &&
id,
136 [[maybe_unused]] int32_t
const offset,
137 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
138 [[maybe_unused]] ref_id_type && ref_id,
140 [[maybe_unused]] align_type && align,
141 [[maybe_unused]] cigar_type && cigar_vector,
142 [[maybe_unused]]
sam_flag const flag,
143 [[maybe_unused]] uint8_t
const mapq,
144 [[maybe_unused]] mate_type && mate,
145 [[maybe_unused]] tag_dict_type && tag_dict,
146 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
147 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
151 bool header_was_read{
false};
157 struct alignment_record_core
162 uint32_t l_read_name:8;
165 uint32_t n_cigar_op:16;
183 ret[
static_cast<index_t
>(
'I')] = 1;
184 ret[
static_cast<index_t
>(
'D')] = 2;
185 ret[
static_cast<index_t
>(
'N')] = 3;
186 ret[
static_cast<index_t
>(
'S')] = 4;
187 ret[
static_cast<index_t
>(
'H')] = 5;
188 ret[
static_cast<index_t
>(
'P')] = 6;
189 ret[
static_cast<index_t
>(
'=')] = 7;
190 ret[
static_cast<index_t
>(
'X')] = 8;
197 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
200 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
201 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
202 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
203 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
204 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
208 using format_sam_base::read_field;
216 template <
typename stream_view_type, std::
integral number_type>
217 void read_field(stream_view_type && stream_view, number_type & target)
219 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
227 template <
typename stream_view_type>
228 void read_field(stream_view_type && stream_view,
float & target)
230 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
243 template <
typename stream_view_type,
typename optional_value_type>
246 optional_value_type tmp;
247 read_field(std::forward<stream_view_type>(stream_view), tmp);
251 template <
typename stream_view_type,
typename value_type>
253 stream_view_type && stream_view,
254 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
256 template <
typename stream_view_type>
257 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
259 template <
typename cigar_input_type>
260 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
262 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
266 template <
typename stream_type,
267 typename seq_legal_alph_type,
268 typename ref_seqs_type,
269 typename ref_ids_type,
272 typename offset_type,
273 typename ref_seq_type,
274 typename ref_id_type,
275 typename ref_offset_type,
282 typename tag_dict_type,
283 typename e_value_type,
284 typename bit_score_type>
287 ref_seqs_type & ref_seqs,
292 offset_type & offset,
293 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
294 ref_id_type & ref_id,
295 ref_offset_type & ref_offset,
297 cigar_type & cigar_vector,
301 tag_dict_type & tag_dict,
302 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
303 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
305 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
306 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
307 "The ref_offset must be a specialisation of std::optional.");
309 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
310 "The type of field::mapq must be uint8_t.");
312 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
313 "The type of field::flag must be seqan3::sam_flag.");
316 auto stream_view = std::ranges::subrange<decltype(stream_buf_t{stream}), decltype(stream_buf_t{})>
317 {stream_buf_t{stream}, stream_buf_t{}};
320 [[maybe_unused]] int32_t offset_tmp{};
321 [[maybe_unused]] int32_t soft_clipping_end{};
323 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
327 if (!header_was_read)
334 read_field(stream_view, tmp32);
340 read_field(stream_view, n_ref);
342 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
344 read_field(stream_view, tmp32);
346 string_buffer.
resize(tmp32 - 1);
347 std::ranges::copy_n(std::ranges::begin(stream_view), tmp32 - 1, string_buffer.
data());
348 std::ranges::next(std::ranges::begin(stream_view));
350 read_field(stream_view, tmp32);
352 auto id_it = header.
ref_dict.find(string_buffer);
357 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
358 "' found in BAM file header (header.ref_ids():",
361 else if (id_it->second != ref_idx)
363 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
364 " does not correspond to the position ", id_it->second,
365 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
367 else if (std::get<0>(header.
ref_id_info[id_it->second]) != tmp32)
369 throw format_error{
"Provided reference has unequal length as specified in the header."};
373 header_was_read =
true;
375 if (stream_buf_t{stream} == stream_buf_t{})
381 alignment_record_core core;
384 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
386 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
387 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
389 else if (core.refID > -1)
398 ref_offset = core.pos;
400 if constexpr (!detail::decays_to_ignore_v<mate_type>)
402 if (core.next_refID > -1)
403 get<0>(mate) = core.next_refID;
405 if (core.next_pos > -1)
406 get<1>(mate) = core.next_pos;
408 get<2>(mate) = core.tlen;
414 std::ranges::next(std::ranges::begin(stream_view));
418 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
420 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
421 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
435 auto seq_stream = stream_view
443 if constexpr (detail::decays_to_ignore_v<seq_type>)
445 if constexpr (!detail::decays_to_ignore_v<align_type>)
448 "If you want to read ALIGNMENT but not SEQ, the alignment"
449 " object must store a sequence container at the second (query) position.");
451 if (!tmp_cigar_vector.empty())
453 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
454 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
455 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
457 get<1>(align).reserve(seq_length);
459 auto tmp_iter = std::ranges::begin(seq_stream);
460 std::ranges::advance(tmp_iter, offset_tmp / 2);
464 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
469 for (
size_t i = (seq_length / 2); i > 0; --i)
471 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
472 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
478 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
483 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
492 detail::consume(seq_stream);
494 std::ranges::next(std::ranges::begin(stream_view));
499 using alph_t = std::ranges::range_value_t<decltype(
seq)>;
500 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
502 for (
auto [d1, d2] : seq_stream)
512 std::ranges::next(std::ranges::begin(stream_view));
515 if constexpr (!detail::decays_to_ignore_v<align_type>)
517 assign_unaligned(get<1>(align),
518 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
519 std::ranges::distance(
seq) - soft_clipping_end));
531 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
532 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
533 assert(remaining_bytes >= 0);
536 while (tags_view.size() > 0)
537 read_field(tags_view, tag_dict);
541 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
546 if (core.l_seq != 0 && offset_tmp == core.l_seq)
548 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
550 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
551 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
552 "stored in the optional field CG. You need to read in the field::tags and "
553 "field::seq in order to access this information.")};
557 auto it = tag_dict.find(
"CG"_tag);
559 if (it == tag_dict.end())
560 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
561 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
562 "stored in the optional field CG but this tag is not present in the given ",
565 auto cigar_view = std::views::all(std::get<std::string>(it->second));
566 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_cigar(cigar_view);
567 offset_tmp = soft_clipping_end = 0;
568 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
571 if constexpr (!detail::decays_to_ignore_v<align_type>)
573 assign_unaligned(get<1>(align),
574 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
575 std::ranges::distance(
seq) - soft_clipping_end));
582 if constexpr (!detail::decays_to_ignore_v<align_type>)
583 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
585 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
586 std::swap(cigar_vector, tmp_cigar_vector);
590 template <
typename stream_type,
591 typename header_type,
594 typename ref_seq_type,
595 typename ref_id_type,
600 typename tag_dict_type>
603 [[maybe_unused]] header_type && header,
604 [[maybe_unused]] seq_type &&
seq,
605 [[maybe_unused]] qual_type && qual,
606 [[maybe_unused]] id_type &&
id,
607 [[maybe_unused]] int32_t
const offset,
608 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
609 [[maybe_unused]] ref_id_type && ref_id,
611 [[maybe_unused]] align_type && align,
612 [[maybe_unused]] cigar_type && cigar_vector,
613 [[maybe_unused]]
sam_flag const flag,
614 [[maybe_unused]] uint8_t
const mapq,
615 [[maybe_unused]] mate_type && mate,
616 [[maybe_unused]] tag_dict_type && tag_dict,
617 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
618 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
623 static_assert((std::ranges::forward_range<seq_type> &&
624 alphabet<std::ranges::range_reference_t<seq_type>>),
625 "The seq object must be a std::ranges::forward_range over "
626 "letters that model seqan3::alphabet.");
628 static_assert((std::ranges::forward_range<id_type> &&
629 alphabet<std::ranges::range_reference_t<id_type>>),
630 "The id object must be a std::ranges::forward_range over "
631 "letters that model seqan3::alphabet.");
633 static_assert((std::ranges::forward_range<ref_seq_type> &&
634 alphabet<std::ranges::range_reference_t<ref_seq_type>>),
635 "The ref_seq object must be a std::ranges::forward_range "
636 "over letters that model seqan3::alphabet.");
638 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
640 static_assert((std::ranges::forward_range<ref_id_type> ||
643 "The ref_id object must be a std::ranges::forward_range "
644 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
648 "The align object must be a std::pair of two ranges whose "
649 "value_type is comparable to seqan3::gap");
652 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
653 std::equality_comparable_with<
gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
654 "The align object must be a std::pair of two ranges whose "
655 "value_type is comparable to seqan3::gap");
657 static_assert((std::ranges::forward_range<qual_type> &&
658 alphabet<std::ranges::range_reference_t<qual_type>>),
659 "The qual object must be a std::ranges::forward_range "
660 "over letters that model seqan3::alphabet.");
663 "The mate object must be a std::tuple of size 3 with "
664 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
665 "2) a std::integral or std::optional<std::integral>, and "
666 "3) a std::integral.");
668 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
674 "The mate object must be a std::tuple of size 3 with "
675 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
676 "2) a std::integral or std::optional<std::integral>, and "
677 "3) a std::integral.");
680 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
682 if constexpr (detail::decays_to_ignore_v<header_type>)
684 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
685 "You can either construct the output file with ref_ids and ref_seqs information and "
686 "the header will be created for you, or you can access the `header` member directly."};
694 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
695 throw format_error{detail::to_string(
"The ref_offset object must be >= -1 but is: ", ref_offset)};
697 std::cpp20::ostreambuf_iterator stream_it{stream};
702 if (!header_was_written)
706 write_header(os, options, header);
707 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
708 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
712 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
713 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
715 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
717 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
718 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
720 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
723 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
726 header_was_written =
true;
732 int32_t ref_length{};
736 if (!std::ranges::empty(cigar_vector))
738 int32_t dummy_seq_length{};
739 for (
auto & [
count, operation] : cigar_vector)
740 update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
742 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
744 ref_length = std::ranges::distance(get<1>(align));
750 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) - offset};
752 for (
auto chr : get<1>(align))
756 off_end -= ref_length;
757 cigar_vector = detail::get_cigar_vector(align, offset, off_end);
760 if (cigar_vector.size() >= (1 << 16))
762 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
763 cigar_vector.resize(2);
764 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_op};
765 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_op};
768 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
775 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
776 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
778 alignment_record_core core
782 ref_offset.value_or(-1),
785 reg2bin(ref_offset.value_or(-1), ref_length),
786 static_cast<uint16_t
>(cigar_vector.size()),
788 static_cast<int32_t
>(std::ranges::distance(
seq)),
790 get<1>(mate).value_or(-1),
794 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
795 [[maybe_unused]]
auto & id_target)
799 if constexpr (!detail::decays_to_ignore_v<id_t>)
801 if constexpr (std::integral<id_t>)
803 id_target = id_source;
805 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
807 id_target = id_source.value_or(-1);
811 if (!std::ranges::empty(id_source))
815 if constexpr (std::ranges::contiguous_range<decltype(id_source)> &&
816 std::ranges::sized_range<decltype(id_source)> &&
817 std::ranges::borrowed_range<decltype(id_source)>)
827 "The ref_id type is not convertible to the reference id information stored in the "
828 "reference dictionary of the header object.");
830 id_it = header.
ref_dict.find(id_source);
835 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
836 "not be found in BAM header ref_dict: ",
840 id_target = id_it->second;
847 check_and_assign_id_to(ref_id, core.refID);
850 check_and_assign_id_to(get<0>(mate), core.next_refID);
853 core.block_size =
sizeof(core) - 4 +
855 core.n_cigar_op * 4 +
856 (core.l_seq + 1) / 2 +
858 tag_dict_binary_str.
size();
860 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
862 if (std::ranges::empty(
id))
865 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
869 for (
auto [cigar_count, op] : cigar_vector)
871 cigar_count = cigar_count << 4;
872 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
873 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
877 using alph_t = std::ranges::range_value_t<seq_type>;
878 constexpr
auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
880 auto sit = std::ranges::begin(
seq);
881 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
886 stream_it =
static_cast<char>(compressed_chr);
890 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
893 if (std::ranges::empty(qual))
896 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
900 if (std::ranges::distance(qual) != core.l_seq)
901 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
902 core.l_seq,
". Got quality with size ",
903 std::ranges::distance(qual),
" instead.")};
906 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
910 stream << tag_dict_binary_str;
915 template <
typename stream_view_type,
typename value_type>
917 stream_view_type && stream_view,
918 value_type const & SEQAN3_DOXYGEN_ONLY(value))
921 read_field(stream_view,
count);
928 read_field(stream_view, tmp);
952 template <
typename stream_view_type>
953 inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
962 std::ranges::next(std::ranges::begin(stream_view));
964 std::ranges::next(std::ranges::begin(stream_view));
966 std::ranges::next(std::ranges::begin(stream_view));
973 std::ranges::next(std::ranges::begin(stream_view));
980 read_field(stream_view, tmp);
981 target[tag] =
static_cast<int32_t
>(tmp);
987 read_field(stream_view, tmp);
988 target[tag] =
static_cast<int32_t
>(tmp);
994 read_field(stream_view, tmp);
995 target[tag] =
static_cast<int32_t
>(tmp);
1001 read_field(stream_view, tmp);
1002 target[tag] =
static_cast<int32_t
>(tmp);
1008 read_field(stream_view, tmp);
1015 read_field(stream_view, tmp);
1016 target[tag] =
static_cast<int32_t
>(tmp);
1022 read_field(stream_view, tmp);
1028 string_buffer.
clear();
1029 while (!is_char<'\0'>(*std::ranges::begin(stream_view)))
1031 string_buffer.
push_back(*std::ranges::begin(stream_view));
1032 std::ranges::next(std::ranges::begin(stream_view));
1034 std::ranges::next(std::ranges::begin(stream_view));
1035 target[tag] = string_buffer;
1046 std::ranges::next(std::ranges::begin(stream_view));
1048 switch (array_value_type_id)
1051 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1054 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1057 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1060 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1063 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1066 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1069 read_sam_dict_vector(target[tag], stream_view,
float{});
1072 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1073 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1078 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1079 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1097 template <
typename cigar_input_type>
1098 inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1101 char operation{
'\0'};
1103 int32_t ref_length{}, seq_length{};
1104 uint32_t operation_and_count{};
1105 constexpr
char const * cigar_mapping =
"MIDNSHP=X*******";
1106 constexpr uint32_t cigar_mask = 0x0f;
1108 if (n_cigar_op == 0)
1109 return std::tuple{operations, ref_length, seq_length};
1113 while (n_cigar_op > 0)
1115 std::ranges::copy_n(std::ranges::begin(cigar_input),
1116 sizeof(operation_and_count),
1117 reinterpret_cast<char*
>(&operation_and_count));
1118 operation = cigar_mapping[operation_and_count & cigar_mask];
1119 count = operation_and_count >> 4;
1121 update_alignment_lengths(ref_length, seq_length, operation,
count);
1122 operations.emplace_back(
count, cigar_op{}.assign_char(operation));
1126 return std::tuple{operations, ref_length, seq_length};
1132 inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1136 auto stream_variant_fn = [&result] (
auto && arg)
1141 if constexpr (std::same_as<T, int32_t>)
1144 bool negative = arg < 0;
1145 auto n = __builtin_ctz(detail::next_power_of_two(((negative) ? arg * -1 : arg) + 1) >> 1) / 8;
1146 n = n * n + 2 * negative;
1152 result[result.size() - 1] =
'C';
1153 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1158 result[result.size() - 1] =
'S';
1159 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1164 result[result.size() - 1] =
'c';
1165 int8_t tmp =
static_cast<int8_t
>(arg);
1166 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1171 result[result.size() - 1] =
's';
1172 int16_t tmp =
static_cast<int16_t
>(arg);
1173 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1178 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1183 else if constexpr (std::same_as<T, std::string>)
1185 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1187 else if constexpr (!std::ranges::range<T>)
1189 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1193 int32_t sz{
static_cast<int32_t
>(arg.size())};
1194 result.append(
reinterpret_cast<char *
>(&sz), 4);
1195 result.append(
reinterpret_cast<char const *
>(arg.data()),
1196 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1200 for (
auto & [tag, variant] : tag_dict)
1202 result.push_back(
static_cast<char>(tag / 256));
1203 result.push_back(
static_cast<char>(tag % 256));
1205 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1207 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1208 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);