72 template <
typename stream_type,
73 typename seq_legal_alph_type,
74 typename ref_seqs_type,
75 typename ref_ids_type,
79 typename ref_seq_type,
81 typename ref_offset_type,
88 typename tag_dict_type,
89 typename e_value_type,
90 typename bit_score_type>
93 ref_seqs_type & ref_seqs,
99 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
100 ref_id_type & ref_id,
101 ref_offset_type & ref_offset,
103 cigar_type & cigar_vector,
107 tag_dict_type & tag_dict,
108 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
109 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
111 template <
typename stream_type,
112 typename header_type,
115 typename ref_seq_type,
116 typename ref_id_type,
121 typename tag_dict_type>
124 [[maybe_unused]] header_type && header,
125 [[maybe_unused]] seq_type && seq,
126 [[maybe_unused]] qual_type && qual,
127 [[maybe_unused]] id_type &&
id,
128 [[maybe_unused]] int32_t
const offset,
129 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
130 [[maybe_unused]] ref_id_type && ref_id,
132 [[maybe_unused]] align_type && align,
133 [[maybe_unused]] cigar_type && cigar_vector,
134 [[maybe_unused]]
sam_flag const flag,
135 [[maybe_unused]] uint8_t
const mapq,
136 [[maybe_unused]] mate_type && mate,
137 [[maybe_unused]] tag_dict_type && tag_dict,
138 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
139 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score));
143 bool header_was_read{
false};
149 struct alignment_record_core
154 uint32_t l_read_name:8;
157 uint32_t n_cigar_op:16;
175 ret[
static_cast<index_t
>(
'I')] = 1;
176 ret[
static_cast<index_t
>(
'D')] = 2;
177 ret[
static_cast<index_t
>(
'N')] = 3;
178 ret[
static_cast<index_t
>(
'S')] = 4;
179 ret[
static_cast<index_t
>(
'H')] = 5;
180 ret[
static_cast<index_t
>(
'P')] = 6;
181 ret[
static_cast<index_t
>(
'=')] = 7;
182 ret[
static_cast<index_t
>(
'X')] = 8;
189 static uint16_t reg2bin(int32_t beg, int32_t end)
noexcept
192 if (beg >> 14 == end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
193 if (beg >> 17 == end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
194 if (beg >> 20 == end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
195 if (beg >> 23 == end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
196 if (beg >> 26 == end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
200 using format_sam_base::read_field;
208 template <
typename stream_view_type, std::
integral number_type>
209 void read_field(stream_view_type && stream_view, number_type & target)
211 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(target),
reinterpret_cast<char *
>(&target));
219 template <
typename stream_view_type>
220 void read_field(stream_view_type && stream_view,
float & target)
222 std::ranges::copy_n(std::ranges::begin(stream_view),
sizeof(int32_t),
reinterpret_cast<char *
>(&target));
235 template <
typename stream_view_type,
typename optional_value_type>
238 optional_value_type tmp;
239 read_field(std::forward<stream_view_type>(stream_view), tmp);
243 template <
typename stream_view_type,
typename value_type>
245 stream_view_type && stream_view,
246 value_type
const & SEQAN3_DOXYGEN_ONLY(value));
248 template <
typename stream_view_type>
249 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
251 template <
typename cigar_input_type>
252 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const;
254 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict);
258template <
typename stream_type,
259 typename seq_legal_alph_type,
260 typename ref_seqs_type,
261 typename ref_ids_type,
264 typename offset_type,
265 typename ref_seq_type,
266 typename ref_id_type,
267 typename ref_offset_type,
274 typename tag_dict_type,
275 typename e_value_type,
276 typename bit_score_type>
279 ref_seqs_type & ref_seqs,
284 offset_type & offset,
285 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
286 ref_id_type & ref_id,
287 ref_offset_type & ref_offset,
289 cigar_type & cigar_vector,
293 tag_dict_type & tag_dict,
294 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
295 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
297 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
298 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
299 "The ref_offset must be a specialisation of std::optional.");
301 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
302 "The type of field::mapq must be uint8_t.");
304 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
305 "The type of field::flag must be seqan3::sam_flag.");
307 auto stream_view = seqan3::detail::istreambuf(stream);
310 [[maybe_unused]] int32_t offset_tmp{};
311 [[maybe_unused]] int32_t soft_clipping_end{};
313 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
317 if (!header_was_read)
320 if (!std::ranges::equal(stream_view | detail::take_exactly_or_throw(4),
std::string_view{
"BAM\1"}))
328 read_field(stream_view, l_text);
331 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
333 read_field(stream_view, n_ref);
335 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
337 read_field(stream_view, l_name);
339 string_buffer.
resize(l_name - 1);
340 std::ranges::copy_n(std::ranges::begin(stream_view), l_name - 1, string_buffer.
data());
341 std::ranges::next(std::ranges::begin(stream_view));
343 read_field(stream_view, l_ref);
345 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
350 auto & reference_ids = header.
ref_ids();
354 reference_ids.push_back(string_buffer);
356 header.
ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
361 auto id_it = header.
ref_dict.find(string_buffer);
366 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
367 "' found in BAM file header (header.ref_ids():",
370 else if (id_it->second != ref_idx)
372 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
373 " does not correspond to the position ", id_it->second,
374 " in the header (header.ref_ids():", header.
ref_ids(),
").")};
376 else if (std::get<0>(header.
ref_id_info[id_it->second]) != l_ref)
378 throw format_error{
"Provided reference has unequal length as specified in the header."};
382 header_was_read =
true;
384 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
390 alignment_record_core core;
391 std::ranges::copy(stream_view | detail::take_exactly_or_throw(
sizeof(core)),
reinterpret_cast<char *
>(&core));
393 if (core.refID >=
static_cast<int32_t
>(header.
ref_ids().size()) || core.refID < -1)
395 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
396 "header.ref_ids(), which has size ", header.
ref_ids().size(),
".")};
398 else if (core.refID > -1)
409 if constexpr (!detail::decays_to_ignore_v<mate_type>)
411 if (core.next_refID > -1)
412 get<0>(
mate) = core.next_refID;
414 if (core.next_pos > -1)
415 get<1>(
mate) = core.next_pos;
417 get<2>(
mate) = core.tlen;
422 read_field(stream_view | detail::take_exactly_or_throw(core.l_read_name - 1),
id);
423 std::ranges::next(std::ranges::begin(stream_view));
427 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
429 std::tie(tmp_cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
430 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
435 detail::consume(stream_view | detail::take_exactly_or_throw(core.n_cigar_op * 4));
444 auto seq_stream = stream_view
445 | detail::take_exactly_or_throw(core.l_seq / 2)
452 if constexpr (detail::decays_to_ignore_v<seq_type>)
454 auto skip_sequence_bytes = [&] ()
456 detail::consume(seq_stream);
458 std::ranges::next(std::ranges::begin(stream_view));
461 if constexpr (!detail::decays_to_ignore_v<align_type>)
464 "If you want to read ALIGNMENT but not SEQ, the alignment"
465 " object must store a sequence container at the second (query) position.");
467 if (!tmp_cigar_vector.empty())
469 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
470 using alph_t = std::ranges::range_value_t<decltype(get<1>(align))>;
471 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
473 get<1>(align).reserve(seq_length);
475 auto tmp_iter = std::ranges::begin(seq_stream);
476 std::ranges::advance(tmp_iter, offset_tmp / 2);
480 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
485 for (
size_t i = (seq_length / 2); i > 0; --i)
487 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
488 get<1>(align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
494 get<1>(align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
499 std::ranges::advance(tmp_iter, (soft_clipping_end + !(seq_length & 1)) / 2);
503 skip_sequence_bytes();
509 skip_sequence_bytes();
514 using alph_t = std::ranges::range_value_t<
decltype(
seq)>;
515 constexpr auto from_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
517 for (
auto [d1, d2] : seq_stream)
527 std::ranges::next(std::ranges::begin(stream_view));
530 if constexpr (!detail::decays_to_ignore_v<align_type>)
532 assign_unaligned(get<1>(align),
533 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
534 std::ranges::distance(
seq) - soft_clipping_end));
541 read_field(stream_view | detail::take_exactly_or_throw(core.l_seq)
546 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
547 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
548 assert(remaining_bytes >= 0);
549 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
551 while (tags_view.size() > 0)
552 read_field(tags_view, tag_dict);
556 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
561 if (core.l_seq != 0 && offset_tmp == core.l_seq)
563 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
565 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
566 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
567 "stored in the optional field CG. You need to read in the field::tags and "
568 "field::seq in order to access this information.")};
572 auto it = tag_dict.find(
"CG"_tag);
574 if (it == tag_dict.end())
575 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
576 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
577 "stored in the optional field CG but this tag is not present in the given ",
580 auto cigar_view = std::views::all(std::get<std::string>(it->second));
581 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
582 offset_tmp = soft_clipping_end = 0;
583 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
586 if constexpr (!detail::decays_to_ignore_v<align_type>)
588 assign_unaligned(get<1>(align),
589 seq |
views::slice(
static_cast<std::ranges::range_difference_t<seq_type>
>(offset_tmp),
590 std::ranges::distance(
seq) - soft_clipping_end));
597 if constexpr (!detail::decays_to_ignore_v<align_type>)
598 construct_alignment(align, tmp_cigar_vector, core.refID, ref_seqs, core.pos, ref_length);
600 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
601 std::swap(cigar_vector, tmp_cigar_vector);
605template <
typename stream_type,
606 typename header_type,
609 typename ref_seq_type,
610 typename ref_id_type,
615 typename tag_dict_type>
618 [[maybe_unused]] header_type && header,
619 [[maybe_unused]] seq_type && seq,
620 [[maybe_unused]] qual_type && qual,
621 [[maybe_unused]] id_type &&
id,
622 [[maybe_unused]] int32_t
const offset,
623 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
624 [[maybe_unused]] ref_id_type && ref_id,
626 [[maybe_unused]] align_type && align,
627 [[maybe_unused]] cigar_type && cigar_vector,
628 [[maybe_unused]]
sam_flag const flag,
629 [[maybe_unused]] uint8_t
const mapq,
630 [[maybe_unused]] mate_type && mate,
631 [[maybe_unused]] tag_dict_type && tag_dict,
632 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
633 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
638 static_assert((std::ranges::forward_range<seq_type> &&
640 "The seq object must be a std::ranges::forward_range over "
641 "letters that model seqan3::alphabet.");
643 static_assert((std::ranges::forward_range<id_type> &&
645 "The id object must be a std::ranges::forward_range over "
646 "letters that model seqan3::alphabet.");
648 static_assert((std::ranges::forward_range<ref_seq_type> &&
650 "The ref_seq object must be a std::ranges::forward_range "
651 "over letters that model seqan3::alphabet.");
653 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
655 static_assert((std::ranges::forward_range<ref_id_type> ||
656 std::integral<std::remove_reference_t<ref_id_type>> ||
657 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
658 "The ref_id object must be a std::ranges::forward_range "
659 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
663 "The align object must be a std::pair of two ranges whose "
664 "value_type is comparable to seqan3::gap");
666 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
667 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
668 std::equality_comparable_with<
gap, std::ranges::range_reference_t<
decltype(std::get<1>(align))>>),
669 "The align object must be a std::pair of two ranges whose "
670 "value_type is comparable to seqan3::gap");
672 static_assert((std::ranges::forward_range<qual_type> &&
674 "The qual object must be a std::ranges::forward_range "
675 "over letters that model seqan3::alphabet.");
678 "The mate object must be a std::tuple of size 3 with "
679 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
680 "2) a std::integral or std::optional<std::integral>, and "
681 "3) a std::integral.");
683 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
686 (std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>> ||
688 std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
689 "The mate object must be a std::tuple of size 3 with "
690 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
691 "2) a std::integral or std::optional<std::integral>, and "
692 "3) a std::integral.");
695 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
697 if constexpr (detail::decays_to_ignore_v<header_type>)
699 throw format_error{
"BAM can only be written with a header but you did not provide enough information! "
700 "You can either construct the output file with ref_ids and ref_seqs information and "
701 "the header will be created for you, or you can access the `header` member directly."};
712 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
717 if (!header_was_written)
721 write_header(os, options, header);
722 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
723 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_text), 4, stream_it);
727 int32_t n_ref{
static_cast<int32_t
>(header.
ref_ids().size())};
728 std::ranges::copy_n(
reinterpret_cast<char *
>(&n_ref), 4, stream_it);
730 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
732 int32_t l_name{
static_cast<int32_t
>(header.
ref_ids()[ridx].size()) + 1};
733 std::ranges::copy_n(
reinterpret_cast<char *
>(&l_name), 4, stream_it);
735 std::ranges::copy(header.
ref_ids()[ridx].begin(), header.
ref_ids()[ridx].end(), stream_it);
738 std::ranges::copy_n(
reinterpret_cast<char *
>(&get<0>(header.
ref_id_info[ridx])), 4, stream_it);
741 header_was_written =
true;
747 int32_t ref_length{};
751 if (!std::ranges::empty(cigar_vector))
753 int32_t dummy_seq_length{};
754 for (
auto & [
count, operation] : cigar_vector)
755 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(),
count);
757 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
759 ref_length = std::ranges::distance(get<1>(align));
765 int32_t off_end{
static_cast<int32_t
>(std::ranges::distance(
seq)) -
offset};
767 for (
auto chr : get<1>(align))
771 off_end -= ref_length;
772 cigar_vector = detail::get_cigar_vector(align,
offset, off_end);
775 if (cigar_vector.size() >= (1 << 16))
777 tag_dict[
"CG"_tag] = detail::get_cigar_string(cigar_vector);
778 cigar_vector.resize(2);
779 cigar_vector[0] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(
seq)),
'S'_cigar_operation};
780 cigar_vector[1] =
cigar{
static_cast<uint32_t
>(std::ranges::distance(get<1>(align))),
'N'_cigar_operation};
783 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
790 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(
id), 254) + 1;
791 read_name_size +=
static_cast<uint8_t
>(read_name_size == 1);
793 alignment_record_core core
801 static_cast<uint16_t
>(cigar_vector.size()),
803 static_cast<int32_t
>(std::ranges::distance(
seq)),
805 get<1>(
mate).value_or(-1),
809 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
810 [[maybe_unused]]
auto & id_target)
814 if constexpr (!detail::decays_to_ignore_v<id_t>)
816 if constexpr (std::integral<id_t>)
818 id_target = id_source;
820 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
822 id_target = id_source.value_or(-1);
826 if (!std::ranges::empty(id_source))
830 if constexpr (std::ranges::contiguous_range<
decltype(id_source)> &&
831 std::ranges::sized_range<
decltype(id_source)> &&
832 std::ranges::borrowed_range<
decltype(id_source)>)
842 "The ref_id type is not convertible to the reference id information stored in the "
843 "reference dictionary of the header object.");
845 id_it = header.
ref_dict.find(id_source);
850 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could "
851 "not be found in BAM header ref_dict: ",
855 id_target = id_it->second;
862 check_and_assign_id_to(
ref_id, core.refID);
865 check_and_assign_id_to(get<0>(
mate), core.next_refID);
868 core.block_size =
sizeof(core) - 4 +
870 core.n_cigar_op * 4 +
871 (core.l_seq + 1) / 2 +
873 tag_dict_binary_str.
size();
875 std::ranges::copy_n(
reinterpret_cast<char *
>(&core),
sizeof(core), stream_it);
877 if (std::ranges::empty(
id))
880 std::ranges::copy_n(std::ranges::begin(
id), core.l_read_name - 1, stream_it);
884 for (
auto [cigar_count, op] : cigar_vector)
886 cigar_count = cigar_count << 4;
887 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[op.to_char()]);
888 std::ranges::copy_n(
reinterpret_cast<char *
>(&cigar_count), 4, stream_it);
892 using alph_t = std::ranges::range_value_t<seq_type>;
893 constexpr auto to_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
895 auto sit = std::ranges::begin(
seq);
896 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
901 stream_it =
static_cast<char>(compressed_chr);
905 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
908 if (std::ranges::empty(
qual))
911 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
915 if (std::ranges::distance(
qual) != core.l_seq)
916 throw format_error{detail::to_string(
"Expected quality of same length as sequence with size ",
917 core.l_seq,
". Got quality with size ",
918 std::ranges::distance(
qual),
" instead.")};
921 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
925 stream << tag_dict_binary_str;
930template <
typename stream_view_type,
typename value_type>
932 stream_view_type && stream_view,
933 value_type
const & SEQAN3_DOXYGEN_ONLY(value))
936 read_field(stream_view,
count);
943 read_field(stream_view, tmp);
947 variant = std::move(tmp_vector);
967template <
typename stream_view_type>
968inline void format_bam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
977 uint16_t tag =
static_cast<uint16_t
>(*it) << 8;
980 tag +=
static_cast<uint16_t
>(*it);
998 read_field(stream_view, tmp);
999 target[tag] =
static_cast<int32_t
>(tmp);
1005 read_field(stream_view, tmp);
1006 target[tag] =
static_cast<int32_t
>(tmp);
1012 read_field(stream_view, tmp);
1013 target[tag] =
static_cast<int32_t
>(tmp);
1019 read_field(stream_view, tmp);
1020 target[tag] =
static_cast<int32_t
>(tmp);
1026 read_field(stream_view, tmp);
1027 target[tag] = std::move(tmp);
1033 read_field(stream_view, tmp);
1034 target[tag] =
static_cast<int32_t
>(tmp);
1040 read_field(stream_view, tmp);
1046 string_buffer.
clear();
1047 while (!is_char<'\0'>(*it))
1053 target[tag] = string_buffer;
1060 while (!is_char<'\0'>(*it))
1062 string_buffer.
clear();
1067 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
1071 read_field(string_buffer, value);
1075 target[tag] = byte_array;
1080 char array_value_type_id = *it;
1083 switch (array_value_type_id)
1086 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1089 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1092 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1095 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1098 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1101 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1104 read_sam_dict_vector(target[tag], stream_view,
float{});
1107 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
1108 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
1113 throw format_error{detail::to_string(
"The second character in the numerical id of a "
1114 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
1132template <
typename cigar_input_type>
1133inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op)
const
1136 char operation{
'\0'};
1138 int32_t ref_length{}, seq_length{};
1139 uint32_t operation_and_count{};
1140 constexpr char const * cigar_mapping =
"MIDNSHP=X*******";
1141 constexpr uint32_t cigar_mask = 0x0f;
1143 if (n_cigar_op == 0)
1144 return std::tuple{operations, ref_length, seq_length};
1148 while (n_cigar_op > 0)
1150 std::ranges::copy_n(std::ranges::begin(cigar_input),
1151 sizeof(operation_and_count),
1152 reinterpret_cast<char*
>(&operation_and_count));
1153 operation = cigar_mapping[operation_and_count & cigar_mask];
1154 count = operation_and_count >> 4;
1156 detail::update_alignment_lengths(ref_length, seq_length, operation,
count);
1161 return std::tuple{operations, ref_length, seq_length};
1167inline std::string format_bam::get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
1171 auto stream_variant_fn = [&result] (
auto && arg)
1176 if constexpr (std::same_as<T, int32_t>)
1179 size_t const absolute_arg = std::abs(arg);
1181 bool const negative = arg < 0;
1182 n = n * n + 2 * negative;
1188 result[result.size() - 1] =
'C';
1189 result.append(
reinterpret_cast<char const *
>(&arg), 1);
1194 result[result.size() - 1] =
'S';
1195 result.append(
reinterpret_cast<char const *
>(&arg), 2);
1200 result[result.size() - 1] =
'c';
1201 int8_t tmp =
static_cast<int8_t
>(arg);
1202 result.append(
reinterpret_cast<char const *
>(&tmp), 1);
1207 result[result.size() - 1] =
's';
1208 int16_t tmp =
static_cast<int16_t
>(arg);
1209 result.append(
reinterpret_cast<char const *
>(&tmp), 2);
1214 result.append(
reinterpret_cast<char const *
>(&arg), 4);
1219 else if constexpr (std::same_as<T, std::string>)
1221 result.append(
reinterpret_cast<char const *
>(arg.data()), arg.size() + 1);
1223 else if constexpr (!std::ranges::range<T>)
1225 result.append(
reinterpret_cast<char const *
>(&arg),
sizeof(arg));
1229 int32_t sz{
static_cast<int32_t
>(arg.size())};
1230 result.append(
reinterpret_cast<char *
>(&sz), 4);
1231 result.append(
reinterpret_cast<char const *
>(arg.data()),
1232 arg.size() *
sizeof(std::ranges::range_value_t<T>));
1236 for (
auto & [tag, variant] : tag_dict)
1238 result.push_back(
static_cast<char>(tag / 256));
1239 result.push_back(
static_cast<char>(tag % 256));
1241 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1243 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1244 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
The <bit> header from C++20's standard library.
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:165
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:191
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: cigar.hpp:60
exposition_only::cigar_operation operation
The (extended) cigar operation alphabet of M,D,I,H,N,P,S,X,=.
Definition: cigar.hpp:99
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
Provides seqan3::dna16sam.
Provides seqan3::detail::fast_ostreambuf_iterator.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:155
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr T bit_ceil(T x) noexcept
Calculates the smallest integral power of two that is not smaller than x.
Definition: bit:162
constexpr int countr_zero(T x) noexcept
Returns the number of consecutive 0 bits in the value of x, starting from the least significant bit (...
Definition: bit:228
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: traits.hpp:169
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
Provides seqan3::debug_stream and related types.
The <ranges> header from C++20's standard library.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::views::slice.
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.