19 #include <seqan3/alphabet/detail/convert.hpp> 74 struct alignment_record_core
79 uint32_t l_read_name:8;
82 uint32_t n_cigar_op:16;
93 class alignment_file_input_format<format_bam> : alignment_file_input_format<format_sam>
97 using format_tag = format_bam;
102 alignment_file_input_format() noexcept = default;
103 alignment_file_input_format(alignment_file_input_format const &) = delete;
106 alignment_file_input_format & operator=(alignment_file_input_format const &) = delete;
107 alignment_file_input_format(alignment_file_input_format &&) noexcept = default;
108 alignment_file_input_format & operator=(alignment_file_input_format &&) noexcept = default;
109 ~alignment_file_input_format() noexcept = default;
113 template <typename stream_type,
114 typename seq_legal_alph_type,
115 typename ref_seqs_type,
116 typename ref_ids_type,
119 typename offset_type,
120 typename ref_seq_type,
121 typename ref_id_type,
122 typename ref_offset_type,
128 typename tag_dict_type,
129 typename e_value_type,
130 typename bit_score_type>
131 void read(stream_type & stream,
132 alignment_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
133 ref_seqs_type & ref_seqs,
134 alignment_file_header<ref_ids_type> & header,
138 offset_type & offset,
139 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
140 ref_id_type & ref_id,
141 ref_offset_type & ref_offset,
146 tag_dict_type & tag_dict,
147 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
148 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
150 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
151 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
152 "The ref_offset must be a specialisation of std::optional.");
155 "The type of field::MAPQ must be uint8_t.");
158 "The type of field::FLAG must be uint8_t.");
162 {stream_buf_t{stream}, stream_buf_t{}};
165 [[maybe_unused]] int32_t offset_tmp{};
166 [[maybe_unused]] int32_t soft_clipping_end{};
168 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
172 if (!header_was_read)
176 throw format_error{
"File is not in BAM format."};
179 read_field(stream_view, tmp32);
188 read_field(stream_view, n_ref);
190 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
192 read_field(stream_view, tmp32);
194 string_buffer.resize(tmp32 - 1);
198 read_field(stream_view, tmp32);
200 auto id_it = header.ref_dict.find(string_buffer);
203 if (id_it == header.ref_dict.end())
205 throw format_error{detail::to_string(
"Unknown reference name '" + string_buffer +
206 "' found in BAM file header (header.ref_ids():",
207 header.ref_ids(),
").")};
209 else if (id_it->second != ref_idx)
211 throw format_error{detail::to_string(
"Reference id '", string_buffer,
"' at position ", ref_idx,
212 " does not correspond to the position ", id_it->second,
213 " in the header (header.ref_ids():", header.ref_ids(),
").")};
215 else if (std::get<0>(header.ref_id_info[id_it->second]) != tmp32)
217 throw format_error{
"Provided reference has unequal length as specified in the header."};
221 header_was_read =
true;
223 if (stream_buf_t{stream} == stream_buf_t{})
229 alignment_record_core core;
232 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1)
234 throw format_error{detail::to_string(
"Reference id index '", core.refID,
"' is not in range of ",
235 "header.ref_ids(), which has size ", header.ref_ids().size(),
".")};
237 else if (core.refID > -1)
246 ref_offset = core.pos;
248 if constexpr (!detail::decays_to_ignore_v<mate_type>)
250 if (core.next_refID > -1)
251 get<0>(mate) = core.next_refID;
253 if (core.next_pos > -1)
254 get<1>(mate) = core.next_pos;
256 get<2>(mate) = core.tlen;
266 if constexpr (!detail::decays_to_ignore_v<align_type>)
268 std::tie(cigar, ref_length, seq_length, offset_tmp, soft_clipping_end) =
269 detail::parse_binary_cigar(stream_view, core.n_cigar_op);
282 auto seq_stream = stream_view
286 return {sam_dna16{}.assign_rank(
std::min(15, static_cast<uint8_t>(c) >> 4)),
287 sam_dna16{}.assign_rank(
std::min(15, static_cast<uint8_t>(c) & 0x0f))};
290 if constexpr (detail::decays_to_ignore_v<seq_type>)
292 if constexpr (!detail::decays_to_ignore_v<align_type>)
295 "If you want to read ALIGNMENT but not SEQ, the alignment" 296 " object must store a sequence container at the second (query) position.");
300 assert(core.l_seq == (seq_length + offset_tmp + soft_clipping_end));
301 using alph_t = value_type_t<decltype(get<1>(
align))>;
302 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
304 get<1>(
align).reserve(seq_length);
311 get<1>(
align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
316 for (
size_t i = (seq_length / 2); i > 0; --i)
318 get<1>(
align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
319 get<1>(
align).push_back(from_dna16[
to_rank(get<1>(*tmp_iter))]);
325 get<1>(
align).push_back(from_dna16[
to_rank(get<0>(*tmp_iter))]);
339 detail::consume(seq_stream);
346 using alph_t = value_type_t<decltype(seq)>;
347 constexpr
auto from_dna16 = detail::convert_through_char_representation<alph_t, sam_dna16>;
349 for (
auto [d1, d2] : seq_stream)
362 if constexpr (!detail::decays_to_ignore_v<align_type>)
378 int32_t remaining_bytes = core.block_size - (
sizeof(alignment_record_core) - 4) -
379 core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
380 assert(remaining_bytes >= 0);
383 while (tags_view.size() > 0)
384 read_field(tags_view, tag_dict);
388 if constexpr (!detail::decays_to_ignore_v<align_type>)
393 if (core.l_seq != 0 && offset_tmp == core.l_seq)
395 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
397 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
398 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
399 "stored in the optional field CG. You need to read in the field::TAGS and " 400 "field::SEQ in order to access this information.")};
404 auto it = tag_dict.find(
"CG"_tag);
406 if (it == tag_dict.end())
407 throw format_error{detail::to_string(
"The cigar string '", offset_tmp,
"S", ref_length,
408 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
409 "stored in the optional field CG but this tag is not present in the given ",
412 auto cigar_view =
std::view::all(std::get<std::string>(it->second));
413 std::tie(cigar, ref_length, seq_length, offset_tmp, soft_clipping_end) = detail::parse_cigar(cigar_view);
422 construct_alignment(align, cigar, core.refID, ref_seqs, core.pos, ref_length);
430 bool header_was_read{
false};
436 using alignment_file_input_format<format_sam>::read_field;
444 template <
typename stream_view_type, std::Integral number_type>
445 void read_field(stream_view_type && stream_view, number_type & target)
456 template <
typename stream_view_type>
457 void read_field(stream_view_type && stream_view,
float & target)
463 template <
typename stream_view_type,
typename value_type>
465 stream_view_type && stream_view,
466 value_type
const & SEQAN3_DOXYGEN_ONLY(value))
469 read_field(stream_view, count);
476 read_field(stream_view, tmp);
480 variant = std::move(tmp_vector);
500 template <
typename stream_view_type>
501 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
528 read_field(stream_view, tmp);
529 target[tag] =
static_cast<int32_t
>(tmp);
535 read_field(stream_view, tmp);
536 target[tag] =
static_cast<int32_t
>(tmp);
542 read_field(stream_view, tmp);
543 target[tag] =
static_cast<int32_t
>(tmp);
549 read_field(stream_view, tmp);
550 target[tag] =
static_cast<int32_t
>(tmp);
556 read_field(stream_view, tmp);
557 target[tag] = std::move(tmp);
563 read_field(stream_view, tmp);
564 target[tag] =
static_cast<int32_t
>(tmp);
570 read_field(stream_view, tmp);
576 string_buffer.clear();
583 target[tag] = string_buffer;
596 switch (array_value_type_id)
600 read_sam_dict_vector(target[tag], stream_view, int8_t{});
605 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
610 read_sam_dict_vector(target[tag], stream_view, int16_t{});
615 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
620 read_sam_dict_vector(target[tag], stream_view, int32_t{});
625 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
630 read_sam_dict_vector(target[tag], stream_view,
float{});
634 throw format_error{detail::to_string(
"The first character in the numerical id of a SAM tag ",
635 "must be one of [cCsSiIf] but '", array_value_type_id,
"' was given.")};
640 throw format_error{detail::to_string(
"The second character in the numerical id of a " 641 "SAM tag must be one of [A,i,Z,H,B,f] but '", type_id,
"' was given.")};
649 class alignment_file_output_format<format_bam> : alignment_file_output_format<format_sam>
653 using format_tag = format_bam;
658 alignment_file_output_format() noexcept = default;
659 alignment_file_output_format(alignment_file_output_format const &) = delete;
662 alignment_file_output_format & operator=(alignment_file_output_format const &) = delete;
663 alignment_file_output_format(alignment_file_output_format &&) noexcept = default;
664 alignment_file_output_format & operator=(alignment_file_output_format &&) noexcept = default;
665 ~alignment_file_output_format() noexcept = default;
669 template <typename stream_type,
670 typename header_type,
673 typename ref_seq_type,
674 typename ref_id_type,
678 typename tag_dict_type>
679 void write([[maybe_unused]] stream_type & stream,
680 [[maybe_unused]] alignment_file_output_options const & options,
681 [[maybe_unused]] header_type && header,
682 [[maybe_unused]] seq_type &&
seq,
683 [[maybe_unused]] qual_type && qual,
684 [[maybe_unused]] id_type &&
id,
685 [[maybe_unused]] int32_t offset,
686 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
687 [[maybe_unused]] ref_id_type && ref_id,
688 [[maybe_unused]]
std::optional<int32_t> ref_offset,
689 [[maybe_unused]] align_type && align,
690 [[maybe_unused]] uint16_t flag,
691 [[maybe_unused]] uint8_t mapq,
692 [[maybe_unused]] mate_type && mate,
693 [[maybe_unused]] tag_dict_type && tag_dict,
694 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(e_value),
695 [[maybe_unused]]
double SEQAN3_DOXYGEN_ONLY(bit_score))
701 Alphabet<reference_t<seq_type>>),
702 "The seq object must be a std::ranges::ForwardRange over " 703 "letters that model seqan3::Alphabet.");
706 Alphabet<reference_t<id_type>>),
707 "The id object must be a std::ranges::ForwardRange over " 708 "letters that model seqan3::Alphabet.");
711 Alphabet<reference_t<ref_seq_type>>),
712 "The ref_seq object must be a std::ranges::ForwardRange " 713 "over letters that model seqan3::Alphabet.");
715 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
719 detail::is_type_specialisation_of_v<remove_cvref_t<ref_id_type>,
std::optional>),
720 "The ref_id object must be a std::ranges::ForwardRange " 721 "over letters that model seqan3::Alphabet or an Integral or a std::optional<Integral>.");
724 static_assert(TupleLike<remove_cvref_t<align_type>>,
725 "The align object must be a std::pair of two ranges whose " 726 "value_type is comparable to seqan3::gap");
731 "The align object must be a std::pair of two ranges whose " 732 "value_type is comparable to seqan3::gap");
735 Alphabet<reference_t<qual_type>>),
736 "The qual object must be a std::ranges::ForwardRange " 737 "over letters that model seqan3::Alphabet.");
739 static_assert(TupleLike<remove_cvref_t<mate_type>>,
740 "The mate object must be a std::tuple of size 3 with " 741 "1) a std::ranges::ForwardRange with a value_type modelling seqan3::Alphabet, " 742 "2) a std::Integral or std::optional<std::Integral>, and " 743 "3) a std::Integral.");
751 "The mate object must be a std::tuple of size 3 with " 752 "1) a std::ranges::ForwardRange with a value_type modelling seqan3::Alphabet, " 753 "2) a std::Integral or std::optional<std::Integral>, and " 754 "3) a std::Integral.");
756 static_assert(
std::Same<remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
757 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
759 if constexpr (detail::decays_to_ignore_v<header_type>)
761 throw format_error{
"BAM can only be written with a header but you did not provide enough information! " 762 "You can either construct the output file with ref_ids and ref_seqs information and " 763 "the header will be created for you, or you can access the `header` member directly."};
771 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
772 throw format_error{detail::to_string(
"The ref_offset object must be >= -1 but is: ", ref_offset)};
783 write_header(os, options, header);
784 int32_t l_text{
static_cast<int32_t
>(os.
str().size())};
789 int32_t n_ref{
static_cast<int32_t
>(header.ref_ids().size())};
792 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
794 int32_t l_name{
static_cast<int32_t
>(header.ref_ids()[ridx].size()) + 1};
797 std::ranges::copy(header.ref_ids()[ridx].begin(), header.ref_ids()[ridx].end(), stream_it);
800 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
803 written_header =
true;
816 for (
auto chr : get<1>(align))
824 if (cigar.
size() > 65535)
826 tag_dict[
"CG"_tag] = detail::get_cigar_string(align, offset, off_end);
832 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
834 alignment_record_core core
838 ref_offset.value_or(-1),
842 static_cast<uint16_t>(cigar.
size()),
846 get<1>(mate).value_or(-1),
850 auto check_and_assign_id_to = [&header] ([[maybe_unused]]
auto & id_source,
851 [[maybe_unused]]
auto & id_target)
855 if constexpr (!detail::decays_to_ignore_v<id_t>)
859 id_target = id_source;
861 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
863 id_target = id_source.value_or(-1);
869 auto id_it = header.ref_dict.find(id_source);
871 if (id_it == header.ref_dict.end())
873 throw format_error{detail::to_string(
"Unknown reference name '", id_source,
"' could " 874 "not be found in BAM header ref_dict: ",
875 header.ref_dict,
".")};
877 id_target = id_it->second;
884 check_and_assign_id_to(ref_id, core.refID);
887 check_and_assign_id_to(get<0>(mate), core.next_refID);
890 core.block_size =
sizeof(core) - 4 +
892 core.n_cigar_op * 4 +
893 (core.l_seq + 1) / 2 +
895 tag_dict_binary_str.
size();
903 for (
auto [cigar_op, cigar_count] : cigar)
905 cigar_count = cigar_count << 4;
906 cigar_count |=
static_cast<int32_t
>(char_to_sam_rank[cigar_op]);
911 using alph_t = value_type_t<seq_type>;
912 constexpr
auto to_dna16 = detail::convert_through_char_representation<sam_dna16, alph_t>;
915 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
920 stream_it =
static_cast<char>(compressed_chr);
924 stream_it =
static_cast<char>(
to_rank(to_dna16[
to_rank(*sit)]) << 4);
940 stream << tag_dict_binary_str;
954 ret[
static_cast<index_t
>(
'I')] = 1;
955 ret[
static_cast<index_t
>(
'D')] = 2;
956 ret[
static_cast<index_t
>(
'N')] = 3;
957 ret[
static_cast<index_t
>(
'S')] = 4;
958 ret[
static_cast<index_t
>(
'H')] = 5;
959 ret[
static_cast<index_t
>(
'P')] = 6;
960 ret[
static_cast<index_t
>(
'=')] = 7;
961 ret[
static_cast<index_t
>(
'X')] = 8;
970 static std::string get_tag_dict_str(sam_tag_dictionary
const & tag_dict)
974 auto stream_variant_fn = [&result] (
auto && arg)
977 using T = remove_cvref_t<decltype(arg)>;
982 bool negative = arg < 0;
983 auto n = __builtin_ctz(detail::next_power_of_two(((negative) ? arg * -1 : arg) + 1) >> 1) / 8;
984 n = n * n + 2 * negative;
990 result[result.size() - 1] =
'C';
991 result.append(reinterpret_cast<char const *>(&arg), 1);
996 result[result.size() - 1] =
'S';
997 result.append(reinterpret_cast<char const *>(&arg), 2);
1002 result[result.size() - 1] =
'c';
1003 int8_t tmp =
static_cast<int8_t
>(arg);
1004 result.append(reinterpret_cast<char const *>(&tmp), 1);
1009 result[result.size() - 1] =
's';
1010 int16_t tmp =
static_cast<int16_t
>(arg);
1011 result.append(reinterpret_cast<char const *>(&tmp), 2);
1016 result.append(reinterpret_cast<char const *>(&arg), 4);
1023 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1);
1027 result.append(reinterpret_cast<char const *>(&arg),
sizeof(arg));
1031 int32_t sz{
static_cast<int32_t
>(arg.size())};
1032 result.append(reinterpret_cast<char *>(&sz), 4);
1033 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() *
sizeof(value_type_t<T>));
1037 for (
auto & [tag, variant] : tag_dict)
1039 result.push_back(static_cast<char>(tag / 256));
1040 result.push_back(static_cast<char>(tag % 256));
1042 result.push_back(detail::sam_tag_type_char[variant.
index()]);
1044 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.
index()]))
1045 result.push_back(detail::sam_tag_type_char_extra[variant.
index()]);
1054 static uint16_t reg2bin(int32_t beg, int32_t
end) noexcept
1057 if (beg >> 14 ==
end >> 14)
return ((1 << 15) - 1) / 7 + (beg >> 14);
1058 if (beg >> 17 ==
end >> 17)
return ((1 << 12) - 1) / 7 + (beg >> 17);
1059 if (beg >> 20 ==
end >> 20)
return ((1 << 9) - 1) / 7 + (beg >> 20);
1060 if (beg >> 23 ==
end >> 23)
return ((1 << 6) - 1) / 7 + (beg >> 23);
1061 if (beg >> 26 ==
end >> 26)
return ((1 << 3) - 1) / 7 + (beg >> 26);
Defines the requirements of a type that allows iteration over its elements by providing a begin itera...
::ranges::equal equal
Alias for ranges::equal. Determines if two sets of elements are the same.
Definition: algorithm:54
Provides concepts for core language types and relations that don't have concepts in C++20 (yet)...
::ranges::distance distance
Alias for ranges::distance. Returns the number of hops from first to last.
Definition: iterator:321
void assign_unaligned(aligned_seq_t &aligned_seq, unaligned_sequence_type &&unaligned_seq)
An implementation of seqan3::AlignedSequence::assign_unaligned_sequence for sequence containers...
Definition: aligned_sequence_concept.hpp:345
::ranges::next next
Alias for ranges::next. Returns the nth successor of the given iterator.
Definition: iterator:331
::ranges::subrange< it_t, sen_t, k > subrange
Create a view from a pair of iterator and sentinel.
Definition: ranges:339
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: concept.hpp:103
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:56
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
Provides seqan3::type_list and auxiliary type traits.
std::remove_cv_t< std::remove_reference_t< t > > remove_cvref_t
Return the input type with const, volatile and references removed (type trait).
Definition: basic.hpp:35
::ranges::copy_n copy_n
Alias for ranges::copy_n. Copies a range of exactly n elements to a new location. ...
Definition: algorithm:49
SeqAn specific customisations in the standard namespace.
constexpr auto all
A view adaptor that behaves like std::view:all, but type erases contiguous ranges.
Definition: view_all.hpp:160
The main SeqAn3 namespace.
Auxiliary for pretty printing of exception messages.
Auxiliary functions for the alignment IO.
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:144
Provides seqan3::alignment_file_output_options.
Provides seqan3::sam_dna16.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides seqan3::TupleLike.
Provides various utility functions.
Provides various utility functions.
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::advance advance
Alias for ranges::advance. Advances the iterator by the given distance.
Definition: iterator:316
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
Requires std::detail::WeaklyEqualityComparableWitht<t1,t2>, but also that t1 and t2, as well as their common_reference_t satisfy std::EqualityComparable.
Provides seqan3::view::slice.
Provides utility functions for bit twiddling.
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:194
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:97
Provides various transformation traits used by the range module.
typename reference< t >::type reference_t
Shortcut for seqan3::reference (TransformationTrait shortcut).
Definition: pre.hpp:77
Specifies requirements of a Range type for which begin returns a type that models std::ForwardIterato...
Static reflection for arbitrary types.
The concept std::Same<T, U> is satisfied if and only if T and U denote the same type.
auto constexpr take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:627
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179
constexpr auto transform
A range adaptor that takes a invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.
The concept Integral is satisfied if and only if T is an integral type.
auto constexpr take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:94