139 template <
typename stream_type,
140 typename seq_legal_alph_type,
148 qual_type & qualities);
150 template <
typename stream_type,
158 qual_type && qualities);
160 template <
typename stream_type,
161 typename seq_legal_alph_type,
162 typename ref_seqs_type,
163 typename ref_ids_type,
166 typename offset_type,
167 typename ref_seq_type,
168 typename ref_id_type,
169 typename ref_offset_type,
176 typename tag_dict_type,
177 typename e_value_type,
178 typename bit_score_type>
181 ref_seqs_type & ref_seqs,
186 offset_type & offset,
187 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
188 ref_id_type & ref_id,
189 ref_offset_type & ref_offset,
191 cigar_type & cigar_vector,
195 tag_dict_type & tag_dict,
196 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
197 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
199 template <
typename stream_type,
200 typename header_type,
203 typename ref_seq_type,
204 typename ref_id_type,
208 typename tag_dict_type,
209 typename e_value_type,
210 typename bit_score_type>
213 header_type && header,
217 int32_t
const offset,
218 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
219 ref_id_type && ref_id,
226 tag_dict_type && tag_dict,
227 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
228 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
// A value-initialised sam_file_header<> object.
// NOTE(review): presumably the fallback header used when the caller supplies none — confirm against
// the use sites, which are not visible in this listing.
238 sam_file_header<> default_header{};
// Boolean flag that is initialised to false.
// NOTE(review): presumably set once reference information has been seen in the header — confirm,
// the code that writes this flag is not visible in this listing.
241 bool ref_info_present_in_header{
false};
250 template <
typename t>
251 decltype(
auto) default_or(t && v)
const noexcept
253 return std::forward<t>(v);
// Bring the read_field overloads of format_sam_base into this scope so that they stay visible
// alongside the additional read_field overload declared further down.
256 using format_sam_base::read_field;
258 template <
typename stream_view_type,
typename value_type>
260 stream_view_type && stream_view,
263 template <
typename stream_view_type>
265 stream_view_type && stream_view);
// Reads one optional SAM tag field from stream_view and stores the parsed value in target under
// the field's tag.
// The definition throws format_error when the type character of the tag (or the value type
// character of an array tag) is not one of the recognised ones.
267 template <
typename stream_view_type>
268 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
// Writes field_value (any std::ranges::forward_range) to stream_it.
// The definition first tests whether the range is empty; a non-empty range whose elements are
// char is written directly via stream_it.write_range().
// NOTE(review): judging by the name, an empty range is written as '*' — confirm, that branch is
// not visible in this listing.
270 template <
typename stream_it_t, std::ranges::forward_range field_type>
271 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
// Overload of write_range_or_asterisk that accepts a C string (char const *) instead of a range.
// NOTE(review): the body of this overload is not visible in this listing; its handling of an
// empty string should be checked against the range overload.
273 template <
typename stream_it_t>
274 void write_range_or_asterisk(stream_it_t & stream_it,
char const *
const field_value);
// Writes every entry of tag_dict to the output iterator (named stream here, stream_it in the
// definition).
// For each (tag, variant) pair the definition emits separator first, splits the numeric tag into
// two characters (tag / 256 and tag % 256) and emits a type character that is looked up via
// variant.index().
276 template <
typename stream_it_t>
277 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary
const & tag_dict,
char const separator);
281template <
typename stream_type,
282 typename seq_legal_alph_type,
290 qual_type & qualities)
296 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
297 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
300 if constexpr (!detail::decays_to_ignore_v<seq_type>)
301 if (std::ranges::distance(
sequence) == 0)
302 throw parse_error{
"The sequence information must not be empty."};
303 if constexpr (!detail::decays_to_ignore_v<id_type>)
304 if (std::ranges::distance(
id) == 0)
305 throw parse_error{
"The id information must not be empty."};
308 id =
id | detail::take_until_and_consume(
is_space) | views::to<id_type>;
312template <
typename stream_type,
320 qual_type && qualities)
331 default_or(qualities),
348template <
typename stream_type,
349 typename seq_legal_alph_type,
350 typename ref_seqs_type,
351 typename ref_ids_type,
354 typename offset_type,
355 typename ref_seq_type,
356 typename ref_id_type,
357 typename ref_offset_type,
364 typename tag_dict_type,
365 typename e_value_type,
366 typename bit_score_type>
369 ref_seqs_type & ref_seqs,
374 offset_type & offset,
375 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
376 ref_id_type & ref_id,
377 ref_offset_type & ref_offset,
379 cigar_type & cigar_vector,
383 tag_dict_type & tag_dict,
384 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
385 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
387 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
388 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
389 "The ref_offset must be a specialisation of std::optional.");
391 auto stream_view = detail::istreambuf(stream);
392 auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
395 int32_t ref_offset_tmp{};
396 std::ranges::range_value_t<
decltype(header.
ref_ids())> ref_id_tmp{};
397 [[maybe_unused]] int32_t offset_tmp{};
398 [[maybe_unused]] int32_t soft_clipping_end{};
400 [[maybe_unused]] int32_t ref_length{0}, seq_length{0};
404 if (is_char<'@'>(*std::ranges::begin(stream_view)))
406 read_header(stream_view, header, ref_seqs);
408 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
414 read_field(field_view,
id);
416 uint16_t flag_integral{};
417 read_field(field_view, flag_integral);
420 read_field(field_view, ref_id_tmp);
421 check_and_assign_ref_id(
ref_id, ref_id_tmp, header, ref_seqs);
423 read_field(field_view, ref_offset_tmp);
426 if (ref_offset_tmp == -1)
428 else if (ref_offset_tmp > -1)
430 else if (ref_offset_tmp < -1)
431 throw format_error{
"No negative values are allowed for field::ref_offset."};
433 read_field(field_view,
mapq);
437 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
439 if (!is_char<'*'>(*std::ranges::begin(stream_view)))
441 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
442 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
447 std::ranges::next(std::ranges::begin(field_view));
452 detail::consume(field_view);
459 if constexpr (!detail::decays_to_ignore_v<mate_type>)
461 std::ranges::range_value_t<
decltype(header.
ref_ids())> tmp_mate_ref_id{};
462 read_field(field_view, tmp_mate_ref_id);
464 if (tmp_mate_ref_id ==
"=")
466 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
469 check_and_assign_ref_id(get<0>(
mate), ref_id_tmp, header, ref_seqs);
473 check_and_assign_ref_id(get<0>(
mate), tmp_mate_ref_id, header, ref_seqs);
477 read_field(field_view, tmp_pnext);
480 get<1>(
mate) = --tmp_pnext;
481 else if (tmp_pnext < 0)
482 throw format_error{
"No negative values are allowed at the mate mapping position."};
485 read_field(field_view, get<2>(
mate));
489 for (
size_t i = 0; i < 3u; ++i)
491 detail::consume(field_view);
497 if (!is_char<'*'>(*std::ranges::begin(stream_view)))
499 auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
502 if (!is_legal_alph(c))
504 "char_is_valid_for<" +
505 detail::type_name_as_string<seq_legal_alph_type> +
506 "> evaluated to false on " +
507 detail::make_printable(c)};
511 if constexpr (detail::decays_to_ignore_v<seq_type>)
513 if constexpr (!detail::decays_to_ignore_v<align_type>)
516 "If you want to read ALIGNMENT but not SEQ, the alignment"
517 " object must store a sequence container at the second (query) position.");
519 if (!tmp_cigar_vector.empty())
522 auto tmp_iter = std::ranges::begin(seq_stream);
523 std::ranges::advance(tmp_iter, offset_tmp);
525 for (; seq_length > 0; --seq_length)
527 get<1>(align).push_back(std::ranges::range_value_t<
decltype(get<1>(align))>{}.assign_char(*tmp_iter));
531 std::ranges::advance(tmp_iter, soft_clipping_end);
540 detail::consume(seq_stream);
545 read_field(seq_stream,
seq);
547 if constexpr (!detail::decays_to_ignore_v<align_type>)
549 if (!tmp_cigar_vector.empty())
551 assign_unaligned(get<1>(align),
560 std::ranges::next(std::ranges::begin(field_view));
565 auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
566 read_field(stream_view | detail::take_until_or_throw(tab_or_end),
qual);
568 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
570 if (std::ranges::distance(
seq) != 0 && std::ranges::distance(
qual) != 0 &&
571 std::ranges::distance(
seq) != std::ranges::distance(
qual))
573 throw format_error{detail::to_string(
"Sequence length (", std::ranges::distance(
seq),
574 ") and quality length (", std::ranges::distance(
qual),
575 ") must be the same.")};
581 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
583 std::ranges::next(std::ranges::begin(stream_view));
584 read_field(stream_view | detail::take_until_or_throw(tab_or_end), tag_dict);
587 detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>)));
593 if constexpr (!detail::decays_to_ignore_v<align_type>)
595 int32_t ref_idx{(ref_id_tmp.empty()) ? -1 : 0};
597 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
599 if (!ref_id_tmp.empty())
601 assert(header.
ref_dict.count(ref_id_tmp) != 0);
602 ref_idx = header.
ref_dict[ref_id_tmp];
606 construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
609 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
610 std::swap(cigar_vector, tmp_cigar_vector);
614template <
typename stream_type,
615 typename header_type,
618 typename ref_seq_type,
619 typename ref_id_type,
623 typename tag_dict_type,
624 typename e_value_type,
625 typename bit_score_type>
628 header_type && header,
632 int32_t
const offset,
633 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
634 ref_id_type && ref_id,
641 tag_dict_type && tag_dict,
642 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
643 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
661 static_assert((std::ranges::forward_range<seq_type> &&
663 "The seq object must be a std::ranges::forward_range over "
664 "letters that model seqan3::alphabet.");
666 static_assert((std::ranges::forward_range<id_type> &&
668 "The id object must be a std::ranges::forward_range over "
669 "letters that model seqan3::alphabet.");
671 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
673 static_assert((std::ranges::forward_range<ref_id_type> ||
674 std::integral<std::remove_reference_t<ref_id_type>> ||
675 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>),
676 "The ref_id object must be a std::ranges::forward_range "
677 "over letters that model seqan3::alphabet.");
679 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
680 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>,
std::optional>)
681 static_assert(!detail::decays_to_ignore_v<header_type>,
682 "If you give indices as reference id information the header must also be present.");
686 "The align object must be a std::pair of two ranges whose "
687 "value_type is comparable to seqan3::gap");
689 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
690 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
691 std::equality_comparable_with<
gap, std::ranges::range_reference_t<
decltype(std::get<1>(align))>>),
692 "The align object must be a std::pair of two ranges whose "
693 "value_type is comparable to seqan3::gap");
695 static_assert((std::ranges::forward_range<qual_type> &&
697 "The qual object must be a std::ranges::forward_range "
698 "over letters that model seqan3::alphabet.");
701 "The mate object must be a std::tuple of size 3 with "
702 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
703 "2) a std::integral or std::optional<std::integral>, and "
704 "3) a std::integral.");
706 static_assert(((std::ranges::forward_range<decltype(std::get<0>(
mate))> ||
709 (std::integral<std::remove_cvref_t<decltype(std::get<1>(
mate))>> ||
711 std::integral<std::remove_cvref_t<decltype(std::get<2>(
mate))>>),
712 "The mate object must be a std::tuple of size 3 with "
713 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
714 "2) a std::integral or std::optional<std::integral>, and "
715 "3) a std::integral.");
717 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(
mate))>> ||
719 static_assert(!detail::decays_to_ignore_v<header_type>,
720 "If you give indices as mate reference id information the header must also be present.");
723 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
728 if constexpr (!detail::decays_to_ignore_v<header_type> &&
729 !detail::decays_to_ignore_v<ref_id_type> &&
730 !std::integral<std::remove_reference_t<ref_id_type>> &&
731 !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>,
std::optional>)
738 if constexpr (std::ranges::contiguous_range<
decltype(
ref_id)> &&
739 std::ranges::sized_range<
decltype(
ref_id)> &&
740 std::ranges::borrowed_range<
decltype(
ref_id)>)
749 "The ref_id type is not convertible to the reference id information stored in the "
750 "reference dictionary of the header object.");
756 throw format_error{detail::to_string(
"The ref_id '",
ref_id,
"' was not in the list of references:",
762 throw format_error{
"The ref_offset object must be a std::integral >= 0."};
767 if constexpr (!detail::decays_to_ignore_v<header_type>)
771 write_header(stream, options, header);
772 header_was_written =
true;
780 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
781 constexpr char separator{
'\t'};
783 write_range_or_asterisk(stream_it,
id);
784 *stream_it = separator;
786 stream_it.write_number(
static_cast<uint16_t
>(
flag));
787 *stream_it = separator;
789 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
791 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
793 write_range_or_asterisk(stream_it, (header.
ref_ids())[
ref_id]);
795 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>,
std::optional>)
798 write_range_or_asterisk(stream_it, (header.
ref_ids())[
ref_id.value()]);
804 write_range_or_asterisk(stream_it,
ref_id);
812 *stream_it = separator;
815 stream_it.write_number(
ref_offset.value_or(-1) + 1);
816 *stream_it = separator;
818 stream_it.write_number(
static_cast<unsigned>(
mapq));
819 *stream_it = separator;
821 if (!std::ranges::empty(cigar_vector))
823 for (
auto & c : cigar_vector)
824 stream_it.write_range(c.to_string());
826 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
833 for (
auto chr : get<1>(align))
841 write_range_or_asterisk(stream_it, detail::get_cigar_string(align,
offset, off_end));
848 *stream_it = separator;
850 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(
mate))>>)
852 write_range_or_asterisk(stream_it, (header.
ref_ids())[get<0>(
mate)]);
854 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(
mate))>,
std::optional>)
856 if (get<0>(
mate).has_value())
859 write_range_or_asterisk(stream_it, header.
ref_ids()[get<0>(
mate).value_or(0)]);
865 write_range_or_asterisk(stream_it, get<0>(
mate));
868 *stream_it = separator;
870 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(
mate))>,
std::optional>)
873 stream_it.write_number(get<1>(
mate).value_or(-1) + 1);
874 *stream_it = separator;
878 stream_it.write_number(get<1>(
mate));
879 *stream_it = separator;
882 stream_it.write_number(get<2>(
mate));
883 *stream_it = separator;
885 write_range_or_asterisk(stream_it,
seq);
886 *stream_it = separator;
888 write_range_or_asterisk(stream_it,
qual);
890 write_tag_fields(stream_it, tag_dict, separator);
913template <
typename stream_view_type,
typename value_type>
915 stream_view_type && stream_view,
919 while (std::ranges::begin(stream_view) != ranges::end(stream_view))
921 read_field(stream_view | detail::take_until(is_char<','>), value);
924 if (is_char<','>(*std::ranges::begin(stream_view)))
925 std::ranges::next(std::ranges::begin(stream_view));
927 variant = std::move(tmp_vector);
943template <
typename stream_view_type>
945 stream_view_type && stream_view)
950 while (std::ranges::begin(stream_view) != ranges::end(stream_view))
954 read_field(stream_view | detail::take_exactly_or_throw(2), value);
958 throw format_error{
"Hexadecimal tag has an uneven number of digits!"};
964 variant = std::move(tmp_vector);
984template <
typename stream_view_type>
985inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
993 std::ranges::next(std::ranges::begin(stream_view));
995 std::ranges::next(std::ranges::begin(stream_view));
996 std::ranges::next(std::ranges::begin(stream_view));
998 std::ranges::next(std::ranges::begin(stream_view));
999 std::ranges::next(std::ranges::begin(stream_view));
1006 std::ranges::next(std::ranges::begin(stream_view));
1012 read_field(stream_view, tmp);
1019 read_field(stream_view, tmp);
1025 target[tag] = stream_view | views::to<std::string>;
1030 read_sam_byte_vector(target[tag], stream_view);
1036 std::ranges::next(std::ranges::begin(stream_view));
1037 std::ranges::next(std::ranges::begin(stream_view));
1039 switch (array_value_type_id)
1042 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1045 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1048 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1051 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1054 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1057 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1060 read_sam_dict_vector(target[tag], stream_view,
float{});
1063 throw format_error{
std::string(
"The first character in the numerical ") +
1064 "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1070 throw format_error{
std::string(
"The second character in the numerical id of a "
1071 "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id +
"' was given."};
1082template <
typename stream_it_t, std::ranges::forward_range field_type>
1083inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1085 if (std::ranges::empty(field_value))
1091 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>,
char>)
1092 stream_it.write_range(field_value);
1104template <
typename stream_it_t>
1105inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it,
char const *
const field_value)
1117template <
typename stream_it_t>
1118inline void format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary
const & tag_dict,
char const separator)
1120 auto const stream_variant_fn = [&stream_it] (
auto && arg)
1124 if constexpr (std::ranges::input_range<T>)
1126 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>,
char>)
1128 stream_it.write_range(arg);
1130 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>,
std::byte>)
1132 if (!std::ranges::empty(arg))
1134 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1139 stream_it.write_number(std::to_integer<uint8_t>(elem));
1145 if (!std::ranges::empty(arg))
1147 stream_it.write_number(*std::ranges::begin(arg));
1152 stream_it.write_number(elem);
1157 else if constexpr (std::same_as<std::remove_cvref_t<T>,
char>)
1163 stream_it.write_number(arg);
1167 for (
auto & [tag, variant] : tag_dict)
1169 *stream_it = separator;
1171 char const char0 = tag / 256;
1172 char const char1 = tag % 256;
1177 *stream_it = detail::sam_tag_type_char[variant.
index()];
1180 if (detail::sam_tag_type_char_extra[variant.
index()] !=
'\0')
1182 *stream_it = detail::sam_tag_type_char_extra[variant.
index()];
Core alphabet concept and free function/type trait wrappers.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
Provides seqan3::detail::fast_ostreambuf_iterator.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:128
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:388
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
The <ranges> header from C++20's standard library.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommended ("\r\n").
Definition: output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:26
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.