33namespace seqan3::detail
50 format_sam_base() =
default;
51 format_sam_base(format_sam_base
const &) =
default;
52 format_sam_base & operator=(format_sam_base
const &) =
default;
53 format_sam_base(format_sam_base &&) =
default;
54 format_sam_base & operator=(format_sam_base &&) =
default;
55 ~format_sam_base() =
default;
60 static constexpr std::array format_version{
'1',
'.',
'6'};
66 bool header_was_written{
false};
69 bool ref_info_present_in_header{
false};
71 template <
typename ref_
id_type,
typename ref_
id_tmp_type,
typename header_type,
typename ref_seqs_type>
72 void check_and_assign_ref_id(ref_id_type & ref_id,
73 ref_id_tmp_type & ref_id_tmp,
77 template <
typename align_type,
typename ref_seqs_type>
78 void construct_alignment(align_type & align,
80 [[maybe_unused]] int32_t rid,
81 [[maybe_unused]] ref_seqs_type & ref_seqs,
82 [[maybe_unused]] int32_t ref_start,
85 void transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end)
const;
87 template <
typename stream_view_t>
88 void read_byte_field(stream_view_t && stream_view,
std::byte & byte_target);
90 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
91 void read_forward_range_field(stream_view_type && stream_view, target_range_type & target);
93 template <
typename stream_view_t, arithmetic arithmetic_target_type>
94 void read_arithmetic_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
96 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
97 void read_header(stream_view_type && stream_view,
98 sam_file_header<ref_ids_type> & hdr,
101 template <
typename stream_t,
typename ref_
ids_type>
103 write_header(stream_t & stream, sam_file_output_options
const & options, sam_file_header<ref_ids_type> & header);
116template <
typename ref_
id_type,
typename ref_
id_tmp_type,
typename header_type,
typename ref_seqs_type>
117inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
118 ref_id_tmp_type & ref_id_tmp,
119 header_type & header,
122 if (!std::ranges::empty(ref_id_tmp))
124 auto search = header.ref_dict.find(ref_id_tmp);
126 if (
search == header.ref_dict.end())
128 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>)
130 if (ref_info_present_in_header)
132 throw format_error{
"Unknown reference id found in record which is not present in the header."};
136 header.ref_ids().push_back(ref_id_tmp);
138 header.ref_dict[header.ref_ids()[pos]] = pos;
144 throw format_error{
"Unknown reference id found in record which is not present in the given ids."};
159inline void format_sam_base::transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector,
161 int32_t & sc_end)
const
164 auto soft_clipping_at = [&](
size_t const index)
166 return cigar_vector[index] ==
'S'_cigar_operation;
169 auto hard_clipping_at = [&](
size_t const index)
171 return cigar_vector[index] ==
'H'_cigar_operation;
174 auto vector_size_at_least = [&](
size_t const min_size)
176 return cigar_vector.
size() >= min_size;
179 auto cigar_count_at = [&](
size_t const index)
181 return get<0>(cigar_vector[index]);
185 if (vector_size_at_least(1) && soft_clipping_at(0))
186 sc_begin = cigar_count_at(0);
187 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
188 sc_begin = cigar_count_at(1);
193 auto last_index = cigar_vector.
size() - 1;
194 auto second_last_index = last_index - 1;
196 if (vector_size_at_least(2) && soft_clipping_at(last_index))
197 sc_end = cigar_count_at(last_index);
198 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
199 sc_end = cigar_count_at(second_last_index);
212template <
typename align_type,
typename ref_seqs_type>
213inline void format_sam_base::construct_alignment(align_type & align,
215 [[maybe_unused]] int32_t rid,
216 [[maybe_unused]] ref_seqs_type & ref_seqs,
217 [[maybe_unused]] int32_t ref_start,
220 if (rid > -1 && ref_start > -1 &&
221 !cigar_vector.
empty() &&
222 !std::ranges::empty(get<1>(align)))
224 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
226 assert(
static_cast<size_t>(ref_start + ref_length) <=
std::ranges::size(ref_seqs[rid]));
228 assign_unaligned(get<0>(align), ref_seqs[rid] |
views::slice(ref_start, ref_start + ref_length));
233 auto dummy_seq =
views::repeat_n(std::ranges::range_value_t<unaligned_t>{}, ref_length)
235 static_assert(std::same_as<unaligned_t,
decltype(dummy_seq)>,
236 "No reference information was given so the type of the first alignment tuple position"
237 "must have an unaligned sequence type of a dummy sequence ("
238 "views::repeat_n(dna5{}, size_t{}) | "
239 "std::views::transform(detail::access_restrictor_fn{}))");
241 assign_unaligned(get<0>(align), dummy_seq);
245 detail::alignment_from_cigar(align, cigar_vector);
249 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
252 assign_unaligned(get<0>(align), ref_seqs[0] |
views::slice(0, 0));
257 assign_unaligned(get<0>(align),
273template <
typename stream_view_t>
274inline void format_sam_base::read_byte_field(stream_view_t && stream_view,
std::byte & byte_target)
282 std::from_chars_result res =
std::from_chars(arithmetic_buffer.begin(), end,
byte, 16);
284 if (res.ec == std::errc::invalid_argument || res.ptr != end)
285 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '")
286 +
std::string(arithmetic_buffer.begin(), end) +
"' could not be cast into type uint8_t."};
288 if (res.ec == std::errc::result_out_of_range)
289 throw format_error{
std::string(
"[CORRUPTED SAM FILE] Casting '") +
std::string(arithmetic_buffer.begin(), end)
290 +
"' into type uint8_t would cause an overflow."};
301template <
typename stream_view_type, std::ranges::forward_range target_range_type>
302inline void format_sam_base::read_forward_range_field(stream_view_type && stream_view, target_range_type & target)
304 using target_range_value_t = std::ranges::range_value_t<target_range_type>;
305 using begin_iterator_t = std::ranges::iterator_t<stream_view_type>;
306 using end_iterator_t = std::ranges::sentinel_t<stream_view_type>;
313 if (
char c = *it; !(++it == std::ranges::end(stream_view) && c ==
'*'))
316 std::ranges::copy(std::ranges::subrange<begin_iterator_t, end_iterator_t>{it, std::ranges::end(stream_view)}
317 | views::char_to<target_range_value_t>,
333template <
typename stream_view_t, arithmetic arithmetic_target_type>
334inline void format_sam_base::read_arithmetic_field(stream_view_t && stream_view,
335 arithmetic_target_type & arithmetic_target)
340 std::from_chars_result res =
std::from_chars(arithmetic_buffer.begin(), end, arithmetic_target);
342 if (res.ec == std::errc::invalid_argument || res.ptr != end)
343 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '")
344 +
std::string(arithmetic_buffer.begin(), end) +
"' could not be cast into type "
345 + detail::type_name_as_string<arithmetic_target_type>};
347 if (res.ec == std::errc::result_out_of_range)
348 throw format_error{
std::string(
"[CORRUPTED SAM FILE] Casting '") +
std::string(arithmetic_buffer.begin(), end)
349 +
"' into type " + detail::type_name_as_string<arithmetic_target_type>
350 +
" would cause an overflow."};
369template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
370inline void format_sam_base::read_header(stream_view_type && stream_view,
371 sam_file_header<ref_ids_type> & hdr,
375 auto end = std::ranges::end(stream_view);
378 auto make_tag = [](uint8_t char1, uint8_t char2)
constexpr
380 return static_cast<uint16_t
>(char1) | (
static_cast<uint16_t
>(char2) << CHAR_BIT);
385 auto parse_and_make_tag = [&]()
391 return make_tag(raw_tag[0], raw_tag[1]);
394 auto take_until_predicate = [&it, &string_buffer](
auto const & predicate)
396 string_buffer.clear();
397 while (!predicate(*it))
399 string_buffer.push_back(*it);
404 auto skip_until_predicate = [&it](
auto const & predicate)
406 while (!predicate(*it))
410 auto copy_next_tag_value_into_buffer = [&]()
412 skip_until_predicate(is_char<':'>);
414 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
426 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
431 read_forward_range_field(string_buffer, value);
434 auto print_cerr_of_unspported_tag = [&it](
char const *
const header_tag,
std::array<char, 2> raw_tag)
436 std::cerr <<
"Unsupported SAM header tag in @" << header_tag <<
": " << raw_tag[0] << raw_tag[1] <<
'\n';
439 while (it != end && is_char<'@'>(*it))
443 switch (parse_and_make_tag())
445 case make_tag(
'H',
'D'):
448 while (is_char<'\t'>(*it))
453 switch (parse_and_make_tag())
455 case make_tag(
'V',
'N'):
460 case make_tag(
'S',
'O'):
465 case make_tag(
'S',
'S'):
470 case make_tag(
'G',
'O'):
477 print_cerr_of_unspported_tag(
"HD", raw_tag);
481 if (header_entry !=
nullptr)
483 copy_next_tag_value_into_buffer();
484 read_forward_range_field(string_buffer, *header_entry);
487 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
491 if (hdr.format_version.empty())
492 throw format_error{
std::string{
"The required VN tag in @HD is missing."}};
497 case make_tag(
'S',
'Q'):
499 ref_info_present_in_header =
true;
500 std::ranges::range_value_t<
decltype(hdr.ref_ids())>
id;
505 while (is_char<'\t'>(*it))
509 switch (parse_and_make_tag())
511 case make_tag(
'S',
'N'):
513 copy_next_tag_value_into_buffer();
514 read_forward_range_field(string_buffer,
id);
517 case make_tag(
'L',
'N'):
519 int32_t sequence_length_tmp{};
520 copy_next_tag_value_into_buffer();
521 read_arithmetic_field(string_buffer, sequence_length_tmp);
522 sequence_length = sequence_length_tmp;
527 parse_and_append_unhandled_tag_to_string(get<1>(info), raw_tag);
534 throw format_error{
std::string{
"The required SN tag in @SQ is missing."}};
535 if (!sequence_length.has_value())
536 throw format_error{
std::string{
"The required LN tag in @SQ is missing."}};
537 if (sequence_length.value() <= 0)
538 throw format_error{
std::string{
"The value of LN in @SQ must be positive."}};
540 get<0>(info) = sequence_length.value();
543 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
545 auto id_it = hdr.ref_dict.
find(
id);
547 if (id_it == hdr.ref_dict.end())
548 throw format_error{detail::to_string(
"Unknown reference name '",
550 "' found in SAM header ",
551 "(header.ref_ids(): ",
555 auto & given_ref_info = hdr.ref_id_info[id_it->second];
557 if (std::get<0>(given_ref_info) != std::get<0>(info))
558 throw format_error{
"Provided and header-based reference length differ."};
560 hdr.ref_id_info[id_it->second] = std::move(info);
564 static_assert(!detail::is_type_specialisation_of_v<
decltype(hdr.ref_ids()),
std::deque>,
565 "The range over reference ids must be of type std::deque such that pointers are not "
568 hdr.ref_ids().push_back(
id);
569 hdr.ref_id_info.push_back(info);
570 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).
size() - 1]] = (hdr.ref_ids()).
size() - 1;
575 case make_tag(
'R',
'G'):
580 while (is_char<'\t'>(*it))
584 switch (parse_and_make_tag())
586 case make_tag(
'I',
'D'):
588 copy_next_tag_value_into_buffer();
589 read_forward_range_field(string_buffer, get<0>(tmp));
594 parse_and_append_unhandled_tag_to_string(get<1>(tmp), raw_tag);
600 if (get<0>(tmp).
empty())
601 throw format_error{
std::string{
"The required ID tag in @RG is missing."}};
603 hdr.read_groups.emplace_back(std::move(tmp));
607 case make_tag(
'P',
'G'):
609 typename sam_file_header<ref_ids_type>::program_info_t tmp{};
612 while (is_char<'\t'>(*it))
617 switch (parse_and_make_tag())
619 case make_tag(
'I',
'D'):
624 case make_tag(
'P',
'N'):
629 case make_tag(
'P',
'P'):
634 case make_tag(
'C',
'L'):
639 case make_tag(
'D',
'S'):
644 case make_tag(
'V',
'N'):
651 print_cerr_of_unspported_tag(
"PG", raw_tag);
655 if (program_info_entry !=
nullptr)
657 copy_next_tag_value_into_buffer();
658 read_forward_range_field(string_buffer, *program_info_entry);
661 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
666 throw format_error{
std::string{
"The required ID tag in @PG is missing."}};
668 hdr.program_infos.emplace_back(std::move(tmp));
672 case make_tag(
'C',
'O'):
676 take_until_predicate(is_char<'\n'>);
677 read_forward_range_field(string_buffer, tmp);
679 hdr.comments.emplace_back(std::move(tmp));
684 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} + *it};
705template <
typename stream_t,
typename ref_
ids_type>
706inline void format_sam_base::write_header(stream_t & stream,
707 sam_file_output_options
const & options,
708 sam_file_header<ref_ids_type> & header)
716 if (!header.sorting.empty()
717 && !(header.sorting ==
"unknown" || header.sorting ==
"unsorted" || header.sorting ==
"queryname"
718 || header.sorting ==
"coordinate"))
719 throw format_error{
"SAM format error: The header.sorting member must be "
720 "one of [unknown, unsorted, queryname, coordinate]."};
722 if (!header.grouping.empty()
723 && !(header.grouping ==
"none" || header.grouping ==
"query" || header.grouping ==
"reference"))
724 throw format_error{
"SAM format error: The header.grouping member must be "
725 "one of [none, query, reference]."};
744 stream <<
"@HD\tVN:";
747 if (!header.sorting.empty())
748 stream <<
"\tSO:" << header.sorting;
750 if (!header.subsorting.empty())
751 stream <<
"\tSS:" << header.subsorting;
753 if (!header.grouping.empty())
754 stream <<
"\tGO:" << header.grouping;
756 detail::write_eol(stream_it, options.add_carriage_return);
759 for (
auto const & [ref_name, ref_info] :
views::zip(header.ref_ids(), header.ref_id_info))
761 stream <<
"@SQ\tSN:";
765 stream <<
"\tLN:" << get<0>(ref_info);
767 if (!get<1>(ref_info).
empty())
768 stream <<
"\t" << get<1>(ref_info);
770 detail::write_eol(stream_it, options.add_carriage_return);
774 for (
auto const & read_group : header.read_groups)
777 <<
"\tID:" << get<0>(read_group);
779 if (!get<1>(read_group).
empty())
780 stream <<
"\t" << get<1>(read_group);
782 detail::write_eol(stream_it, options.add_carriage_return);
786 for (
auto const & program : header.program_infos)
789 <<
"\tID:" << program.id;
791 if (!program.name.empty())
792 stream <<
"\tPN:" << program.name;
794 if (!program.command_line_call.empty())
795 stream <<
"\tCL:" << program.command_line_call;
797 if (!program.previous.empty())
798 stream <<
"\tPP:" << program.previous;
800 if (!program.description.empty())
801 stream <<
"\tDS:" << program.description;
803 if (!program.version.empty())
804 stream <<
"\tVN:" << program.version;
806 detail::write_eol(stream_it, options.add_carriage_return);
810 for (
auto const & comment : header.comments)
813 detail::write_eol(stream_it, options.add_carriage_return);
T back_inserter(T... args)
Provides seqan3::views::char_to.
Provides various utility functions.
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:524
@ comment
Comment field of arbitrary content, usually a string.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:103
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:470
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
constexpr auto zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition: zip.hpp:573
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides various utility functions.
Auxiliary functions for the alignment IO.
Provides seqan3::debug_stream and related types.
Provides seqan3::views::repeat_n.
Provides seqan3::views::slice.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::views::zip.