51 namespace seqan3::detail
68 format_sam_base() noexcept = default;
69 format_sam_base(format_sam_base const &) noexcept = default;
70 format_sam_base & operator=(format_sam_base const &) noexcept = default;
71 format_sam_base(format_sam_base &&) noexcept = default;
72 format_sam_base & operator=(format_sam_base &&) noexcept = default;
73 ~format_sam_base() noexcept = default;
77 static constexpr
std::array format_version{
'1',
'.',
'6'};
83 bool header_was_written{
false};
86 bool ref_info_present_in_header{
false};
88 template <
typename ref_id_type,
89 typename ref_id_tmp_type,
91 typename ref_seqs_type>
92 void check_and_assign_ref_id(ref_id_type &
ref_id,
93 ref_id_tmp_type & ref_id_tmp,
97 static void update_alignment_lengths(int32_t & ref_length,
99 char const cigar_operation,
100 uint32_t
const cigar_count);
102 template <
typename align_type,
typename ref_seqs_type>
103 void construct_alignment(align_type & align,
105 [[maybe_unused]] int32_t rid,
106 [[maybe_unused]] ref_seqs_type & ref_seqs,
107 [[maybe_unused]] int32_t ref_start,
110 void transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end)
const;
112 template <
typename cigar_input_type>
115 template <
typename stream_view_type>
116 void read_field(stream_view_type && stream_view, detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target));
118 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
119 void read_field(stream_view_type && stream_view, target_range_type & target);
121 template <
typename stream_view_t, arithmetic arithmetic_target_type>
122 void read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
124 template <
typename stream_view_type,
typename optional_value_type>
127 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
128 void read_header(stream_view_type && stream_view,
129 alignment_file_header<ref_ids_type> & hdr,
132 template <
typename stream_t,
typename ref_
ids_type>
133 void write_header(stream_t & stream,
134 alignment_file_output_options
const & options,
135 alignment_file_header<ref_ids_type> & header);
148 template <
typename ref_id_type,
149 typename ref_id_tmp_type,
150 typename header_type,
151 typename ref_seqs_type>
152 inline void format_sam_base::check_and_assign_ref_id(ref_id_type &
ref_id,
153 ref_id_tmp_type & ref_id_tmp,
154 header_type & header,
157 if (!std::ranges::empty(ref_id_tmp))
159 auto search = header.ref_dict.find(ref_id_tmp);
161 if (
search == header.ref_dict.end())
163 if constexpr(detail::decays_to_ignore_v<ref_seqs_type>)
165 if (ref_info_present_in_header)
167 throw format_error{
"Unknown reference id found in record which is not present in the header."};
171 header.ref_ids().push_back(ref_id_tmp);
173 header.ref_dict[header.ref_ids()[pos]] = pos;
179 throw format_error{
"Unknown reference id found in record which is not present in the given ids."};
195 inline void format_sam_base::update_alignment_lengths(int32_t & ref_length,
196 int32_t & seq_length,
197 char const cigar_operation,
198 uint32_t
const cigar_count)
200 switch (cigar_operation)
202 case 'M':
case '=':
case 'X': ref_length += cigar_count, seq_length += cigar_count;
break;
203 case 'D':
case 'N': ref_length += cigar_count;
break;
204 case 'I' : seq_length += cigar_count;
break;
205 case 'S':
case 'H':
case 'P':
break;
206 default:
throw format_error{
"Illegal cigar operation: " +
std::string{cigar_operation}};
215 inline void format_sam_base::transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector,
217 int32_t & sc_end)
const
220 auto soft_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'S'_cigar_op; };
222 auto hard_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'H'_cigar_op; };
224 auto vector_size_at_least = [&] (
size_t const min_size) {
return cigar_vector.
size() >= min_size; };
226 auto cigar_count_at = [&] (
size_t const index) {
return get<0>(cigar_vector[index]); };
229 if (vector_size_at_least(1) && soft_clipping_at(0))
230 sc_begin = cigar_count_at(0);
231 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
232 sc_begin = cigar_count_at(1);
237 auto last_index = cigar_vector.
size() - 1;
238 auto second_last_index = last_index - 1;
240 if (vector_size_at_least(2) && soft_clipping_at(last_index))
241 sc_end = cigar_count_at(last_index);
242 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
243 sc_end = cigar_count_at(second_last_index);
259 template <
typename cigar_input_type>
264 char cigar_operation{};
265 uint32_t cigar_count{};
266 int32_t ref_length{}, seq_length{};
273 while (std::ranges::begin(cigar_view) != std::ranges::end(cigar_view))
277 std::ranges::next(std::ranges::begin(cigar_view));
280 throw format_error{
"Corrupted cigar string encountered"};
282 update_alignment_lengths(ref_length, seq_length, cigar_operation, cigar_count);
283 operations.emplace_back(cigar_count, cigar_op{}.assign_char(cigar_operation));
286 return {operations, ref_length, seq_length};
299 template <
typename align_type,
typename ref_seqs_type>
300 inline void format_sam_base::construct_alignment(align_type & align,
302 [[maybe_unused]] int32_t rid,
303 [[maybe_unused]] ref_seqs_type & ref_seqs,
304 [[maybe_unused]] int32_t ref_start,
307 if (rid > -1 && ref_start > -1 &&
308 !cigar_vector.
empty() &&
309 !std::ranges::empty(get<1>(align)))
311 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
313 assert(static_cast<size_t>(ref_start + ref_length) <=
std::ranges::size(ref_seqs[rid]));
319 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
320 auto dummy_seq =
views::repeat_n(value_type_t<unaligned_t>{}, ref_length)
322 static_assert(
std::same_as<unaligned_t, decltype(dummy_seq)>,
323 "No reference information was given so the type of the first alignment tuple position"
324 "must have an unaligned sequence type of a dummy sequence ("
325 "views::repeat_n(dna5{}, size_t{}) | "
326 "std::views::transform(detail::access_restrictor_fn{}))");
332 detail::alignment_from_cigar(align, cigar_vector);
336 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
343 using unaligned_t = remove_cvref_t<detail::unaligned_seq_t<decltype(get<0>(align))>>;
356 template <
typename stream_view_type>
357 inline void format_sam_base::read_field(stream_view_type && stream_view,
358 detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target))
360 detail::consume(stream_view);
370 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
371 inline void format_sam_base::read_field(stream_view_type && stream_view, target_range_type & target)
373 if (!is_char<'*'>(*std::ranges::begin(stream_view)))
374 std::ranges::copy(stream_view |
views::char_to<value_type_t<target_range_type>>,
375 std::ranges::back_inserter(target));
377 std::ranges::next(std::ranges::begin(stream_view));
390 template <
typename stream_view_t, arithmetic arithmetic_target_type>
391 inline void format_sam_base::read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target)
394 auto [ignore,
end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
398 if (res.ec == std::errc::invalid_argument || res.ptr != end)
399 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '") +
401 "' could not be cast into type " +
402 detail::type_name_as_string<arithmetic_target_type>};
404 if (res.ec == std::errc::result_out_of_range)
406 "' into type " + detail::type_name_as_string<arithmetic_target_type> +
407 " would cause an overflow."};
420 template <
typename stream_view_type,
typename optional_value_type>
423 optional_value_type tmp;
424 read_field(std::forward<stream_view_type>(stream_view), tmp);
444 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
445 inline void format_sam_base::read_header(stream_view_type && stream_view,
446 alignment_file_header<ref_ids_type> & hdr,
449 auto parse_tag_value = [&stream_view,
this] (
auto & value)
452 std::ranges::next(std::ranges::begin(stream_view));
456 while (is_char<'@'>(*std::ranges::begin(stream_view)))
458 std::ranges::next(std::ranges::begin(stream_view));
460 if (is_char<'H'>(*std::ranges::begin(stream_view)))
462 parse_tag_value(hdr.format_version);
465 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
467 std::ranges::next(std::ranges::begin(stream_view));
470 if (is_char<'S'>(*std::ranges::begin(stream_view)))
472 std::ranges::next(std::ranges::begin(stream_view));
474 if (is_char<'O'>(*std::ranges::begin(stream_view)))
476 else if (is_char<'S'>(*std::ranges::begin(stream_view)))
479 throw format_error{
std::string{
"Illegal SAM header tag: S"} +
480 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
482 else if (!is_char<'G'>(*std::ranges::begin(stream_view)))
484 throw format_error{
std::string{
"Illegal SAM header tag in @HG starting with:"} +
485 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
488 parse_tag_value(*who);
490 std::ranges::next(std::ranges::begin(stream_view));
492 else if (is_char<'S'>(*std::ranges::begin(stream_view)))
494 ref_info_present_in_header =
true;
499 std::ranges::next(std::ranges::begin(stream_view));
500 parse_tag_value(get<0>(info));
502 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
504 std::ranges::next(std::ranges::begin(stream_view));
507 std::ranges::next(std::ranges::begin(stream_view));
511 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
513 auto id_it = hdr.ref_dict.find(
id);
515 if (id_it == hdr.ref_dict.end())
516 throw format_error{detail::to_string(
"Unknown reference name '",
id,
"' found in SAM header ",
517 "(header.ref_ids(): ", hdr.ref_ids(),
").")};
519 auto & given_ref_info = hdr.ref_id_info[id_it->second];
521 if (std::get<0>(given_ref_info) != std::get<0>(info))
522 throw format_error{
"Provided reference has unequal length as specified in the header."};
524 hdr.ref_id_info[id_it->second] =
std::move(info);
528 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()),
std::deque>,
529 "The range over reference ids must be of type std::deque such that "
530 "pointers are not invalidated.");
532 hdr.ref_ids().push_back(
id);
533 hdr.ref_id_info.push_back(info);
534 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).
size() - 1]] = (hdr.ref_ids()).
size() - 1;
537 else if (is_char<'R'>(*std::ranges::begin(stream_view)))
541 parse_tag_value(get<0>(tmp));
543 if (is_char<'\t'>(*std::ranges::begin(stream_view)))
545 std::ranges::next(std::ranges::begin(stream_view));
548 std::ranges::next(std::ranges::begin(stream_view));
550 hdr.read_groups.emplace_back(
std::move(tmp));
552 else if (is_char<'P'>(*std::ranges::begin(stream_view)))
554 typename alignment_file_header<ref_ids_type>::program_info_t tmp{};
556 parse_tag_value(tmp.id);
559 while (is_char<'\t'>(*std::ranges::begin(stream_view)))
561 std::ranges::next(std::ranges::begin(stream_view));
564 if (is_char<'P'>(*std::ranges::begin(stream_view)))
566 std::ranges::next(std::ranges::begin(stream_view));
568 if (is_char<'N'>(*std::ranges::begin(stream_view)))
573 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
575 who = &tmp.command_line_call;
577 else if (is_char<'D'>(*std::ranges::begin(stream_view)))
579 who = &tmp.description;
581 else if (!is_char<'V'>(*std::ranges::begin(stream_view)))
583 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
584 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
587 parse_tag_value(*who);
589 std::ranges::next(std::ranges::begin(stream_view));
591 hdr.program_infos.emplace_back(
std::move(tmp));
593 else if (is_char<'C'>(*std::ranges::begin(stream_view)))
596 std::ranges::next(std::ranges::begin(stream_view));
597 std::ranges::next(std::ranges::begin(stream_view));
598 std::ranges::next(std::ranges::begin(stream_view));
600 std::ranges::next(std::ranges::begin(stream_view));
602 hdr.comments.emplace_back(
std::move(tmp));
606 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} +
607 std::string{static_cast<char>(*std::ranges::begin(stream_view))}};
628 template <
typename stream_t,
typename ref_
ids_type>
629 inline void format_sam_base::write_header(stream_t & stream,
630 alignment_file_output_options
const & options,
631 alignment_file_header<ref_ids_type> & header)
639 if (!header.sorting.empty() &&
640 !(header.sorting ==
"unknown" ||
641 header.sorting ==
"unsorted" ||
642 header.sorting ==
"queryname" ||
643 header.sorting ==
"coordinate" ))
644 throw format_error{
"SAM format error: The header.sorting member must be "
645 "one of [unknown, unsorted, queryname, coordinate]."};
647 if (!header.grouping.empty() &&
648 !(header.grouping ==
"none" ||
649 header.grouping ==
"query" ||
650 header.grouping ==
"reference"))
651 throw format_error{
"SAM format error: The header.grouping member must be "
652 "one of [none, query, reference]."};
671 stream <<
"@HD\tVN:";
672 std::ranges::copy(format_version, stream_it);
674 if (!header.sorting.empty())
675 stream <<
"\tSO:" << header.sorting;
677 if (!header.subsorting.empty())
678 stream <<
"\tSS:" << header.subsorting;
680 if (!header.grouping.empty())
681 stream <<
"\tGO:" << header.grouping;
683 detail::write_eol(stream_it, options.add_carriage_return);
686 for (
auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
688 stream <<
"@SQ\tSN:";
690 std::ranges::copy(ref_name, stream_it);
692 stream <<
"\tLN:" << get<0>(ref_info);
694 if (!get<1>(ref_info).
empty())
695 stream <<
"\t" << get<1>(ref_info);
697 detail::write_eol(stream_it, options.add_carriage_return);
701 for (
auto const & read_group : header.read_groups)
704 <<
"\tID:" << get<0>(read_group);
706 if (!get<1>(read_group).
empty())
707 stream <<
"\t" << get<1>(read_group);
709 detail::write_eol(stream_it, options.add_carriage_return);
713 for (
auto const & program : header.program_infos)
716 <<
"\tID:" << program.id;
718 if (!program.name.empty())
719 stream <<
"\tPN:" << program.name;
721 if (!program.command_line_call.empty())
722 stream <<
"\tCL:" << program.command_line_call;
724 if (!program.previous.empty())
725 stream <<
"\tPP:" << program.previous;
727 if (!program.description.empty())
728 stream <<
"\tDS:" << program.description;
730 if (!program.version.empty())
731 stream <<
"\tVN:" << program.version;
733 detail::write_eol(stream_it, options.add_carriage_return);
737 for (
auto const &
comment : header.comments)
740 detail::write_eol(stream_it, options.add_carriage_return);