32 namespace seqan3::detail
49 format_sam_base() =
default;
50 format_sam_base(format_sam_base
const &) =
default;
51 format_sam_base & operator=(format_sam_base
const &) =
default;
52 format_sam_base(format_sam_base &&) =
default;
53 format_sam_base & operator=(format_sam_base &&) =
default;
54 ~format_sam_base() =
default;
59 static constexpr
std::array format_version{
'1',
'.',
'6'};
65 bool header_was_written{
false};
68 bool ref_info_present_in_header{
false};
70 template <
typename ref_id_type,
71 typename ref_id_tmp_type,
73 typename ref_seqs_type>
74 void check_and_assign_ref_id(ref_id_type & ref_id,
75 ref_id_tmp_type & ref_id_tmp,
79 template <
typename align_type,
typename ref_seqs_type>
80 void construct_alignment(align_type & align,
82 [[maybe_unused]] int32_t rid,
83 [[maybe_unused]] ref_seqs_type & ref_seqs,
84 [[maybe_unused]] int32_t ref_start,
87 void transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector, int32_t & sc_begin, int32_t & sc_end)
const;
89 template <
typename stream_view_type>
90 void read_field(stream_view_type && stream_view, detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target));
92 template <
typename stream_view_t>
93 void read_field(stream_view_t && stream_view,
std::byte & byte_target);
95 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
96 void read_field(stream_view_type && stream_view, target_range_type & target);
98 template <
typename stream_view_t, arithmetic arithmetic_target_type>
99 void read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target);
101 template <
typename stream_view_type,
typename optional_value_type>
104 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
105 void read_header(stream_view_type && stream_view,
106 sam_file_header<ref_ids_type> & hdr,
109 template <
typename stream_t,
typename ref_
ids_type>
110 void write_header(stream_t & stream,
111 sam_file_output_options
const & options,
112 sam_file_header<ref_ids_type> & header);
125 template <
typename ref_id_type,
126 typename ref_id_tmp_type,
127 typename header_type,
128 typename ref_seqs_type>
129 inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
130 ref_id_tmp_type & ref_id_tmp,
131 header_type & header,
134 if (!std::ranges::empty(ref_id_tmp))
136 auto search = header.ref_dict.find(ref_id_tmp);
138 if (
search == header.ref_dict.end())
140 if constexpr(detail::decays_to_ignore_v<ref_seqs_type>)
142 if (ref_info_present_in_header)
144 throw format_error{
"Unknown reference id found in record which is not present in the header."};
148 header.ref_ids().push_back(ref_id_tmp);
150 header.ref_dict[header.ref_ids()[pos]] = pos;
156 throw format_error{
"Unknown reference id found in record which is not present in the given ids."};
171 inline void format_sam_base::transfer_soft_clipping_to(
std::vector<cigar> const & cigar_vector,
173 int32_t & sc_end)
const
176 auto soft_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'S'_cigar_operation; };
178 auto hard_clipping_at = [&] (
size_t const index) {
return cigar_vector[index] ==
'H'_cigar_operation; };
180 auto vector_size_at_least = [&] (
size_t const min_size) {
return cigar_vector.
size() >= min_size; };
182 auto cigar_count_at = [&] (
size_t const index) {
return get<0>(cigar_vector[index]); };
185 if (vector_size_at_least(1) && soft_clipping_at(0))
186 sc_begin = cigar_count_at(0);
187 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
188 sc_begin = cigar_count_at(1);
193 auto last_index = cigar_vector.
size() - 1;
194 auto second_last_index = last_index - 1;
196 if (vector_size_at_least(2) && soft_clipping_at(last_index))
197 sc_end = cigar_count_at(last_index);
198 else if (vector_size_at_least(3) && hard_clipping_at(last_index) && soft_clipping_at(second_last_index))
199 sc_end = cigar_count_at(second_last_index);
212 template <
typename align_type,
typename ref_seqs_type>
213 inline void format_sam_base::construct_alignment(align_type & align,
215 [[maybe_unused]] int32_t rid,
216 [[maybe_unused]] ref_seqs_type & ref_seqs,
217 [[maybe_unused]] int32_t ref_start,
220 if (rid > -1 && ref_start > -1 &&
221 !cigar_vector.
empty() &&
222 !std::ranges::empty(get<1>(align)))
224 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
226 assert(
static_cast<size_t>(ref_start + ref_length) <=
std::ranges::size(ref_seqs[rid]));
228 assign_unaligned(get<0>(align), ref_seqs[rid] |
views::slice(ref_start, ref_start + ref_length));
233 auto dummy_seq =
views::repeat_n(std::ranges::range_value_t<unaligned_t>{}, ref_length)
235 static_assert(std::same_as<unaligned_t, decltype(dummy_seq)>,
236 "No reference information was given so the type of the first alignment tuple position"
237 "must have an unaligned sequence type of a dummy sequence ("
238 "views::repeat_n(dna5{}, size_t{}) | "
239 "std::views::transform(detail::access_restrictor_fn{}))");
241 assign_unaligned(get<0>(align), dummy_seq);
245 detail::alignment_from_cigar(align, cigar_vector);
249 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
252 assign_unaligned(get<0>(align), ref_seqs[0] |
views::slice(0, 0));
257 assign_unaligned(get<0>(align),
views::repeat_n(std::ranges::range_value_t<unaligned_t>{}, 0)
269 template <
typename stream_view_type>
270 inline void format_sam_base::read_field(stream_view_type && stream_view,
271 detail::ignore_t
const & SEQAN3_DOXYGEN_ONLY(target))
273 detail::consume(stream_view);
285 template <
typename stream_view_t>
286 inline void format_sam_base::read_field(stream_view_t && stream_view,
std::byte & byte_target)
289 auto [ignore,
end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
294 std::from_chars_result res =
std::from_chars(arithmetic_buffer.begin(), end,
byte, 16);
296 if (res.ec == std::errc::invalid_argument || res.ptr != end)
297 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '") +
299 "' could not be cast into type uint8_t."};
301 if (res.ec == std::errc::result_out_of_range)
302 throw format_error{
std::string(
"[CORRUPTED SAM FILE] Casting '") +
std::string(arithmetic_buffer.begin(), end) +
303 "' into type uint8_t would cause an overflow."};
314 template <
typename stream_view_type, std::ranges::forward_range target_range_type>
315 inline void format_sam_base::read_field(stream_view_type && stream_view, target_range_type & target)
317 using target_range_value_t = std::ranges::range_value_t<target_range_type>;
318 using begin_iterator_t = std::ranges::iterator_t<stream_view_type>;
319 using end_iterator_t = std::ranges::sentinel_t<stream_view_type>;
323 if (
auto it = std::ranges::begin(stream_view); it != std::ranges::end(stream_view))
326 if (
char c = *it; !(++it == std::ranges::end(stream_view) && c ==
'*'))
329 std::ranges::copy(std::ranges::subrange<begin_iterator_t, end_iterator_t>{it, std::ranges::end(stream_view)}
330 | views::char_to<target_range_value_t>,
331 std::cpp20::back_inserter(target));
346 template <
typename stream_view_t, arithmetic arithmetic_target_type>
347 inline void format_sam_base::read_field(stream_view_t && stream_view, arithmetic_target_type & arithmetic_target)
350 auto [ignore,
end] = std::ranges::copy(stream_view, arithmetic_buffer.data());
352 std::from_chars_result res =
std::from_chars(arithmetic_buffer.begin(), end, arithmetic_target);
354 if (res.ec == std::errc::invalid_argument || res.ptr != end)
355 throw format_error{
std::string(
"[CORRUPTED SAM FILE] The string '") +
357 "' could not be cast into type " +
358 detail::type_name_as_string<arithmetic_target_type>};
360 if (res.ec == std::errc::result_out_of_range)
361 throw format_error{
std::string(
"[CORRUPTED SAM FILE] Casting '") +
std::string(arithmetic_buffer.begin(), end) +
362 "' into type " + detail::type_name_as_string<arithmetic_target_type> +
363 " would cause an overflow."};
376 template <
typename stream_view_type,
typename optional_value_type>
379 optional_value_type tmp;
380 read_field(std::forward<stream_view_type>(stream_view), tmp);
400 template <
typename stream_view_type,
typename ref_
ids_type,
typename ref_seqs_type>
401 inline void format_sam_base::read_header(stream_view_type && stream_view,
402 sam_file_header<ref_ids_type> & hdr,
406 auto end = std::ranges::end(stream_view);
409 auto make_tag = [] (uint8_t char1, uint8_t char2) constexpr
411 return static_cast<uint16_t
>(char1) | (
static_cast<uint16_t
>(char2) << CHAR_BIT);
416 auto parse_and_make_tag = [&] ()
422 return make_tag(raw_tag[0], raw_tag[1]);
425 auto take_until_predicate = [&it, &string_buffer] (
auto const & predicate)
427 string_buffer.clear();
428 while (!predicate(*it))
430 string_buffer.push_back(*it);
435 auto skip_until_predicate = [&it] (
auto const & predicate)
437 while (!predicate(*it))
441 auto parse_tag_value = [&] (
auto & value)
443 skip_until_predicate(is_char<':'>);
445 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
446 read_field(string_buffer, value);
458 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
463 read_field(string_buffer, value);
466 auto print_cerr_of_unspported_tag = [&it] (
char const *
const header_tag,
std::array<char, 2> raw_tag)
468 std::cerr <<
"Unsupported SAM header tag in @" << header_tag <<
": " << raw_tag[0] << raw_tag[1] <<
'\n';
471 while (it != end && is_char<'@'>(*it))
475 switch (parse_and_make_tag())
477 case make_tag(
'H',
'D'):
480 while (is_char<'\t'>(*it))
485 switch (parse_and_make_tag())
487 case make_tag(
'V',
'N'):
492 case make_tag(
'S',
'O'):
497 case make_tag(
'S',
'S'):
502 case make_tag(
'G',
'O'):
509 print_cerr_of_unspported_tag(
"HD", raw_tag);
513 if (header_entry !=
nullptr)
514 parse_tag_value(*header_entry);
516 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
520 if (hdr.format_version.empty())
521 throw format_error{
std::string{
"The required VN tag in @HD is missing."}};
526 case make_tag(
'S',
'Q'):
528 ref_info_present_in_header =
true;
529 std::ranges::range_value_t<decltype(hdr.ref_ids())>
id;
534 while (is_char<'\t'>(*it))
538 switch (parse_and_make_tag())
540 case make_tag(
'S',
'N'):
545 case make_tag(
'L',
'N'):
547 parse_tag_value(sequence_length);
552 parse_and_append_unhandled_tag_to_string(get<1>(info), raw_tag);
559 throw format_error{
std::string{
"The required SN tag in @SQ is missing."}};
560 if (!sequence_length.has_value())
561 throw format_error{
std::string{
"The required LN tag in @SQ is missing."}};
562 if (sequence_length.value() <= 0)
563 throw format_error{
std::string{
"The value of LN in @SQ must be positive."}};
565 get<0>(info) = sequence_length.value();
568 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
570 auto id_it = hdr.ref_dict.
find(
id);
572 if (id_it == hdr.ref_dict.end())
573 throw format_error{detail::to_string(
"Unknown reference name '",
id,
"' found in SAM header ",
574 "(header.ref_ids(): ", hdr.ref_ids(),
").")};
576 auto & given_ref_info = hdr.ref_id_info[id_it->second];
578 if (std::get<0>(given_ref_info) != std::get<0>(info))
579 throw format_error{
"Provided and header-based reference length differ."};
581 hdr.ref_id_info[id_it->second] =
std::move(info);
585 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()),
std::deque>,
586 "The range over reference ids must be of type std::deque such that pointers are not "
589 hdr.ref_ids().push_back(
id);
590 hdr.ref_id_info.push_back(info);
591 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).
size() - 1]] = (hdr.ref_ids()).
size() - 1;
596 case make_tag(
'R',
'G'):
601 while (is_char<'\t'>(*it))
605 switch (parse_and_make_tag())
607 case make_tag(
'I',
'D'):
609 parse_tag_value(get<0>(tmp));
614 parse_and_append_unhandled_tag_to_string(get<1>(tmp), raw_tag);
620 if (get<0>(tmp).
empty())
621 throw format_error{
std::string{
"The required ID tag in @RG is missing."}};
623 hdr.read_groups.emplace_back(
std::move(tmp));
627 case make_tag(
'P',
'G'):
629 typename sam_file_header<ref_ids_type>::program_info_t tmp{};
632 while (is_char<'\t'>(*it))
637 switch (parse_and_make_tag())
639 case make_tag(
'I',
'D'):
644 case make_tag(
'P',
'N'):
649 case make_tag(
'P',
'P'):
654 case make_tag(
'C',
'L'):
659 case make_tag(
'D',
'S'):
664 case make_tag(
'V',
'N'):
671 print_cerr_of_unspported_tag(
"PG", raw_tag);
675 if (program_info_entry !=
nullptr)
676 parse_tag_value(*program_info_entry);
678 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
683 throw format_error{
std::string{
"The required ID tag in @PG is missing."}};
685 hdr.program_infos.emplace_back(
std::move(tmp));
689 case make_tag(
'C',
'O'):
693 take_until_predicate(is_char<'\n'>);
694 read_field(string_buffer, tmp);
696 hdr.comments.emplace_back(
std::move(tmp));
701 throw format_error{
std::string{
"Illegal SAM header tag starting with:"} + *it};
722 template <
typename stream_t,
typename ref_
ids_type>
723 inline void format_sam_base::write_header(stream_t & stream,
724 sam_file_output_options
const & options,
725 sam_file_header<ref_ids_type> & header)
733 if (!header.sorting.empty() &&
734 !(header.sorting ==
"unknown" ||
735 header.sorting ==
"unsorted" ||
736 header.sorting ==
"queryname" ||
737 header.sorting ==
"coordinate" ))
738 throw format_error{
"SAM format error: The header.sorting member must be "
739 "one of [unknown, unsorted, queryname, coordinate]."};
741 if (!header.grouping.empty() &&
742 !(header.grouping ==
"none" ||
743 header.grouping ==
"query" ||
744 header.grouping ==
"reference"))
745 throw format_error{
"SAM format error: The header.grouping member must be "
746 "one of [none, query, reference]."};
762 std::cpp20::ostreambuf_iterator stream_it{stream};
765 stream <<
"@HD\tVN:";
766 std::ranges::copy(format_version, stream_it);
768 if (!header.sorting.empty())
769 stream <<
"\tSO:" << header.sorting;
771 if (!header.subsorting.empty())
772 stream <<
"\tSS:" << header.subsorting;
774 if (!header.grouping.empty())
775 stream <<
"\tGO:" << header.grouping;
777 detail::write_eol(stream_it, options.add_carriage_return);
780 for (
auto const & [ref_name, ref_info] :
views::zip(header.ref_ids(), header.ref_id_info))
782 stream <<
"@SQ\tSN:";
784 std::ranges::copy(ref_name, stream_it);
786 stream <<
"\tLN:" << get<0>(ref_info);
788 if (!get<1>(ref_info).
empty())
789 stream <<
"\t" << get<1>(ref_info);
791 detail::write_eol(stream_it, options.add_carriage_return);
795 for (
auto const & read_group : header.read_groups)
798 <<
"\tID:" << get<0>(read_group);
800 if (!get<1>(read_group).
empty())
801 stream <<
"\t" << get<1>(read_group);
803 detail::write_eol(stream_it, options.add_carriage_return);
807 for (
auto const & program : header.program_infos)
810 <<
"\tID:" << program.id;
812 if (!program.name.empty())
813 stream <<
"\tPN:" << program.name;
815 if (!program.command_line_call.empty())
816 stream <<
"\tCL:" << program.command_line_call;
818 if (!program.previous.empty())
819 stream <<
"\tPP:" << program.previous;
821 if (!program.description.empty())
822 stream <<
"\tDS:" << program.description;
824 if (!program.version.empty())
825 stream <<
"\tVN:" << program.version;
827 detail::write_eol(stream_it, options.add_carriage_return);
831 for (
auto const & comment : header.comments)
834 detail::write_eol(stream_it, options.add_carriage_return);
Provides seqan3::views::char_to.
Provides various utility functions.
Provides seqan3::debug_stream and related types.
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: concept.hpp:523
@ comment
Comment field of arbitrary content, usually a string.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ id
The identifier, usually a string.
auto search(queries_t &&queries, index_t const &index, configuration_t const &cfg=search_cfg::default_configuration)
Search a query or a range of queries in an index.
Definition: search.hpp:108
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:189
constexpr auto zip
A zip view.
Definition: zip.hpp:29
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:95
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
Provides various utility functions.
Auxiliary functions for the alignment IO.
Adaptations of concepts from the Ranges TS.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::views::repeat_n.
Provides seqan3::views::slice.
Provides seqan3::views::zip.