SeqAn3 3.4.0-rc.4
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_sam_base.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
11#pragma once
12
13#include <climits>
14#include <ranges>
15#include <string>
16#include <vector>
17
29
30namespace seqan3::detail
31{
32
41class format_sam_base
42{
43protected:
47 format_sam_base() = default;
48 format_sam_base(format_sam_base const &) = default;
49 format_sam_base & operator=(format_sam_base const &) = default;
50 format_sam_base(format_sam_base &&) = default;
51 format_sam_base & operator=(format_sam_base &&) = default;
52 ~format_sam_base() = default;
53
55
57 static constexpr std::array format_version{'1', '.', '6'};
58
60 std::array<char, 316> arithmetic_buffer{}; // Doubles can be up to 316 characters
61
63 bool header_was_written{false};
64
66 bool ref_info_present_in_header{false};
67
68 template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
69 void check_and_assign_ref_id(ref_id_type & ref_id,
70 ref_id_tmp_type & ref_id_tmp,
71 header_type & header,
72 ref_seqs_type & /*tag*/);
73
74 int32_t soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const;
75
76 template <typename stream_view_type, std::ranges::forward_range target_range_type>
77 void read_forward_range_field(stream_view_type && stream_view, target_range_type & target);
78
79 template <std::ranges::forward_range target_range_type>
80 void read_forward_range_field(std::string_view const str, target_range_type & target);
81
82 template <arithmetic arithmetic_target_type>
83 void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);
84
85 template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
86 void read_header(stream_view_type && stream_view,
87 sam_file_header<ref_ids_type> & hdr,
88 ref_seqs_type & /*ref_id_to_pos_map*/);
89
90 template <typename stream_t, typename header_type>
91 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
92};
93
104template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
105inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
106 ref_id_tmp_type & ref_id_tmp,
107 header_type & header,
108 ref_seqs_type & /*tag*/)
109{
110 if (!std::ranges::empty(ref_id_tmp)) // otherwise the std::optional will not be filled
111 {
112 auto search = header.ref_dict.find(ref_id_tmp);
113
114 if (search == header.ref_dict.end())
115 {
116 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
117 {
118 if (ref_info_present_in_header)
119 {
120 throw format_error{"Unknown reference id found in record which is not present in the header."};
121 }
122 else
123 {
124 header.ref_ids().push_back(ref_id_tmp);
125 auto pos = std::ranges::size(header.ref_ids()) - 1;
126 header.ref_dict[header.ref_ids()[pos]] = pos;
127 ref_id = pos;
128 }
129 }
130 else
131 {
132 throw format_error{"Unknown reference id found in record which is not present in the given ids."};
133 }
134 }
135 else
136 {
137 ref_id = search->second;
138 }
139 }
140}
141
145inline int32_t format_sam_base::soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const
146{
147 int32_t sc_front{};
148
149 // Checks if the given index in the cigar vector is a soft clip.
150 auto soft_clipping_at = [&](size_t const index)
151 {
152 return cigar_vector[index] == 'S'_cigar_operation;
153 };
154 // Checks if the given index in the cigar vector is a hard clip.
155 auto hard_clipping_at = [&](size_t const index)
156 {
157 return cigar_vector[index] == 'H'_cigar_operation;
158 };
159 // Checks if the given cigar vector as at least min_size many elements.
160 auto vector_size_at_least = [&](size_t const min_size)
161 {
162 return cigar_vector.size() >= min_size;
163 };
164 // Returns the cigar count of the ith cigar element in the given cigar vector.
165 auto cigar_count_at = [&](size_t const index)
166 {
167 return get<0>(cigar_vector[index]);
168 };
169
170 // check for soft clipping at the first two positions
171 if (vector_size_at_least(1) && soft_clipping_at(0))
172 sc_front = cigar_count_at(0);
173 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
174 sc_front = cigar_count_at(1);
175
176 return sc_front;
177}
178
186template <typename stream_view_type, std::ranges::forward_range target_range_type>
187inline void format_sam_base::read_forward_range_field(stream_view_type && stream_view, target_range_type & target)
188{
189 using target_range_value_t = std::ranges::range_value_t<target_range_type>;
190 using begin_iterator_t = std::ranges::iterator_t<stream_view_type>;
191 using end_iterator_t = std::ranges::sentinel_t<stream_view_type>;
192
193 // Note that we need to cache the begin iterator since the stream_view is an input range that may be consuming
194 // and in that case might read `past-the-end` on a second call of std::ranges::begin.
195 if (auto it = std::ranges::begin(stream_view); it != std::ranges::end(stream_view))
196 {
197 // Write to target if field does not represent an empty string, denoted as single '*' character.
198 if (char c = *it; !(++it == std::ranges::end(stream_view) && c == '*'))
199 {
200 target.push_back(seqan3::assign_char_to(c, target_range_value_t{}));
201 std::ranges::copy(std::ranges::subrange<begin_iterator_t, end_iterator_t>{it, std::ranges::end(stream_view)}
202 | views::char_to<target_range_value_t>,
203 std::back_inserter(target));
204 }
205 }
206}
207
214template <std::ranges::forward_range target_range_type>
215inline void format_sam_base::read_forward_range_field(std::string_view const str, target_range_type & target)
216{
217 if (str.size() == 1 && str[0] == '*') // '*' denotes empty field
218 return;
219
220 if constexpr (std::assignable_from<target_range_type, std::string_view>)
221 {
222 target = str;
223 }
224 else
225 {
226 target.resize(str.size());
227 for (size_t i = 0; i < str.size(); ++i)
228 target[i] = assign_char_to(str[i], std::ranges::range_value_t<target_range_type>{});
229 }
230}
231
241template <arithmetic arithmetic_target_type>
242inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
243 arithmetic_target_type & arithmetic_target)
244{
245 std::from_chars_result res = std::from_chars(str.begin(), str.end(), arithmetic_target);
246
247 if (res.ec == std::errc::invalid_argument || res.ptr != str.end())
248 throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(str.begin(), str.end())
249 + "' could not be cast into type " + detail::type_name_as_string<arithmetic_target_type>};
250
251 if (res.ec == std::errc::result_out_of_range)
252 throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str.begin(), str.end())
253 + "' into type " + detail::type_name_as_string<arithmetic_target_type>
254 + " would cause an overflow."};
255}
256
278template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
279inline void format_sam_base::read_header(stream_view_type && stream_view,
280 sam_file_header<ref_ids_type> & hdr,
281 ref_seqs_type & /*ref_id_to_pos_map*/)
282{
283 auto it = std::ranges::begin(stream_view);
284 auto end = std::ranges::end(stream_view);
285 std::string string_buffer{};
286
287 auto make_tag = [](uint8_t char1, uint8_t char2) constexpr
288 {
289 return static_cast<uint16_t>(char1) | (static_cast<uint16_t>(char2) << CHAR_BIT);
290 };
291
292 std::array<char, 2> raw_tag{};
293
294 auto parse_and_make_tag = [&]()
295 {
296 raw_tag[0] = *it;
297 ++it;
298 raw_tag[1] = *it;
299 ++it;
300 return make_tag(raw_tag[0], raw_tag[1]);
301 };
302
303 auto take_until_predicate = [&it, &string_buffer](auto const & predicate)
304 {
305 string_buffer.clear();
306 while (!predicate(*it))
307 {
308 string_buffer.push_back(*it);
309 ++it;
310 }
311 };
312
313 auto skip_until_predicate = [&it](auto const & predicate)
314 {
315 while (!predicate(*it))
316 ++it;
317 };
318
319 auto copy_next_tag_value_into_buffer = [&]()
320 {
321 skip_until_predicate(is_char<':'>);
322 ++it; // skip :
323 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
324 };
325
326 // Some tags are not parsed individually. Instead, these are simply copied into a std::string.
327 // Multiple tags must be separated by a `\t`, hence we prepend a tab to the string, except the first time.
328 // Alternatively, we could always append a `\t`, but this would have the side effect that we might need to trim a
329 // trailing tab after parsing all tags via `pop_back()`.
330 // Unfortunately, we do not know when we are parsing the last tag (and in this case just not append a tab),
331 // because even though we can check if the line ends in a `\n`, it is not guaranteed that the last tag of the
332 // line is passed to this lambda. For example, the line might end with a tag that is properly parsed, such as `ID`.
333 auto parse_and_append_unhandled_tag_to_string = [&](std::string & value, std::array<char, 2> raw_tag)
334 {
335 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
336 if (!value.empty())
337 value.push_back('\t');
338 value.push_back(raw_tag[0]);
339 value.push_back(raw_tag[1]);
340 read_forward_range_field(string_buffer, value);
341 };
342
343 while (it != end && is_char<'@'>(*it))
344 {
345 ++it; // skip @
346
347 switch (parse_and_make_tag())
348 {
349 case make_tag('H', 'D'): // HD (header) tag
350 {
351 // All tags can appear in any order, VN is the only required tag
352 while (is_char<'\t'>(*it))
353 {
354 ++it; // skip tab
355 std::string * header_entry{nullptr};
356
357 switch (parse_and_make_tag())
358 {
359 case make_tag('V', 'N'): // parse required VN (version) tag
360 {
361 header_entry = std::addressof(hdr.format_version);
362 break;
363 }
364 case make_tag('S', 'O'): // SO (sorting) tag
365 {
366 header_entry = std::addressof(hdr.sorting);
367 break;
368 }
369 case make_tag('S', 'S'): // SS (sub-order) tag
370 {
371 header_entry = std::addressof(hdr.subsorting);
372 break;
373 }
374 case make_tag('G', 'O'): // GO (grouping) tag
375 {
376 header_entry = std::addressof(hdr.grouping);
377 break;
378 }
379 default: // unknown/user tag
380 {
381 parse_and_append_unhandled_tag_to_string(hdr.user_tags, raw_tag);
382 }
383 }
384
385 if (header_entry != nullptr)
386 {
387 copy_next_tag_value_into_buffer();
388 read_forward_range_field(string_buffer, *header_entry);
389 }
390 }
391 ++it; // skip newline
392
393 if (hdr.format_version.empty())
394 throw format_error{std::string{"The required VN tag in @HD is missing."}};
395
396 break;
397 }
398
399 case make_tag('S', 'Q'): // SQ (sequence dictionary) tag
400 {
401 ref_info_present_in_header = true;
402 std::ranges::range_value_t<decltype(hdr.ref_ids())> id;
403 std::optional<int32_t> sequence_length{};
405
406 // All tags can appear in any order, SN and LN are required tags
407 while (is_char<'\t'>(*it))
408 {
409 ++it; // skip tab
410
411 switch (parse_and_make_tag())
412 {
413 case make_tag('S', 'N'): // parse required SN (sequence name) tag
414 {
415 copy_next_tag_value_into_buffer();
416 read_forward_range_field(string_buffer, id);
417 break;
418 }
419 case make_tag('L', 'N'): // parse required LN (length) tag
420 {
421 int32_t sequence_length_tmp{};
422 copy_next_tag_value_into_buffer();
423 read_arithmetic_field(string_buffer, sequence_length_tmp);
424 sequence_length = sequence_length_tmp;
425 break;
426 }
427 default: // Any other tag
428 {
429 parse_and_append_unhandled_tag_to_string(get<1>(info), raw_tag);
430 }
431 }
432 }
433 ++it; // skip newline
434
435 if (id.empty())
436 throw format_error{std::string{"The required SN tag in @SQ is missing."}};
437 if (!sequence_length.has_value())
438 throw format_error{std::string{"The required LN tag in @SQ is missing."}};
439 if (sequence_length.value() <= 0)
440 throw format_error{std::string{"The value of LN in @SQ must be positive."}};
441
442 get<0>(info) = sequence_length.value();
443 // If reference information was given, the ids exist and we can fill ref_dict directly.
444 // If not, we need to update the ids first and fill the reference dictionary afterwards.
445 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>) // reference information given
446 {
447 auto id_it = hdr.ref_dict.find(id);
448
449 if (id_it == hdr.ref_dict.end())
450 throw format_error{detail::to_string("Unknown reference name '",
451 id,
452 "' found in SAM header ",
453 "(header.ref_ids(): ",
454 hdr.ref_ids(),
455 ").")};
456
457 auto & given_ref_info = hdr.ref_id_info[id_it->second];
458
459 if (std::get<0>(given_ref_info) != std::get<0>(info))
460 throw format_error{"Provided and header-based reference length differ."};
461
462 hdr.ref_id_info[id_it->second] = std::move(info);
463 }
464 else
465 {
466 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()), std::deque>,
467 "The range over reference ids must be of type std::deque such that pointers are not "
468 "invalidated.");
469
470 hdr.ref_ids().push_back(id);
471 hdr.ref_id_info.push_back(info);
472 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).size() - 1]] = (hdr.ref_ids()).size() - 1;
473 }
474 break;
475 }
476
477 case make_tag('R', 'G'): // RG (read group) tag
478 {
480
481 // All tags can appear in any order, SN and LN are required tags
482 while (is_char<'\t'>(*it))
483 {
484 ++it; // skip tab
485
486 switch (parse_and_make_tag())
487 {
488 case make_tag('I', 'D'): // parse required ID tag
489 {
490 copy_next_tag_value_into_buffer();
491 read_forward_range_field(string_buffer, get<0>(tmp));
492 break;
493 }
494 default: // Any other tag
495 {
496 parse_and_append_unhandled_tag_to_string(get<1>(tmp), raw_tag);
497 }
498 }
499 }
500 ++it; // skip newline
501
502 if (get<0>(tmp).empty())
503 throw format_error{std::string{"The required ID tag in @RG is missing."}};
504
505 hdr.read_groups.emplace_back(std::move(tmp));
506 break;
507 }
508
509 case make_tag('P', 'G'): // PG (program) tag
510 {
512
513 // All tags can appear in any order, ID is the only required tag
514 while (is_char<'\t'>(*it))
515 {
516 ++it; // skip tab
517 std::string * program_info_entry{nullptr};
518
519 switch (parse_and_make_tag())
520 {
521 case make_tag('I', 'D'): // read required ID tag
522 {
523 program_info_entry = std::addressof(tmp.id);
524 break;
525 }
526 case make_tag('P', 'N'): // PN (program name) tag
527 {
528 program_info_entry = std::addressof(tmp.name);
529 break;
530 }
531 case make_tag('P', 'P'): // PP (previous program) tag
532 {
533 program_info_entry = std::addressof(tmp.previous);
534 break;
535 }
536 case make_tag('C', 'L'): // CL (command line) tag
537 {
538 program_info_entry = std::addressof(tmp.command_line_call);
539 break;
540 }
541 case make_tag('D', 'S'): // DS (description) tag
542 {
543 program_info_entry = std::addressof(tmp.description);
544 break;
545 }
546 case make_tag('V', 'N'): // VN (version) tag
547 {
548 program_info_entry = std::addressof(tmp.version);
549 break;
550 }
551 default: // unsupported header tag
552 {
553 parse_and_append_unhandled_tag_to_string(tmp.user_tags, raw_tag);
554 }
555 }
556
557 if (program_info_entry != nullptr)
558 {
559 copy_next_tag_value_into_buffer();
560 read_forward_range_field(string_buffer, *program_info_entry);
561 }
562 }
563 ++it; // skip newline
564
565 if (tmp.id.empty())
566 throw format_error{std::string{"The required ID tag in @PG is missing."}};
567
568 hdr.program_infos.emplace_back(std::move(tmp));
569 break;
570 }
571
572 case make_tag('C', 'O'): // CO (comment) tag
573 {
574 ++it; // skip tab
575 std::string tmp;
576 take_until_predicate(is_char<'\n'>);
577 read_forward_range_field(string_buffer, tmp);
578 ++it; // skip newline
579 hdr.comments.emplace_back(std::move(tmp));
580 break;
581 }
582
583 default:
584 throw format_error{std::string{"Illegal SAM header tag starting with:"} + *it};
585 }
586 }
587}
588
605template <typename stream_t, typename header_type>
606inline void
607format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
608{
609 if constexpr (!detail::decays_to_ignore_v<header_type>)
610 {
611 // -----------------------------------------------------------------
612 // Check Header
613 // -----------------------------------------------------------------
614
615 // (@HD) Check header line
616 // The format version string will be taken from the local member variable
617 if (!header.sorting.empty()
618 && !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
619 || header.sorting == "coordinate"))
620 throw format_error{"SAM format error: The header.sorting member must be "
621 "one of [unknown, unsorted, queryname, coordinate]."};
622
623 if (!header.grouping.empty()
624 && !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
625 throw format_error{"SAM format error: The header.grouping member must be "
626 "one of [none, query, reference]."};
627
628 // (@SQ) Check Reference Sequence Dictionary lines
629
630 // TODO
631
632 // - sorting order be one of ...
633 // - grouping can be one of ...
634 // - reference names must be unique
635 // - ids of read groups must be unique
636 // - program ids need to be unique
637 // many more small semantic things, like fits REGEX
638
639 // -----------------------------------------------------------------
640 // Write Header
641 // -----------------------------------------------------------------
642 std::ostreambuf_iterator stream_it{stream};
643
644 // (@HD) Write header line [required].
645 stream << "@HD\tVN:";
646 std::ranges::copy(format_version, stream_it);
647
648 if (!header.sorting.empty())
649 stream << "\tSO:" << header.sorting;
650
651 if (!header.subsorting.empty())
652 stream << "\tSS:" << header.subsorting;
653
654 if (!header.grouping.empty())
655 stream << "\tGO:" << header.grouping;
656
657 if (!header.user_tags.empty())
658 stream << '\t' << header.user_tags;
659
660 detail::write_eol(stream_it, options.add_carriage_return);
661
662 // (@SQ) Write Reference Sequence Dictionary lines [required].
663 for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
664 {
665 stream << "@SQ\tSN:";
666
667 std::ranges::copy(ref_name, stream_it);
668
669 stream << "\tLN:" << get<0>(ref_info);
670
671 if (!get<1>(ref_info).empty())
672 stream << "\t" << get<1>(ref_info);
673
674 detail::write_eol(stream_it, options.add_carriage_return);
675 }
676
677 // Write read group (@RG) lines if specified.
678 for (auto const & read_group : header.read_groups)
679 {
680 stream << "@RG"
681 << "\tID:" << get<0>(read_group);
682
683 if (!get<1>(read_group).empty())
684 stream << "\t" << get<1>(read_group);
685
686 detail::write_eol(stream_it, options.add_carriage_return);
687 }
688
689 // Write program (@PG) lines if specified.
690 for (auto const & program : header.program_infos)
691 {
692 stream << "@PG"
693 << "\tID:" << program.id;
694
695 if (!program.name.empty())
696 stream << "\tPN:" << program.name;
697
698 if (!program.command_line_call.empty())
699 stream << "\tCL:" << program.command_line_call;
700
701 if (!program.previous.empty())
702 stream << "\tPP:" << program.previous;
703
704 if (!program.description.empty())
705 stream << "\tDS:" << program.description;
706
707 if (!program.version.empty())
708 stream << "\tVN:" << program.version;
709
710 if (!program.user_tags.empty())
711 stream << '\t' << program.user_tags;
712
713 detail::write_eol(stream_it, options.add_carriage_return);
714 }
715
716 // Write comment (@CO) lines if specified.
717 for (auto const & comment : header.comments)
718 {
719 stream << "@CO\t" << comment;
720 detail::write_eol(stream_it, options.add_carriage_return);
721 }
722 }
723}
724
725} // namespace seqan3::detail
T addressof(T... args)
T back_inserter(T... args)
T begin(T... args)
Provides seqan3::views::char_to.
sam_file_program_info_t program_info_t
Stores information of the program/tool that was used to create the file.
Definition header.hpp:67
T copy(T... args)
Provides various utility functions.
T empty(T... args)
T end(T... args)
T find(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:517
@ comment
Comment field of arbitrary content, usually a string.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:60
constexpr size_t size
The size of a type pack.
Definition type_pack/traits.hpp:143
seqan::stl::views::zip zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition zip.hpp:24
Provides the seqan3::sam_file_header class.
Provides various utility functions.
Auxiliary functions for the SAM IO.
T push_back(T... args)
Provides seqan3::debug_stream and related types.
Provides seqan3::views::repeat_n.
Provides seqan3::sam_file_output_format and auxiliary classes.
T search(T... args)
T size(T... args)
Provides seqan3::views::slice.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::views::zip.
Hide me