SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_sam_base.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
11#pragma once
12
13#include <climits>
14#include <ranges>
15#include <string>
16#include <vector>
17
30
31namespace seqan3::detail
32{
33
42class format_sam_base
43{
44protected:
48 format_sam_base() = default;
49 format_sam_base(format_sam_base const &) = default;
50 format_sam_base & operator=(format_sam_base const &) = default;
51 format_sam_base(format_sam_base &&) = default;
52 format_sam_base & operator=(format_sam_base &&) = default;
53 ~format_sam_base() = default;
54
56
58 static constexpr std::array format_version{'1', '.', '6'};
59
61 std::array<char, 316> arithmetic_buffer{}; // Doubles can be up to 316 characters
62
64 bool header_was_written{false};
65
67 bool ref_info_present_in_header{false};
68
69 template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
70 void check_and_assign_ref_id(ref_id_type & ref_id,
71 ref_id_tmp_type & ref_id_tmp,
72 header_type & header,
73 ref_seqs_type & /*tag*/);
74
75 int32_t soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const;
76
77 template <typename stream_view_type, std::ranges::forward_range target_range_type>
78 void read_forward_range_field(stream_view_type && stream_view, target_range_type & target);
79
80 template <std::ranges::forward_range target_range_type>
81 void read_forward_range_field(std::string_view const str, target_range_type & target);
82
83 template <arithmetic arithmetic_target_type>
84 void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);
85
86 template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
87 void read_header(stream_view_type && stream_view,
88 sam_file_header<ref_ids_type> & hdr,
89 ref_seqs_type & /*ref_id_to_pos_map*/,
90 sam_file_input_options<seq_legal_alph_type> const & options);
91
92 template <typename stream_t, typename header_type>
93 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
94};
95
106template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
107inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
108 ref_id_tmp_type & ref_id_tmp,
109 header_type & header,
110 ref_seqs_type & /*tag*/)
111{
112 if (!std::ranges::empty(ref_id_tmp)) // otherwise the std::optional will not be filled
113 {
114 auto search = header.ref_dict.find(ref_id_tmp);
115
116 if (search == header.ref_dict.end())
117 {
118 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
119 {
120 if (ref_info_present_in_header)
121 {
122 throw format_error{"Unknown reference id found in record which is not present in the header."};
123 }
124 else
125 {
126 header.ref_ids().push_back(ref_id_tmp);
127 auto pos = std::ranges::size(header.ref_ids()) - 1;
128 header.ref_dict[header.ref_ids()[pos]] = pos;
129 ref_id = pos;
130 }
131 }
132 else
133 {
134 throw format_error{"Unknown reference id found in record which is not present in the given ids."};
135 }
136 }
137 else
138 {
139 ref_id = search->second;
140 }
141 }
142}
143
147inline int32_t format_sam_base::soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const
148{
149 int32_t sc_front{};
150
151 // Checks if the given index in the cigar vector is a soft clip.
152 auto soft_clipping_at = [&](size_t const index)
153 {
154 return cigar_vector[index] == 'S'_cigar_operation;
155 };
156 // Checks if the given index in the cigar vector is a hard clip.
157 auto hard_clipping_at = [&](size_t const index)
158 {
159 return cigar_vector[index] == 'H'_cigar_operation;
160 };
161 // Checks if the given cigar vector as at least min_size many elements.
162 auto vector_size_at_least = [&](size_t const min_size)
163 {
164 return cigar_vector.size() >= min_size;
165 };
166 // Returns the cigar count of the ith cigar element in the given cigar vector.
167 auto cigar_count_at = [&](size_t const index)
168 {
169 return get<0>(cigar_vector[index]);
170 };
171
172 // check for soft clipping at the first two positions
173 if (vector_size_at_least(1) && soft_clipping_at(0))
174 sc_front = cigar_count_at(0);
175 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
176 sc_front = cigar_count_at(1);
177
178 return sc_front;
179}
180
188template <typename stream_view_type, std::ranges::forward_range target_range_type>
189inline void format_sam_base::read_forward_range_field(stream_view_type && stream_view, target_range_type & target)
190{
191 using target_range_value_t = std::ranges::range_value_t<target_range_type>;
192 using begin_iterator_t = std::ranges::iterator_t<stream_view_type>;
193 using end_iterator_t = std::ranges::sentinel_t<stream_view_type>;
194
195 // Note that we need to cache the begin iterator since the stream_view is an input range that may be consuming
196 // and in that case might read `past-the-end` on a second call of std::ranges::begin.
197 if (auto it = std::ranges::begin(stream_view); it != std::ranges::end(stream_view))
198 {
199 // Write to target if field does not represent an empty string, denoted as single '*' character.
200 if (char c = *it; !(++it == std::ranges::end(stream_view) && c == '*'))
201 {
202 target.push_back(seqan3::assign_char_to(c, target_range_value_t{}));
203 std::ranges::copy(std::ranges::subrange<begin_iterator_t, end_iterator_t>{it, std::ranges::end(stream_view)}
204 | views::char_to<target_range_value_t>,
205 std::back_inserter(target));
206 }
207 }
208}
209
216template <std::ranges::forward_range target_range_type>
217inline void format_sam_base::read_forward_range_field(std::string_view const str, target_range_type & target)
218{
219 if (str.size() == 1 && str[0] == '*') // '*' denotes empty field
220 return;
221
222 if constexpr (std::assignable_from<target_range_type, std::string_view>)
223 {
224 target = str;
225 }
226 else
227 {
228 target.resize(str.size());
229 for (size_t i = 0; i < str.size(); ++i)
230 target[i] = assign_char_to(str[i], std::ranges::range_value_t<target_range_type>{});
231 }
232}
233
243template <arithmetic arithmetic_target_type>
244inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
245 arithmetic_target_type & arithmetic_target)
246{
247 std::from_chars_result res = std::from_chars(str.begin(), str.end(), arithmetic_target);
248
249 if (res.ec == std::errc::invalid_argument || res.ptr != str.end())
250 throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(str.begin(), str.end())
251 + "' could not be cast into type " + detail::type_name_as_string<arithmetic_target_type>};
252
253 if (res.ec == std::errc::result_out_of_range)
254 throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str.begin(), str.end())
255 + "' into type " + detail::type_name_as_string<arithmetic_target_type>
256 + " would cause an overflow."};
257}
258
278template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type, typename seq_legal_alph_type>
279inline void format_sam_base::read_header(stream_view_type && stream_view,
280 sam_file_header<ref_ids_type> & hdr,
281 ref_seqs_type & /*ref_id_to_pos_map*/,
282 sam_file_input_options<seq_legal_alph_type> const & options)
283{
284 auto it = std::ranges::begin(stream_view);
285 auto end = std::ranges::end(stream_view);
286 std::string string_buffer{};
287
288 auto make_tag = [](uint8_t char1, uint8_t char2) constexpr
289 {
290 return static_cast<uint16_t>(char1) | (static_cast<uint16_t>(char2) << CHAR_BIT);
291 };
292
293 std::array<char, 2> raw_tag{};
294
295 auto parse_and_make_tag = [&]()
296 {
297 raw_tag[0] = *it;
298 ++it;
299 raw_tag[1] = *it;
300 ++it;
301 return make_tag(raw_tag[0], raw_tag[1]);
302 };
303
304 auto take_until_predicate = [&it, &string_buffer](auto const & predicate)
305 {
306 string_buffer.clear();
307 while (!predicate(*it))
308 {
309 string_buffer.push_back(*it);
310 ++it;
311 }
312 };
313
314 auto skip_until_predicate = [&it](auto const & predicate)
315 {
316 while (!predicate(*it))
317 ++it;
318 };
319
320 auto copy_next_tag_value_into_buffer = [&]()
321 {
322 skip_until_predicate(is_char<':'>);
323 ++it; // skip :
324 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
325 };
326
327 // Some tags are not parsed individually. Instead, these are simply copied into a std::string.
328 // Multiple tags must be separated by a `\t`, hence we prepend a tab to the string, except the first time.
329 // Alternatively, we could always append a `\t`, but this would have the side effect that we might need to trim a
330 // trailing tab after parsing all tags via `pop_back()`.
331 // Unfortunately, we do not know when we are parsing the last tag (and in this case just not append a tab),
332 // because even though we can check if the line ends in a `\n`, it is not guaranteed that the last tag of the
333 // line is passed to this lambda. For example, the line might end with a tag that is properly parsed, such as `ID`.
334 auto parse_and_append_unhandled_tag_to_string = [&](std::string & value, std::array<char, 2> raw_tag)
335 {
336 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
337 if (!value.empty())
338 value.push_back('\t');
339 value.push_back(raw_tag[0]);
340 value.push_back(raw_tag[1]);
341 read_forward_range_field(string_buffer, value);
342 };
343
344 auto print_cerr_of_unspported_tag = [&options](char const * const header_tag, std::array<char, 2> raw_tag)
345 {
346 if (options.stream_warnings_to == nullptr)
347 return;
348
349 *options.stream_warnings_to << "Unsupported SAM header tag in @" << header_tag << ": " << raw_tag[0]
350 << raw_tag[1] << '\n';
351 };
352
353 while (it != end && is_char<'@'>(*it))
354 {
355 ++it; // skip @
356
357 switch (parse_and_make_tag())
358 {
359 case make_tag('H', 'D'): // HD (header) tag
360 {
361 // All tags can appear in any order, VN is the only required tag
362 while (is_char<'\t'>(*it))
363 {
364 ++it; // skip tab
365 std::string * header_entry{nullptr};
366
367 switch (parse_and_make_tag())
368 {
369 case make_tag('V', 'N'): // parse required VN (version) tag
370 {
371 header_entry = std::addressof(hdr.format_version);
372 break;
373 }
374 case make_tag('S', 'O'): // SO (sorting) tag
375 {
376 header_entry = std::addressof(hdr.sorting);
377 break;
378 }
379 case make_tag('S', 'S'): // SS (sub-order) tag
380 {
381 header_entry = std::addressof(hdr.subsorting);
382 break;
383 }
384 case make_tag('G', 'O'): // GO (grouping) tag
385 {
386 header_entry = std::addressof(hdr.grouping);
387 break;
388 }
389 default: // unsupported header tag
390 {
391 print_cerr_of_unspported_tag("HD", raw_tag);
392 }
393 }
394
395 if (header_entry != nullptr)
396 {
397 copy_next_tag_value_into_buffer();
398 read_forward_range_field(string_buffer, *header_entry);
399 }
400 else
401 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
402 }
403 ++it; // skip newline
404
405 if (hdr.format_version.empty())
406 throw format_error{std::string{"The required VN tag in @HD is missing."}};
407
408 break;
409 }
410
411 case make_tag('S', 'Q'): // SQ (sequence dictionary) tag
412 {
413 ref_info_present_in_header = true;
414 std::ranges::range_value_t<decltype(hdr.ref_ids())> id;
415 std::optional<int32_t> sequence_length{};
417
418 // All tags can appear in any order, SN and LN are required tags
419 while (is_char<'\t'>(*it))
420 {
421 ++it; // skip tab
422
423 switch (parse_and_make_tag())
424 {
425 case make_tag('S', 'N'): // parse required SN (sequence name) tag
426 {
427 copy_next_tag_value_into_buffer();
428 read_forward_range_field(string_buffer, id);
429 break;
430 }
431 case make_tag('L', 'N'): // parse required LN (length) tag
432 {
433 int32_t sequence_length_tmp{};
434 copy_next_tag_value_into_buffer();
435 read_arithmetic_field(string_buffer, sequence_length_tmp);
436 sequence_length = sequence_length_tmp;
437 break;
438 }
439 default: // Any other tag
440 {
441 parse_and_append_unhandled_tag_to_string(get<1>(info), raw_tag);
442 }
443 }
444 }
445 ++it; // skip newline
446
447 if (id.empty())
448 throw format_error{std::string{"The required SN tag in @SQ is missing."}};
449 if (!sequence_length.has_value())
450 throw format_error{std::string{"The required LN tag in @SQ is missing."}};
451 if (sequence_length.value() <= 0)
452 throw format_error{std::string{"The value of LN in @SQ must be positive."}};
453
454 get<0>(info) = sequence_length.value();
455 // If reference information was given, the ids exist and we can fill ref_dict directly.
456 // If not, we need to update the ids first and fill the reference dictionary afterwards.
457 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>) // reference information given
458 {
459 auto id_it = hdr.ref_dict.find(id);
460
461 if (id_it == hdr.ref_dict.end())
462 throw format_error{detail::to_string("Unknown reference name '",
463 id,
464 "' found in SAM header ",
465 "(header.ref_ids(): ",
466 hdr.ref_ids(),
467 ").")};
468
469 auto & given_ref_info = hdr.ref_id_info[id_it->second];
470
471 if (std::get<0>(given_ref_info) != std::get<0>(info))
472 throw format_error{"Provided and header-based reference length differ."};
473
474 hdr.ref_id_info[id_it->second] = std::move(info);
475 }
476 else
477 {
478 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()), std::deque>,
479 "The range over reference ids must be of type std::deque such that pointers are not "
480 "invalidated.");
481
482 hdr.ref_ids().push_back(id);
483 hdr.ref_id_info.push_back(info);
484 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).size() - 1]] = (hdr.ref_ids()).size() - 1;
485 }
486 break;
487 }
488
489 case make_tag('R', 'G'): // RG (read group) tag
490 {
492
493 // All tags can appear in any order, SN and LN are required tags
494 while (is_char<'\t'>(*it))
495 {
496 ++it; // skip tab
497
498 switch (parse_and_make_tag())
499 {
500 case make_tag('I', 'D'): // parse required ID tag
501 {
502 copy_next_tag_value_into_buffer();
503 read_forward_range_field(string_buffer, get<0>(tmp));
504 break;
505 }
506 default: // Any other tag
507 {
508 parse_and_append_unhandled_tag_to_string(get<1>(tmp), raw_tag);
509 }
510 }
511 }
512 ++it; // skip newline
513
514 if (get<0>(tmp).empty())
515 throw format_error{std::string{"The required ID tag in @RG is missing."}};
516
517 hdr.read_groups.emplace_back(std::move(tmp));
518 break;
519 }
520
521 case make_tag('P', 'G'): // PG (program) tag
522 {
524
525 // All tags can appear in any order, ID is the only required tag
526 while (is_char<'\t'>(*it))
527 {
528 ++it; // skip tab
529 std::string * program_info_entry{nullptr};
530
531 switch (parse_and_make_tag())
532 {
533 case make_tag('I', 'D'): // read required ID tag
534 {
535 program_info_entry = std::addressof(tmp.id);
536 break;
537 }
538 case make_tag('P', 'N'): // PN (program name) tag
539 {
540 program_info_entry = std::addressof(tmp.name);
541 break;
542 }
543 case make_tag('P', 'P'): // PP (previous program) tag
544 {
545 program_info_entry = std::addressof(tmp.previous);
546 break;
547 }
548 case make_tag('C', 'L'): // CL (command line) tag
549 {
550 program_info_entry = std::addressof(tmp.command_line_call);
551 break;
552 }
553 case make_tag('D', 'S'): // DS (description) tag
554 {
555 program_info_entry = std::addressof(tmp.description);
556 break;
557 }
558 case make_tag('V', 'N'): // VN (version) tag
559 {
560 program_info_entry = std::addressof(tmp.version);
561 break;
562 }
563 default: // unsupported header tag
564 {
565 print_cerr_of_unspported_tag("PG", raw_tag);
566 }
567 }
568
569 if (program_info_entry != nullptr)
570 {
571 copy_next_tag_value_into_buffer();
572 read_forward_range_field(string_buffer, *program_info_entry);
573 }
574 else
575 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
576 }
577 ++it; // skip newline
578
579 if (tmp.id.empty())
580 throw format_error{std::string{"The required ID tag in @PG is missing."}};
581
582 hdr.program_infos.emplace_back(std::move(tmp));
583 break;
584 }
585
586 case make_tag('C', 'O'): // CO (comment) tag
587 {
588 ++it; // skip tab
589 std::string tmp;
590 take_until_predicate(is_char<'\n'>);
591 read_forward_range_field(string_buffer, tmp);
592 ++it; // skip newline
593 hdr.comments.emplace_back(std::move(tmp));
594 break;
595 }
596
597 default:
598 throw format_error{std::string{"Illegal SAM header tag starting with:"} + *it};
599 }
600 }
601}
602
619template <typename stream_t, typename header_type>
620inline void
621format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
622{
623 if constexpr (!detail::decays_to_ignore_v<header_type>)
624 {
625 // -----------------------------------------------------------------
626 // Check Header
627 // -----------------------------------------------------------------
628
629 // (@HD) Check header line
630 // The format version string will be taken from the local member variable
631 if (!header.sorting.empty()
632 && !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
633 || header.sorting == "coordinate"))
634 throw format_error{"SAM format error: The header.sorting member must be "
635 "one of [unknown, unsorted, queryname, coordinate]."};
636
637 if (!header.grouping.empty()
638 && !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
639 throw format_error{"SAM format error: The header.grouping member must be "
640 "one of [none, query, reference]."};
641
642 // (@SQ) Check Reference Sequence Dictionary lines
643
644 // TODO
645
646 // - sorting order be one of ...
647 // - grouping can be one of ...
648 // - reference names must be unique
649 // - ids of read groups must be unique
650 // - program ids need to be unique
651 // many more small semantic things, like fits REGEX
652
653 // -----------------------------------------------------------------
654 // Write Header
655 // -----------------------------------------------------------------
656 std::ostreambuf_iterator stream_it{stream};
657
658 // (@HD) Write header line [required].
659 stream << "@HD\tVN:";
660 std::ranges::copy(format_version, stream_it);
661
662 if (!header.sorting.empty())
663 stream << "\tSO:" << header.sorting;
664
665 if (!header.subsorting.empty())
666 stream << "\tSS:" << header.subsorting;
667
668 if (!header.grouping.empty())
669 stream << "\tGO:" << header.grouping;
670
671 detail::write_eol(stream_it, options.add_carriage_return);
672
673 // (@SQ) Write Reference Sequence Dictionary lines [required].
674 for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
675 {
676 stream << "@SQ\tSN:";
677
678 std::ranges::copy(ref_name, stream_it);
679
680 stream << "\tLN:" << get<0>(ref_info);
681
682 if (!get<1>(ref_info).empty())
683 stream << "\t" << get<1>(ref_info);
684
685 detail::write_eol(stream_it, options.add_carriage_return);
686 }
687
688 // Write read group (@RG) lines if specified.
689 for (auto const & read_group : header.read_groups)
690 {
691 stream << "@RG"
692 << "\tID:" << get<0>(read_group);
693
694 if (!get<1>(read_group).empty())
695 stream << "\t" << get<1>(read_group);
696
697 detail::write_eol(stream_it, options.add_carriage_return);
698 }
699
700 // Write program (@PG) lines if specified.
701 for (auto const & program : header.program_infos)
702 {
703 stream << "@PG"
704 << "\tID:" << program.id;
705
706 if (!program.name.empty())
707 stream << "\tPN:" << program.name;
708
709 if (!program.command_line_call.empty())
710 stream << "\tCL:" << program.command_line_call;
711
712 if (!program.previous.empty())
713 stream << "\tPP:" << program.previous;
714
715 if (!program.description.empty())
716 stream << "\tDS:" << program.description;
717
718 if (!program.version.empty())
719 stream << "\tVN:" << program.version;
720
721 detail::write_eol(stream_it, options.add_carriage_return);
722 }
723
724 // Write comment (@CO) lines if specified.
725 for (auto const & comment : header.comments)
726 {
727 stream << "@CO\t" << comment;
728 detail::write_eol(stream_it, options.add_carriage_return);
729 }
730 }
731}
732
733} // namespace seqan3::detail
T addressof(T... args)
T back_inserter(T... args)
T begin(T... args)
Provides seqan3::views::char_to.
sam_file_program_info_t program_info_t
Stores information of the program/tool that was used to create the file.
Definition header.hpp:73
T copy(T... args)
Provides various utility functions.
T empty(T... args)
T end(T... args)
T find(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:521
@ comment
Comment field of arbitrary content, usually a string.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:60
constexpr size_t size
The size of a type pack.
Definition type_pack/traits.hpp:143
seqan::stl::views::zip zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition zip.hpp:24
Provides the seqan3::sam_file_header class.
Provides various utility functions.
Auxiliary functions for the SAM IO.
T push_back(T... args)
Provides seqan3::debug_stream and related types.
Provides seqan3::views::repeat_n.
Provides seqan3::sam_file_input_options.
Provides seqan3::sam_file_output_format and auxiliary classes.
T search(T... args)
T size(T... args)
Provides seqan3::views::slice.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::views::zip.
Hide me