SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_sam_base.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
14#pragma once
15
16#include <climits>
17#include <ranges>
18#include <string>
19#include <vector>
20
32
33namespace seqan3::detail
34{
35
44class format_sam_base
45{
46protected:
50 format_sam_base() = default;
51 format_sam_base(format_sam_base const &) = default;
52 format_sam_base & operator=(format_sam_base const &) = default;
53 format_sam_base(format_sam_base &&) = default;
54 format_sam_base & operator=(format_sam_base &&) = default;
55 ~format_sam_base() = default;
56
58
60 static constexpr std::array format_version{'1', '.', '6'};
61
63 std::array<char, 316> arithmetic_buffer{}; // Doubles can be up to 316 characters
64
66 bool header_was_written{false};
67
69 bool ref_info_present_in_header{false};
70
71 template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
72 void check_and_assign_ref_id(ref_id_type & ref_id,
73 ref_id_tmp_type & ref_id_tmp,
74 header_type & header,
75 ref_seqs_type & /*tag*/);
76
77 int32_t soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const;
78
79 template <typename stream_view_type, std::ranges::forward_range target_range_type>
80 void read_forward_range_field(stream_view_type && stream_view, target_range_type & target);
81
82 template <std::ranges::forward_range target_range_type>
83 void read_forward_range_field(std::string_view const str, target_range_type & target);
84
85 template <arithmetic arithmetic_target_type>
86 void read_arithmetic_field(std::string_view const & str, arithmetic_target_type & arithmetic_target);
87
88 template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
89 void read_header(stream_view_type && stream_view,
90 sam_file_header<ref_ids_type> & hdr,
91 ref_seqs_type & /*ref_id_to_pos_map*/);
92
93 template <typename stream_t, typename header_type>
94 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
95};
96
107template <typename ref_id_type, typename ref_id_tmp_type, typename header_type, typename ref_seqs_type>
108inline void format_sam_base::check_and_assign_ref_id(ref_id_type & ref_id,
109 ref_id_tmp_type & ref_id_tmp,
110 header_type & header,
111 ref_seqs_type & /*tag*/)
112{
113 if (!std::ranges::empty(ref_id_tmp)) // otherwise the std::optional will not be filled
114 {
115 auto search = header.ref_dict.find(ref_id_tmp);
116
117 if (search == header.ref_dict.end())
118 {
119 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
120 {
121 if (ref_info_present_in_header)
122 {
123 throw format_error{"Unknown reference id found in record which is not present in the header."};
124 }
125 else
126 {
127 header.ref_ids().push_back(ref_id_tmp);
128 auto pos = std::ranges::size(header.ref_ids()) - 1;
129 header.ref_dict[header.ref_ids()[pos]] = pos;
130 ref_id = pos;
131 }
132 }
133 else
134 {
135 throw format_error{"Unknown reference id found in record which is not present in the given ids."};
136 }
137 }
138 else
139 {
140 ref_id = search->second;
141 }
142 }
143}
144
148inline int32_t format_sam_base::soft_clipping_at_front(std::vector<cigar> const & cigar_vector) const
149{
150 int32_t sc_front{};
151
152 // Checks if the given index in the cigar vector is a soft clip.
153 auto soft_clipping_at = [&](size_t const index)
154 {
155 return cigar_vector[index] == 'S'_cigar_operation;
156 };
157 // Checks if the given index in the cigar vector is a hard clip.
158 auto hard_clipping_at = [&](size_t const index)
159 {
160 return cigar_vector[index] == 'H'_cigar_operation;
161 };
162 // Checks if the given cigar vector as at least min_size many elements.
163 auto vector_size_at_least = [&](size_t const min_size)
164 {
165 return cigar_vector.size() >= min_size;
166 };
167 // Returns the cigar count of the ith cigar element in the given cigar vector.
168 auto cigar_count_at = [&](size_t const index)
169 {
170 return get<0>(cigar_vector[index]);
171 };
172
173 // check for soft clipping at the first two positions
174 if (vector_size_at_least(1) && soft_clipping_at(0))
175 sc_front = cigar_count_at(0);
176 else if (vector_size_at_least(2) && hard_clipping_at(0) && soft_clipping_at(1))
177 sc_front = cigar_count_at(1);
178
179 return sc_front;
180}
181
189template <typename stream_view_type, std::ranges::forward_range target_range_type>
190inline void format_sam_base::read_forward_range_field(stream_view_type && stream_view, target_range_type & target)
191{
192 using target_range_value_t = std::ranges::range_value_t<target_range_type>;
193 using begin_iterator_t = std::ranges::iterator_t<stream_view_type>;
194 using end_iterator_t = std::ranges::sentinel_t<stream_view_type>;
195
196 // Note that we need to cache the begin iterator since the stream_view is an input range that may be consuming
197 // and in that case might read `past-the-end` on a second call of std::ranges::begin.
198 if (auto it = std::ranges::begin(stream_view); it != std::ranges::end(stream_view))
199 {
200 // Write to target if field does not represent an empty string, denoted as single '*' character.
201 if (char c = *it; !(++it == std::ranges::end(stream_view) && c == '*'))
202 {
203 target.push_back(seqan3::assign_char_to(c, target_range_value_t{}));
204 std::ranges::copy(std::ranges::subrange<begin_iterator_t, end_iterator_t>{it, std::ranges::end(stream_view)}
205 | views::char_to<target_range_value_t>,
206 std::back_inserter(target));
207 }
208 }
209}
210
217template <std::ranges::forward_range target_range_type>
218inline void format_sam_base::read_forward_range_field(std::string_view const str, target_range_type & target)
219{
220 if (str.size() == 1 && str[0] == '*') // '*' denotes empty field
221 return;
222
223 if constexpr (std::assignable_from<target_range_type, std::string_view>)
224 {
225 target = str;
226 }
227 else
228 {
229 target.resize(str.size());
230 for (size_t i = 0; i < str.size(); ++i)
231 target[i] = assign_char_to(str[i], std::ranges::range_value_t<target_range_type>{});
232 }
233}
234
244template <arithmetic arithmetic_target_type>
245inline void format_sam_base::read_arithmetic_field(std::string_view const & str,
246 arithmetic_target_type & arithmetic_target)
247{
248 std::from_chars_result res = std::from_chars(str.begin(), str.end(), arithmetic_target);
249
250 if (res.ec == std::errc::invalid_argument || res.ptr != str.end())
251 throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(str.begin(), str.end())
252 + "' could not be cast into type " + detail::type_name_as_string<arithmetic_target_type>};
253
254 if (res.ec == std::errc::result_out_of_range)
255 throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str.begin(), str.end())
256 + "' into type " + detail::type_name_as_string<arithmetic_target_type>
257 + " would cause an overflow."};
258}
259
276template <typename stream_view_type, typename ref_ids_type, typename ref_seqs_type>
277inline void format_sam_base::read_header(stream_view_type && stream_view,
278 sam_file_header<ref_ids_type> & hdr,
279 ref_seqs_type & /*ref_id_to_pos_map*/)
280{
281 auto it = std::ranges::begin(stream_view);
282 auto end = std::ranges::end(stream_view);
283 std::string string_buffer{};
284
285 auto make_tag = [](uint8_t char1, uint8_t char2) constexpr
286 {
287 return static_cast<uint16_t>(char1) | (static_cast<uint16_t>(char2) << CHAR_BIT);
288 };
289
290 std::array<char, 2> raw_tag{};
291
292 auto parse_and_make_tag = [&]()
293 {
294 raw_tag[0] = *it;
295 ++it;
296 raw_tag[1] = *it;
297 ++it;
298 return make_tag(raw_tag[0], raw_tag[1]);
299 };
300
301 auto take_until_predicate = [&it, &string_buffer](auto const & predicate)
302 {
303 string_buffer.clear();
304 while (!predicate(*it))
305 {
306 string_buffer.push_back(*it);
307 ++it;
308 }
309 };
310
311 auto skip_until_predicate = [&it](auto const & predicate)
312 {
313 while (!predicate(*it))
314 ++it;
315 };
316
317 auto copy_next_tag_value_into_buffer = [&]()
318 {
319 skip_until_predicate(is_char<':'>);
320 ++it; // skip :
321 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
322 };
323
324 // Some tags are not parsed individually. Instead, these are simply copied into a std::string.
325 // Multiple tags must be separated by a `\t`, hence we prepend a tab to the string, except the first time.
326 // Alternatively, we could always append a `\t`, but this would have the side effect that we might need to trim a
327 // trailing tab after parsing all tags via `pop_back()`.
328 // Unfortunately, we do not know when we are parsing the last tag (and in this case just not append a tab),
329 // because even though we can check if the line ends in a `\n`, it is not guaranteed that the last tag of the
330 // line is passed to this lambda. For example, the line might end with a tag that is properly parsed, such as `ID`.
331 auto parse_and_append_unhandled_tag_to_string = [&](std::string & value, std::array<char, 2> raw_tag)
332 {
333 take_until_predicate(is_char<'\t'> || is_char<'\n'>);
334 if (!value.empty())
335 value.push_back('\t');
336 value.push_back(raw_tag[0]);
337 value.push_back(raw_tag[1]);
338 read_forward_range_field(string_buffer, value);
339 };
340
341 auto print_cerr_of_unspported_tag = [&it](char const * const header_tag, std::array<char, 2> raw_tag)
342 {
343 std::cerr << "Unsupported SAM header tag in @" << header_tag << ": " << raw_tag[0] << raw_tag[1] << '\n';
344 };
345
346 while (it != end && is_char<'@'>(*it))
347 {
348 ++it; // skip @
349
350 switch (parse_and_make_tag())
351 {
352 case make_tag('H', 'D'): // HD (header) tag
353 {
354 // All tags can appear in any order, VN is the only required tag
355 while (is_char<'\t'>(*it))
356 {
357 ++it; // skip tab
358 std::string * header_entry{nullptr};
359
360 switch (parse_and_make_tag())
361 {
362 case make_tag('V', 'N'): // parse required VN (version) tag
363 {
364 header_entry = std::addressof(hdr.format_version);
365 break;
366 }
367 case make_tag('S', 'O'): // SO (sorting) tag
368 {
369 header_entry = std::addressof(hdr.sorting);
370 break;
371 }
372 case make_tag('S', 'S'): // SS (sub-order) tag
373 {
374 header_entry = std::addressof(hdr.subsorting);
375 break;
376 }
377 case make_tag('G', 'O'): // GO (grouping) tag
378 {
379 header_entry = std::addressof(hdr.grouping);
380 break;
381 }
382 default: // unsupported header tag
383 {
384 print_cerr_of_unspported_tag("HD", raw_tag);
385 }
386 }
387
388 if (header_entry != nullptr)
389 {
390 copy_next_tag_value_into_buffer();
391 read_forward_range_field(string_buffer, *header_entry);
392 }
393 else
394 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
395 }
396 ++it; // skip newline
397
398 if (hdr.format_version.empty())
399 throw format_error{std::string{"The required VN tag in @HD is missing."}};
400
401 break;
402 }
403
404 case make_tag('S', 'Q'): // SQ (sequence dictionary) tag
405 {
406 ref_info_present_in_header = true;
407 std::ranges::range_value_t<decltype(hdr.ref_ids())> id;
408 std::optional<int32_t> sequence_length{};
410
411 // All tags can appear in any order, SN and LN are required tags
412 while (is_char<'\t'>(*it))
413 {
414 ++it; // skip tab
415
416 switch (parse_and_make_tag())
417 {
418 case make_tag('S', 'N'): // parse required SN (sequence name) tag
419 {
420 copy_next_tag_value_into_buffer();
421 read_forward_range_field(string_buffer, id);
422 break;
423 }
424 case make_tag('L', 'N'): // parse required LN (length) tag
425 {
426 int32_t sequence_length_tmp{};
427 copy_next_tag_value_into_buffer();
428 read_arithmetic_field(string_buffer, sequence_length_tmp);
429 sequence_length = sequence_length_tmp;
430 break;
431 }
432 default: // Any other tag
433 {
434 parse_and_append_unhandled_tag_to_string(get<1>(info), raw_tag);
435 }
436 }
437 }
438 ++it; // skip newline
439
440 if (id.empty())
441 throw format_error{std::string{"The required SN tag in @SQ is missing."}};
442 if (!sequence_length.has_value())
443 throw format_error{std::string{"The required LN tag in @SQ is missing."}};
444 if (sequence_length.value() <= 0)
445 throw format_error{std::string{"The value of LN in @SQ must be positive."}};
446
447 get<0>(info) = sequence_length.value();
448 // If reference information was given, the ids exist and we can fill ref_dict directly.
449 // If not, we need to update the ids first and fill the reference dictionary afterwards.
450 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>) // reference information given
451 {
452 auto id_it = hdr.ref_dict.find(id);
453
454 if (id_it == hdr.ref_dict.end())
455 throw format_error{detail::to_string("Unknown reference name '",
456 id,
457 "' found in SAM header ",
458 "(header.ref_ids(): ",
459 hdr.ref_ids(),
460 ").")};
461
462 auto & given_ref_info = hdr.ref_id_info[id_it->second];
463
464 if (std::get<0>(given_ref_info) != std::get<0>(info))
465 throw format_error{"Provided and header-based reference length differ."};
466
467 hdr.ref_id_info[id_it->second] = std::move(info);
468 }
469 else
470 {
471 static_assert(!detail::is_type_specialisation_of_v<decltype(hdr.ref_ids()), std::deque>,
472 "The range over reference ids must be of type std::deque such that pointers are not "
473 "invalidated.");
474
475 hdr.ref_ids().push_back(id);
476 hdr.ref_id_info.push_back(info);
477 hdr.ref_dict[(hdr.ref_ids())[(hdr.ref_ids()).size() - 1]] = (hdr.ref_ids()).size() - 1;
478 }
479 break;
480 }
481
482 case make_tag('R', 'G'): // RG (read group) tag
483 {
485
486 // All tags can appear in any order, SN and LN are required tags
487 while (is_char<'\t'>(*it))
488 {
489 ++it; // skip tab
490
491 switch (parse_and_make_tag())
492 {
493 case make_tag('I', 'D'): // parse required ID tag
494 {
495 copy_next_tag_value_into_buffer();
496 read_forward_range_field(string_buffer, get<0>(tmp));
497 break;
498 }
499 default: // Any other tag
500 {
501 parse_and_append_unhandled_tag_to_string(get<1>(tmp), raw_tag);
502 }
503 }
504 }
505 ++it; // skip newline
506
507 if (get<0>(tmp).empty())
508 throw format_error{std::string{"The required ID tag in @RG is missing."}};
509
510 hdr.read_groups.emplace_back(std::move(tmp));
511 break;
512 }
513
514 case make_tag('P', 'G'): // PG (program) tag
515 {
517
518 // All tags can appear in any order, ID is the only required tag
519 while (is_char<'\t'>(*it))
520 {
521 ++it; // skip tab
522 std::string * program_info_entry{nullptr};
523
524 switch (parse_and_make_tag())
525 {
526 case make_tag('I', 'D'): // read required ID tag
527 {
528 program_info_entry = std::addressof(tmp.id);
529 break;
530 }
531 case make_tag('P', 'N'): // PN (program name) tag
532 {
533 program_info_entry = std::addressof(tmp.name);
534 break;
535 }
536 case make_tag('P', 'P'): // PP (previous program) tag
537 {
538 program_info_entry = std::addressof(tmp.previous);
539 break;
540 }
541 case make_tag('C', 'L'): // CL (command line) tag
542 {
543 program_info_entry = std::addressof(tmp.command_line_call);
544 break;
545 }
546 case make_tag('D', 'S'): // DS (description) tag
547 {
548 program_info_entry = std::addressof(tmp.description);
549 break;
550 }
551 case make_tag('V', 'N'): // VN (version) tag
552 {
553 program_info_entry = std::addressof(tmp.version);
554 break;
555 }
556 default: // unsupported header tag
557 {
558 print_cerr_of_unspported_tag("PG", raw_tag);
559 }
560 }
561
562 if (program_info_entry != nullptr)
563 {
564 copy_next_tag_value_into_buffer();
565 read_forward_range_field(string_buffer, *program_info_entry);
566 }
567 else
568 skip_until_predicate(is_char<'\t'> || is_char<'\n'>);
569 }
570 ++it; // skip newline
571
572 if (tmp.id.empty())
573 throw format_error{std::string{"The required ID tag in @PG is missing."}};
574
575 hdr.program_infos.emplace_back(std::move(tmp));
576 break;
577 }
578
579 case make_tag('C', 'O'): // CO (comment) tag
580 {
581 ++it; // skip tab
582 std::string tmp;
583 take_until_predicate(is_char<'\n'>);
584 read_forward_range_field(string_buffer, tmp);
585 ++it; // skip newline
586 hdr.comments.emplace_back(std::move(tmp));
587 break;
588 }
589
590 default:
591 throw format_error{std::string{"Illegal SAM header tag starting with:"} + *it};
592 }
593 }
594}
595
612template <typename stream_t, typename header_type>
613inline void
614format_sam_base::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
615{
616 if constexpr (!detail::decays_to_ignore_v<header_type>)
617 {
618 // -----------------------------------------------------------------
619 // Check Header
620 // -----------------------------------------------------------------
621
622 // (@HD) Check header line
623 // The format version string will be taken from the local member variable
624 if (!header.sorting.empty()
625 && !(header.sorting == "unknown" || header.sorting == "unsorted" || header.sorting == "queryname"
626 || header.sorting == "coordinate"))
627 throw format_error{"SAM format error: The header.sorting member must be "
628 "one of [unknown, unsorted, queryname, coordinate]."};
629
630 if (!header.grouping.empty()
631 && !(header.grouping == "none" || header.grouping == "query" || header.grouping == "reference"))
632 throw format_error{"SAM format error: The header.grouping member must be "
633 "one of [none, query, reference]."};
634
635 // (@SQ) Check Reference Sequence Dictionary lines
636
637 // TODO
638
639 // - sorting order be one of ...
640 // - grouping can be one of ...
641 // - reference names must be unique
642 // - ids of read groups must be unique
643 // - program ids need to be unique
644 // many more small semantic things, like fits REGEX
645
646 // -----------------------------------------------------------------
647 // Write Header
648 // -----------------------------------------------------------------
649 std::ostreambuf_iterator stream_it{stream};
650
651 // (@HD) Write header line [required].
652 stream << "@HD\tVN:";
653 std::ranges::copy(format_version, stream_it);
654
655 if (!header.sorting.empty())
656 stream << "\tSO:" << header.sorting;
657
658 if (!header.subsorting.empty())
659 stream << "\tSS:" << header.subsorting;
660
661 if (!header.grouping.empty())
662 stream << "\tGO:" << header.grouping;
663
664 detail::write_eol(stream_it, options.add_carriage_return);
665
666 // (@SQ) Write Reference Sequence Dictionary lines [required].
667 for (auto const & [ref_name, ref_info] : views::zip(header.ref_ids(), header.ref_id_info))
668 {
669 stream << "@SQ\tSN:";
670
671 std::ranges::copy(ref_name, stream_it);
672
673 stream << "\tLN:" << get<0>(ref_info);
674
675 if (!get<1>(ref_info).empty())
676 stream << "\t" << get<1>(ref_info);
677
678 detail::write_eol(stream_it, options.add_carriage_return);
679 }
680
681 // Write read group (@RG) lines if specified.
682 for (auto const & read_group : header.read_groups)
683 {
684 stream << "@RG"
685 << "\tID:" << get<0>(read_group);
686
687 if (!get<1>(read_group).empty())
688 stream << "\t" << get<1>(read_group);
689
690 detail::write_eol(stream_it, options.add_carriage_return);
691 }
692
693 // Write program (@PG) lines if specified.
694 for (auto const & program : header.program_infos)
695 {
696 stream << "@PG"
697 << "\tID:" << program.id;
698
699 if (!program.name.empty())
700 stream << "\tPN:" << program.name;
701
702 if (!program.command_line_call.empty())
703 stream << "\tCL:" << program.command_line_call;
704
705 if (!program.previous.empty())
706 stream << "\tPP:" << program.previous;
707
708 if (!program.description.empty())
709 stream << "\tDS:" << program.description;
710
711 if (!program.version.empty())
712 stream << "\tVN:" << program.version;
713
714 detail::write_eol(stream_it, options.add_carriage_return);
715 }
716
717 // Write comment (@CO) lines if specified.
718 for (auto const & comment : header.comments)
719 {
720 stream << "@CO\t" << comment;
721 detail::write_eol(stream_it, options.add_carriage_return);
722 }
723 }
724}
725
726} // namespace seqan3::detail
T addressof(T... args)
T back_inserter(T... args)
T begin(T... args)
Provides seqan3::views::char_to.
sam_file_program_info_t program_info_t
Stores information of the program/tool that was used to create the file.
Definition: header.hpp:82
T copy(T... args)
Provides various utility functions.
T empty(T... args)
T end(T... args)
T find(T... args)
T from_chars(T... args)
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition: alphabet/concept.hpp:524
@ comment
Comment field of arbitrary content, usually a string.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:63
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
seqan::std::views::zip zip
A view adaptor that takes several views and returns tuple-like values from every i-th element of each...
Definition: zip.hpp:27
Provides the seqan3::sam_file_header class.
Provides various utility functions.
Auxiliary functions for the SAM IO.
T push_back(T... args)
Provides seqan3::debug_stream and related types.
Provides seqan3::views::repeat_n.
Provides seqan3::sam_file_output_format and auxiliary classes.
T search(T... args)
T size(T... args)
Provides seqan3::views::slice.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::views::zip.