SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
format_sam.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <iterator>
13#include <ranges>
14#include <string>
15#include <vector>
16
35
36namespace seqan3
37{
38
105{
106public:
110 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
111 format_sam() = default;
112 format_sam(format_sam const &) = delete;
113 format_sam & operator=(format_sam const &) = delete;
114 format_sam(format_sam &&) = default;
116 ~format_sam() = default;
117
119
122 {"sam"},
123 };
124
125protected:
126 template <typename stream_type, // constraints checked by file
127 typename seq_legal_alph_type,
128 typename stream_pos_type,
129 typename seq_type, // other constraints checked inside function
130 typename id_type,
131 typename qual_type>
132 void read_sequence_record(stream_type & stream,
134 stream_pos_type & position_buffer,
135 seq_type & sequence,
136 id_type & id,
137 qual_type & qualities);
138
139 template <typename stream_type, // constraints checked by file
140 typename seq_type, // other constraints checked inside function
141 typename id_type,
142 typename qual_type>
143 void write_sequence_record(stream_type & stream,
144 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
145 seq_type && sequence,
146 id_type && id,
147 qual_type && qualities);
148
149 template <typename stream_type, // constraints checked by file
150 typename seq_legal_alph_type,
151 typename ref_seqs_type,
152 typename ref_ids_type,
153 typename stream_pos_type,
154 typename seq_type,
155 typename id_type,
156 typename ref_seq_type,
157 typename ref_id_type,
158 typename ref_offset_type,
159 typename cigar_type,
160 typename flag_type,
161 typename mapq_type,
162 typename qual_type,
163 typename mate_type,
164 typename tag_dict_type,
165 typename e_value_type,
166 typename bit_score_type>
167 void read_alignment_record(stream_type & stream,
169 ref_seqs_type & ref_seqs,
171 stream_pos_type & position_buffer,
172 seq_type & seq,
173 qual_type & qual,
174 id_type & id,
175 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
176 ref_id_type & ref_id,
177 ref_offset_type & ref_offset,
178 cigar_type & cigar_vector,
179 flag_type & flag,
180 mapq_type & mapq,
181 mate_type & mate,
182 tag_dict_type & tag_dict,
183 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
184 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
185
186 template <typename stream_type,
187 typename header_type,
188 typename seq_type,
189 typename id_type,
190 typename ref_seq_type,
191 typename ref_id_type,
192 typename qual_type,
193 typename mate_type,
194 typename tag_dict_type,
195 typename e_value_type,
196 typename bit_score_type>
197 void write_alignment_record(stream_type & stream,
198 sam_file_output_options const & options,
199 header_type && header,
200 seq_type && seq,
201 qual_type && qual,
202 id_type && id,
203 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
204 ref_id_type && ref_id,
206 std::vector<cigar> const & cigar_vector,
207 sam_flag const flag,
208 uint8_t const mapq,
209 mate_type && mate,
210 tag_dict_type && tag_dict,
211 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
212 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
213
214private:
217
219 static constexpr std::string_view dummy{};
220
223
226
229 {
230 return dummy;
231 }
232
234 template <typename t>
235 decltype(auto) default_or(t && v) const noexcept
236 {
237 return std::forward<t>(v);
238 }
239
240 template <arithmetic value_type>
241 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, std::string_view const str, value_type value);
242
244
245 void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary & target);
246
247 template <typename stream_it_t, std::ranges::forward_range field_type>
248 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
249
250 template <typename stream_it_t>
251 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
252
253 template <typename stream_it_t>
254 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
255};
256
258template <typename stream_type, // constraints checked by file
259 typename seq_legal_alph_type,
260 typename stream_pos_type,
261 typename seq_type, // other constraints checked inside function
262 typename id_type,
263 typename qual_type>
264inline void format_sam::read_sequence_record(stream_type & stream,
266 stream_pos_type & position_buffer,
267 seq_type & sequence,
268 id_type & id,
269 qual_type & qualities)
270{
272
273 {
275 align_options,
276 std::ignore,
278 position_buffer,
279 sequence,
280 qualities,
281 id,
282 std::ignore,
283 std::ignore,
284 std::ignore,
285 std::ignore,
286 std::ignore,
287 std::ignore,
288 std::ignore,
289 std::ignore,
290 std::ignore,
291 std::ignore);
292 }
293
294 if constexpr (!detail::decays_to_ignore_v<seq_type>)
295 if (std::ranges::distance(sequence) == 0)
296 throw parse_error{"The sequence information must not be empty."};
297 if constexpr (!detail::decays_to_ignore_v<id_type>)
298 if (std::ranges::distance(id) == 0)
299 throw parse_error{"The id information must not be empty."};
300
301 if (options.truncate_ids)
303}
304
306template <typename stream_type, // constraints checked by file
307 typename seq_type, // other constraints checked inside function
308 typename id_type,
309 typename qual_type>
310inline void format_sam::write_sequence_record(stream_type & stream,
311 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
312 seq_type && sequence,
313 id_type && id,
314 qual_type && qualities)
315{
316 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
317
318 sam_file_output_options output_options;
319
321 output_options,
322 /*header*/ std::ignore,
323 /*seq*/ default_or(sequence),
324 /*qual*/ default_or(qualities),
325 /*id*/ default_or(id),
326 /*ref_seq*/ std::string_view{},
327 /*ref_id*/ std::string_view{},
328 /*ref_offset*/ -1,
329 /*cigar_vector*/ std::vector<cigar>{},
330 /*flag*/ sam_flag::none,
331 /*mapq*/ 0,
332 /*mate*/ default_mate_t{},
333 /*tag_dict*/ sam_tag_dictionary{},
334 /*e_value*/ 0,
335 /*bit_score*/ 0);
336}
337
339template <typename stream_type, // constraints checked by file
340 typename seq_legal_alph_type,
341 typename ref_seqs_type,
342 typename ref_ids_type,
343 typename stream_pos_type,
344 typename seq_type,
345 typename id_type,
346 typename ref_seq_type,
347 typename ref_id_type,
348 typename ref_offset_type,
349 typename cigar_type,
350 typename flag_type,
351 typename mapq_type,
352 typename qual_type,
353 typename mate_type,
354 typename tag_dict_type,
355 typename e_value_type,
356 typename bit_score_type>
357inline void format_sam::read_alignment_record(stream_type & stream,
359 ref_seqs_type & ref_seqs,
361 stream_pos_type & position_buffer,
362 seq_type & seq,
363 qual_type & qual,
364 id_type & id,
365 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
366 ref_id_type & ref_id,
367 ref_offset_type & ref_offset,
368 cigar_type & cigar_vector,
369 flag_type & flag,
370 mapq_type & mapq,
371 mate_type & mate,
372 tag_dict_type & tag_dict,
373 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
374 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
375{
376 static_assert(detail::decays_to_ignore_v<ref_offset_type>
377 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
378 "The ref_offset must be a specialisation of std::optional.");
379
380 auto stream_it = detail::fast_istreambuf_iterator{*stream.rdbuf()};
381
382 auto stream_view = detail::istreambuf(stream);
383
384 int32_t ref_offset_tmp{}; // needed to read the ref_offset (int) beofre storing it in std::optional<uint32_t>
385 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{}; // in case mate is requested but ref_offset not
386
387 // Header
388 // -------------------------------------------------------------------------------------------------------------
389 if (is_char<'@'>(*stream_it)) // we always read the header if present
390 {
391 read_header(stream_view, header, ref_seqs, options);
392
393 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
394 return;
395 }
396
397 // Store the current file position in the buffer.
398 position_buffer = stream.tellg();
399
400 // We don't know wether we have 11 or 12 fields, since the tags are optional.
401 // The last field will thus contain either the quality sequence
402 // or the quality sequence AND tags. This will be handled at the respective fields below.
403 stream_it.cache_record_into('\n', '\t', raw_record);
404
405 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
406 // -------------------------------------------------------------------------------------------------------------
407 if constexpr (!detail::decays_to_ignore_v<id_type>)
409
410 uint16_t flag_integral{};
411 read_arithmetic_field(raw_record[1], flag_integral);
412 flag = sam_flag{flag_integral};
413
414 read_forward_range_field(raw_record[2], ref_id_tmp);
415 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
416
417 read_arithmetic_field(raw_record[3], ref_offset_tmp);
418 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
419
420 if (ref_offset_tmp == -1)
421 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
422 else if (ref_offset_tmp > -1)
423 ref_offset = ref_offset_tmp;
424 else if (ref_offset_tmp < -1)
425 throw format_error{"No negative values are allowed for field::ref_offset."};
426
427 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
429
430 // Field 6: CIGAR
431 // -------------------------------------------------------------------------------------------------------------
432 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
433 cigar_vector = detail::parse_cigar(raw_record[5]);
434
435 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
436 // -------------------------------------------------------------------------------------------------------------
437 if constexpr (!detail::decays_to_ignore_v<mate_type>)
438 {
439 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
440 read_forward_range_field(raw_record[6], tmp_mate_ref_id); // RNEXT
441
442 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
443 {
444 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
445 get<0>(mate) = ref_id;
446 else
447 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
448 }
449 else
450 {
451 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
452 }
453
454 int32_t tmp_pnext{};
455 read_arithmetic_field(raw_record[7], tmp_pnext); // PNEXT
456
457 if (tmp_pnext > 0)
458 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
459 else if (tmp_pnext < 0)
460 throw format_error{"No negative values are allowed at the mate mapping position."};
461 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
462
463 read_arithmetic_field(raw_record[8], get<2>(mate)); // TLEN
464 }
465
466 // Field 10: Sequence
467 // -------------------------------------------------------------------------------------------------------------
468 if constexpr (!detail::decays_to_ignore_v<seq_type>)
469 {
470 std::string_view const seq_str = raw_record[9];
471
472 if (!seq_str.starts_with('*')) // * indicates missing sequence information
473 {
474 seq.resize(seq_str.size());
475 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
476
477 for (size_t i = 0; i < seq_str.size(); ++i)
478 {
479 if (!is_legal_alph(seq_str[i]))
480 throw parse_error{std::string{"Encountered an unexpected letter: "} + "char_is_valid_for<"
481 + detail::type_name_as_string<seq_legal_alph_type>
482 + "> evaluated to false on " + detail::make_printable(seq_str[i])};
483
484 seq[i] = assign_char_to(seq_str[i], std::ranges::range_value_t<seq_type>{});
485 }
486 }
487 }
488
489 // Field 11: Quality
490 // -------------------------------------------------------------------------------------------------------------
491 // We don't know wether we have 11 or 12 fields, since the tags are optional.
492 // The last field will thus contain either the quality sequence
493 // or the quality sequence AND tags.
494 size_t tag_begin_pos = raw_record[10].find('\t');
495
496 std::string_view qualities =
497 (tag_begin_pos == std::string_view::npos) ? raw_record[10] : raw_record[10].substr(0, tag_begin_pos);
498
499 if constexpr (!detail::decays_to_ignore_v<qual_type>)
500 read_forward_range_field(qualities, qual);
501
502 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
503 {
504 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0
505 && std::ranges::distance(seq) != std::ranges::distance(qual))
506 {
507 throw format_error{detail::to_string("Sequence length (",
508 std::ranges::distance(seq),
509 ") and quality length (",
510 std::ranges::distance(qual),
511 ") must be the same.")};
512 }
513 }
514
515 // All remaining optional fields if any: SAM tags dictionary
516 // -------------------------------------------------------------------------------------------------------------
517 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
518 {
519 while (tag_begin_pos != std::string_view::npos) // read all tags if present
520 {
521 ++tag_begin_pos; // skip '\t'
522 size_t const tag_end_pos = raw_record[10].find('\t', tag_begin_pos);
523
524 char const * tag_begin = raw_record[10].begin() + tag_begin_pos;
525 char const * tag_end =
526 (tag_end_pos == std::string_view::npos) ? raw_record[10].end() : raw_record[10].begin() + tag_end_pos;
527
528 read_sam_dict(std::string_view{tag_begin, tag_end}, tag_dict);
529
530 tag_begin_pos = tag_end_pos;
531 }
532 }
533
534 assert(stream_it == std::default_sentinel_t{} || *stream_it == '\n');
535 ++stream_it; // Move from end of record to the beginning of the next or to the end of the stream.
536}
537
539template <typename stream_type,
540 typename header_type,
541 typename seq_type,
542 typename id_type,
543 typename ref_seq_type,
544 typename ref_id_type,
545 typename qual_type,
546 typename mate_type,
547 typename tag_dict_type,
548 typename e_value_type,
549 typename bit_score_type>
550inline void format_sam::write_alignment_record(stream_type & stream,
551 sam_file_output_options const & options,
552 header_type && header,
553 seq_type && seq,
554 qual_type && qual,
555 id_type && id,
556 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
557 ref_id_type && ref_id,
559 std::vector<cigar> const & cigar_vector,
560 sam_flag const flag,
561 uint8_t const mapq,
562 mate_type && mate,
563 tag_dict_type && tag_dict,
564 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
565 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
566{
567 /* Note the following general things:
568 *
569 * - Given the SAM specifications, all fields may be empty
570 *
571 * - arithmetic values default to 0 while all others default to '*'
572 *
573 * - Because of the former, arithmetic values can be directly streamed
574 * into 'stream' as operator<< is defined for all arithmetic types
575 * and the default value (0) is also the SAM default.
576 *
577 * - All other non-arithmetic values need to be checked for emptiness
578 */
579
580 // ---------------------------------------------------------------------
581 // Type Requirements (as static asserts for user friendliness)
582 // ---------------------------------------------------------------------
583 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
584 "The seq object must be a std::ranges::forward_range over "
585 "letters that model seqan3::alphabet.");
586
587 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
588 "The id object must be a std::ranges::forward_range over "
589 "letters that model seqan3::alphabet.");
590
591 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
592 {
593 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
594 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
595 "The ref_id object must be a std::ranges::forward_range "
596 "over letters that model seqan3::alphabet.");
597
598 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
599 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
600 static_assert(!detail::decays_to_ignore_v<header_type>,
601 "If you give indices as reference id information the header must also be present.");
602 }
603
604 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
605 "The qual object must be a std::ranges::forward_range "
606 "over letters that model seqan3::alphabet.");
607
609 "The mate object must be a std::tuple of size 3 with "
610 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
611 "2) a std::integral or std::optional<std::integral>, and "
612 "3) a std::integral.");
613
614 static_assert(
615 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
616 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
617 || detail::is_type_specialisation_of_v<
618 std::remove_cvref_t<decltype(std::get<0>(mate))>,
619 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
620 || detail::is_type_specialisation_of_v<
621 std::remove_cvref_t<decltype(std::get<1>(mate))>,
622 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
623 "The mate object must be a std::tuple of size 3 with "
624 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
625 "2) a std::integral or std::optional<std::integral>, and "
626 "3) a std::integral.");
627
628 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
629 || detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>,
631 static_assert(!detail::decays_to_ignore_v<header_type>,
632 "If you give indices as mate reference id information the header must also be present.");
633
634 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
635 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
636
637 // ---------------------------------------------------------------------
638 // logical Requirements
639 // ---------------------------------------------------------------------
640 if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
641 && !std::integral<std::remove_reference_t<ref_id_type>>
642 && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
643 {
644
645 if (options.sam_require_header && !std::ranges::empty(ref_id))
646 {
647 auto id_it = header.ref_dict.end();
648
649 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> && std::ranges::sized_range<decltype(ref_id)>
650 && std::ranges::borrowed_range<decltype(ref_id)>)
651 {
652 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
653 }
654 else
655 {
656 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
657
659 "The ref_id type is not convertible to the reference id information stored in the "
660 "reference dictionary of the header object.");
661
662 id_it = header.ref_dict.find(ref_id);
663 }
664
665 if (id_it == header.ref_dict.end()) // no reference id matched
666 throw format_error{detail::to_string("The ref_id '",
667 ref_id,
668 "' was not in the list of references:",
669 header.ref_ids())};
670 }
671 }
672
673 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
674 throw format_error{"The ref_offset object must be a std::integral >= 0."};
675
676 // ---------------------------------------------------------------------
677 // Writing the Header on first call
678 // ---------------------------------------------------------------------
679 if constexpr (!detail::decays_to_ignore_v<header_type>)
680 {
682 {
683 write_header(stream, options, header);
684 header_was_written = true;
685 }
686 }
687
688 // ---------------------------------------------------------------------
689 // Writing the Record
690 // ---------------------------------------------------------------------
691
692 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
693 constexpr char separator{'\t'};
694
695 write_range_or_asterisk(stream_it, id);
696 *stream_it = separator;
697
698 stream_it.write_number(static_cast<uint16_t>(flag));
699 *stream_it = separator;
700
701 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
702 {
703 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
704 {
705 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
706 }
707 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
708 {
709 if (ref_id.has_value())
710 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
711 else
712 *stream_it = '*';
713 }
714 else
715 {
717 }
718 }
719 else
720 {
721 *stream_it = '*';
722 }
723
724 *stream_it = separator;
725
726 // SAM is 1 based, 0 indicates unmapped read if optional is not set
727 stream_it.write_number(ref_offset.value_or(-1) + 1);
728 *stream_it = separator;
729
730 stream_it.write_number(static_cast<unsigned>(mapq));
731 *stream_it = separator;
732
733 if (!std::ranges::empty(cigar_vector))
734 {
735 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
736 stream_it.write_range(c.to_string());
737 }
738 else
739 {
740 *stream_it = '*';
741 }
742
743 *stream_it = separator;
744
745 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
746 {
747 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
748 }
749 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>,
751 {
752 if (get<0>(mate).has_value())
753 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value()]);
754 else
755 *stream_it = '*';
756 }
757 else
758 {
759 write_range_or_asterisk(stream_it, get<0>(mate));
760 }
761
762 *stream_it = separator;
763
764 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
765 {
766 // SAM is 1 based, 0 indicates unmapped read if optional is not set
767 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
768 *stream_it = separator;
769 }
770 else
771 {
772 stream_it.write_number(get<1>(mate));
773 *stream_it = separator;
774 }
775
776 stream_it.write_number(get<2>(mate));
777 *stream_it = separator;
778
779 write_range_or_asterisk(stream_it, seq);
780 *stream_it = separator;
781
782 write_range_or_asterisk(stream_it, qual);
783
784 write_tag_fields(stream_it, tag_dict, separator);
785
786 stream_it.write_end_of_line(options.add_carriage_return);
787}
788
805template <arithmetic value_type>
807 std::string_view const str,
808 value_type value)
809{
810 std::vector<value_type> tmp_vector{};
811 size_t start_pos{0};
812 size_t end_pos{0};
813
814 while (start_pos != std::string_view::npos)
815 {
816 end_pos = str.find(',', start_pos);
817 auto end = (end_pos == std::string_view::npos) ? str.end() : str.begin() + end_pos;
818 read_arithmetic_field(std::string_view{str.begin() + start_pos, end}, value);
819 tmp_vector.push_back(value);
820
821 start_pos = (end_pos == std::string_view::npos) ? end_pos : end_pos + 1;
822 }
823 variant = std::move(tmp_vector);
824}
825
838{
839 std::vector<std::byte> tmp_vector{};
840 // std::from_chars cannot directly parse into a std::byte
841 uint8_t dummy_byte{};
842
843 if (str.size() % 2 != 0)
844 throw format_error{"[CORRUPTED SAM FILE] Hexadecimal tag must have even number of digits."};
845
846 // H encodes bytes in a hexadecimal format. Two hex values are stored for each byte as characters.
847 // E.g., '1' and 'A' need one byte each and are read as `\x1A`, which is 27 in decimal.
848 for (auto hex_begin = str.begin(), hex_end = str.begin() + 2; hex_begin != str.end(); hex_begin += 2, hex_end += 2)
849 {
850 auto res = std::from_chars(hex_begin, hex_end, dummy_byte, 16);
851
852 if (res.ec == std::errc::invalid_argument)
853 throw format_error{std::string("[CORRUPTED SAM FILE] The string '") + std::string(hex_begin, hex_end)
854 + "' could not be cast into type uint8_t."};
855
856 if (res.ec == std::errc::result_out_of_range)
857 throw format_error{std::string("[CORRUPTED SAM FILE] Casting '") + std::string(str)
858 + "' into type uint8_t would cause an overflow."};
859
860 tmp_vector.push_back(std::byte{dummy_byte});
861 }
862
863 variant = std::move(tmp_vector);
864}
865
882{
883 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
884 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
885 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
886 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
887 */
888 assert(tag_str.size() > 5);
889
890 uint16_t tag = static_cast<uint16_t>(tag_str[0]) << 8;
891 tag += static_cast<uint16_t>(tag_str[1]);
892
893 char type_id = tag_str[3];
894
895 switch (type_id)
896 {
897 case 'A': // char
898 {
899 assert(tag_str.size() == 6);
900 target[tag] = tag_str[5];
901 break;
902 }
903 case 'i': // int32_t
904 {
905 int32_t tmp;
906 read_arithmetic_field(tag_str.substr(5), tmp);
907 target[tag] = tmp;
908 break;
909 }
910 case 'f': // float
911 {
912 float tmp;
913 read_arithmetic_field(tag_str.substr(5), tmp);
914 target[tag] = tmp;
915 break;
916 }
917 case 'Z': // string
918 {
919 target[tag] = std::string{tag_str.substr(5)};
920 break;
921 }
922 case 'H':
923 {
924 read_sam_byte_vector(target[tag], tag_str.substr(5));
925 break;
926 }
927 case 'B': // Array. Value type depends on second char [cCsSiIf]
928 {
929 assert(tag_str.size() > 6);
930 char array_value_type_id = tag_str[5];
931
932 switch (array_value_type_id)
933 {
934 case 'c': // int8_t
935 read_sam_dict_vector(target[tag], tag_str.substr(7), int8_t{});
936 break;
937 case 'C': // uint8_t
938 read_sam_dict_vector(target[tag], tag_str.substr(7), uint8_t{});
939 break;
940 case 's': // int16_t
941 read_sam_dict_vector(target[tag], tag_str.substr(7), int16_t{});
942 break;
943 case 'S': // uint16_t
944 read_sam_dict_vector(target[tag], tag_str.substr(7), uint16_t{});
945 break;
946 case 'i': // int32_t
947 read_sam_dict_vector(target[tag], tag_str.substr(7), int32_t{});
948 break;
949 case 'I': // uint32_t
950 read_sam_dict_vector(target[tag], tag_str.substr(7), uint32_t{});
951 break;
952 case 'f': // float
953 read_sam_dict_vector(target[tag], tag_str.substr(7), float{});
954 break;
955 default:
956 throw format_error{std::string("The first character in the numerical ")
957 + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
958 + "' was given."};
959 }
960 break;
961 }
962 default:
963 throw format_error{std::string("The second character in the numerical id of a "
964 "SAM tag ([TAG]:[TYPE_ID]:[VALUE]) must be one of [A,i,Z,H,B,f] but '")
965 + type_id + "' was given."};
966 }
967}
968
976template <typename stream_it_t, std::ranges::forward_range field_type>
977inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
978{
979 if (std::ranges::empty(field_value))
980 {
981 *stream_it = '*';
982 }
983 else
984 {
985 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
986 stream_it.write_range(field_value);
987 else // convert from alphabets to their character representation
988 stream_it.write_range(field_value | views::to_char);
989 }
990}
991
998template <typename stream_it_t>
999inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1000{
1001 write_range_or_asterisk(stream_it, std::string_view{field_value});
1002}
1003
1011template <typename stream_it_t>
1012inline void
1013format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1014{
1015 auto const stream_variant_fn = [&stream_it](auto && arg) // helper to print a std::variant
1016 {
1017 using T = std::remove_cvref_t<decltype(arg)>;
1018
1019 if constexpr (std::ranges::input_range<T>)
1020 {
1021 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1022 {
1023 stream_it.write_range(arg);
1024 }
1025 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1026 {
1027 if (!std::ranges::empty(arg))
1028 {
1029 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1030
1031 for (auto && elem : arg | std::views::drop(1))
1032 {
1033 *stream_it = ',';
1034 stream_it.write_number(std::to_integer<uint8_t>(elem));
1035 }
1036 }
1037 }
1038 else
1039 {
1040 if (!std::ranges::empty(arg))
1041 {
1042 stream_it.write_number(*std::ranges::begin(arg));
1043
1044 for (auto && elem : arg | std::views::drop(1))
1045 {
1046 *stream_it = ',';
1047 stream_it.write_number(elem);
1048 }
1049 }
1050 }
1051 }
1052 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1053 {
1054 *stream_it = arg;
1055 }
1056 else // number
1057 {
1058 stream_it.write_number(arg);
1059 }
1060 };
1061
1062 for (auto & [tag, variant] : tag_dict)
1063 {
1064 *stream_it = separator;
1065
1066 char const char0 = tag / 256;
1067 char const char1 = tag % 256;
1068
1069 *stream_it = char0;
1070 *stream_it = char1;
1071 *stream_it = ':';
1072 *stream_it = detail::sam_tag_type_char[variant.index()];
1073 *stream_it = ':';
1074
1075 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1076 {
1077 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1078 *stream_it = ',';
1079 }
1080
1081 std::visit(stream_variant_fn, variant);
1082 }
1083}
1084
1085} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
Functionally the same as std::istreambuf_iterator, but faster.
Definition fast_istreambuf_iterator.hpp:37
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition fast_ostreambuf_iterator.hpp:37
The alignment base format.
Definition format_sam_base.hpp:43
void check_and_assign_ref_id(ref_id_type &ref_id, ref_id_tmp_type &ref_id_tmp, header_type &header, ref_seqs_type &)
Checks for known reference ids or adds a new reference is and assigns a reference id to ref_id.
Definition format_sam_base.hpp:107
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition format_sam_base.hpp:621
void read_arithmetic_field(std::string_view const &str, arithmetic_target_type &arithmetic_target)
Reads arithmetic fields using std::from_chars.
Definition format_sam_base.hpp:244
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition format_sam_base.hpp:189
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition format_sam_base.hpp:64
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &, sam_file_input_options< seq_legal_alph_type > const &options)
Reads the SAM header.
Definition format_sam_base.hpp:279
The SAM format (tag).
Definition format_sam.hpp:105
sam_file_header default_header
The default header for the alignment format.
Definition format_sam.hpp:222
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, std::string_view const str, value_type value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition format_sam.hpp:806
std::string_view const & default_or(detail::ignore_t) const noexcept
brief Returns a reference to dummy if passed a std::ignore.
Definition format_sam.hpp:228
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=delete
Deleted. Header holds a unique_ptr.
void read_sam_byte_vector(seqan3::detail::sam_tag_variant &variant, std::string_view const str)
Reads a list of byte pairs as it is the case for SAM tag byte arrays.
Definition format_sam.hpp:837
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition format_sam.hpp:121
void read_sam_dict(std::string_view const tag_str, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition format_sam.hpp:881
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition format_sam.hpp:310
std::array< std::string_view, 11 > raw_record
A buffer to store a raw record pointing into the stream buffer of the input.
Definition format_sam.hpp:225
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition format_sam.hpp:550
format_sam(format_sam &&)=default
Defaulted.
void write_tag_fields(stream_it_t &stream, sam_tag_dictionary const &tag_dict, char const separator)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition format_sam.hpp:1013
format_sam & operator=(format_sam &&)=default
Defaulted.
static constexpr std::string_view dummy
An empty dummy container to pass to align_format.write() such that an empty field is written.
Definition format_sam.hpp:219
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition format_sam.hpp:357
void write_range_or_asterisk(stream_it_t &stream_it, field_type &&field_value)
Writes a field value to the stream.
Definition format_sam.hpp:977
decltype(auto) default_or(t &&v) const noexcept
brief Returns the input unchanged.
Definition format_sam.hpp:235
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition format_sam.hpp:264
format_sam(format_sam const &)=delete
Deleted. Header holds a unique_ptr.
std::string tmp_qual
Stores quality values temporarily if seq and qual information are combined (not supported by SAM yet)...
Definition format_sam.hpp:216
format_sam()=default
Defaulted.
Stores the header information of SAM/BAM files.
Definition header.hpp:46
ref_ids_type & ref_ids()
The range of reference ids.
Definition header.hpp:140
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition sam_tag_dictionary.hpp:327
T end(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
T from_chars(T... args)
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition to_char.hpp:60
constexpr auto assign_char_to
Assign a character to an alphabet object.
Definition alphabet/concept.hpp:521
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition sam_flag.hpp:73
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition sam_tag_dictionary.hpp:42
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition sam_tag_dictionary.hpp:39
constexpr std::vector< cigar > parse_cigar(std::string_view const cigar_str)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition io/sam_file/detail/cigar.hpp:87
@ none
None of the flags below are set.
constexpr auto take_until_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition take_until_view.hpp:585
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition istreambuf_view.hpp:104
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ ref_seq
The (reference) "sequence" information, usually a range of nucleotides or amino acids.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ bit_score
The bit score (statistical significance indicator), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
std::string make_printable(char const c)
Returns a printable value for the given character c.
Definition pretty_print.hpp:45
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:60
constexpr auto is_space
Checks whether c is a space character.
Definition predicate.hpp:122
seqan::stl::ranges::to to
Converts a range to a container. <dl class="no-api">This entity is not part of the SeqAn API....
Definition to.hpp:23
Provides the seqan3::sam_file_header class.
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition to_string.hpp:26
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
T push_back(T... args)
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
T size(T... args)
Provides seqan3::views::slice.
T starts_with(T... args)
Thrown if information given to output format didn't match expectations.
Definition io/exception.hpp:88
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition io/exception.hpp:45
The options type defines various option members that influence the behaviour of all or some formats.
Definition sam_file/input_options.hpp:26
The options type defines various option members that influence the behavior of all or some formats.
Definition sam_file/output_options.hpp:23
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition sam_file/output_options.hpp:27
bool sam_require_header
Whether to require a header for SAM files.
Definition sam_file/output_options.hpp:41
The options type defines various option members that influence the behaviour of all or some formats.
Definition sequence_file/input_options.hpp:24
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition sequence_file/input_options.hpp:26
The options type defines various option members that influence the behaviour of all or some formats.
Definition sequence_file/output_options.hpp:23
T substr(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::ranges::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)
Hide me