SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <iterator>
16#include <ranges>
17#include <string>
18#include <vector>
19
38
39namespace seqan3
40{
41
116class format_sam : private detail::format_sam_base
117{
118public:
122 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
123 format_sam() = default;
124 format_sam(format_sam const &) = default;
125 format_sam & operator=(format_sam const &) = default;
126 format_sam(format_sam &&) = default;
128 ~format_sam() = default;
129
131
134 {"sam"},
135 };
136
137protected:
138 template <typename stream_type, // constraints checked by file
139 typename seq_legal_alph_type,
140 typename stream_pos_type,
141 typename seq_type, // other constraints checked inside function
142 typename id_type,
143 typename qual_type>
144 void read_sequence_record(stream_type & stream,
146 stream_pos_type & position_buffer,
147 seq_type & sequence,
148 id_type & id,
149 qual_type & qualities);
150
151 template <typename stream_type, // constraints checked by file
152 typename seq_type, // other constraints checked inside function
153 typename id_type,
154 typename qual_type>
155 void write_sequence_record(stream_type & stream,
156 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
157 seq_type && sequence,
158 id_type && id,
159 qual_type && qualities);
160
161 template <typename stream_type, // constraints checked by file
162 typename seq_legal_alph_type,
163 typename ref_seqs_type,
164 typename ref_ids_type,
165 typename stream_pos_type,
166 typename seq_type,
167 typename id_type,
168 typename offset_type,
169 typename ref_seq_type,
170 typename ref_id_type,
171 typename ref_offset_type,
172 typename align_type,
173 typename cigar_type,
174 typename flag_type,
175 typename mapq_type,
176 typename qual_type,
177 typename mate_type,
178 typename tag_dict_type,
179 typename e_value_type,
180 typename bit_score_type>
181 void read_alignment_record(stream_type & stream,
182 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
183 ref_seqs_type & ref_seqs,
185 stream_pos_type & position_buffer,
186 seq_type & seq,
187 qual_type & qual,
188 id_type & id,
189 offset_type & offset,
190 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
191 ref_id_type & ref_id,
192 ref_offset_type & ref_offset,
193 align_type & align,
194 cigar_type & cigar_vector,
195 flag_type & flag,
196 mapq_type & mapq,
197 mate_type & mate,
198 tag_dict_type & tag_dict,
199 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
200 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
201
202 template <typename stream_type,
203 typename header_type,
204 typename seq_type,
205 typename id_type,
206 typename ref_seq_type,
207 typename ref_id_type,
208 typename align_type,
209 typename qual_type,
210 typename mate_type,
211 typename tag_dict_type,
212 typename e_value_type,
213 typename bit_score_type>
214 void write_alignment_record(stream_type & stream,
215 sam_file_output_options const & options,
216 header_type && header,
217 seq_type && seq,
218 qual_type && qual,
219 id_type && id,
220 int32_t const offset,
221 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
222 ref_id_type && ref_id,
223 std::optional<int32_t> ref_offset,
224 align_type && align,
225 std::vector<cigar> const & cigar_vector,
226 sam_flag const flag,
227 uint8_t const mapq,
228 mate_type && mate,
229 tag_dict_type && tag_dict,
230 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
231 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
232
233private:
235 std::string tmp_qual{};
236
238 static constexpr std::string_view dummy{};
239
241 sam_file_header<> default_header{};
242
244 bool ref_info_present_in_header{false};
245
247 std::string_view const & default_or(detail::ignore_t) const noexcept
248 {
249 return dummy;
250 }
251
253 template <typename t>
254 decltype(auto) default_or(t && v) const noexcept
255 {
256 return std::forward<t>(v);
257 }
258
259 template <typename stream_view_type, arithmetic value_type>
260 void
261 read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view, value_type value);
262
263 template <typename stream_view_type>
264 void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view);
265
266 template <typename stream_view_type>
267 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
268
269 template <typename stream_it_t, std::ranges::forward_range field_type>
270 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
271
272 template <typename stream_it_t>
273 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
274
275 template <typename stream_it_t>
276 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
277};
278
280template <typename stream_type, // constraints checked by file
281 typename seq_legal_alph_type,
282 typename stream_pos_type,
283 typename seq_type, // other constraints checked inside function
284 typename id_type,
285 typename qual_type>
286inline void format_sam::read_sequence_record(stream_type & stream,
288 stream_pos_type & position_buffer,
289 seq_type & sequence,
290 id_type & id,
291 qual_type & qualities)
292{
294
295 {
297 align_options,
298 std::ignore,
299 default_header,
300 position_buffer,
301 sequence,
302 qualities,
303 id,
304 std::ignore,
305 std::ignore,
306 std::ignore,
307 std::ignore,
308 std::ignore,
309 std::ignore,
310 std::ignore,
311 std::ignore,
312 std::ignore,
313 std::ignore,
314 std::ignore,
315 std::ignore);
316 }
317
318 if constexpr (!detail::decays_to_ignore_v<seq_type>)
319 if (std::ranges::distance(sequence) == 0)
320 throw parse_error{"The sequence information must not be empty."};
321 if constexpr (!detail::decays_to_ignore_v<id_type>)
322 if (std::ranges::distance(id) == 0)
323 throw parse_error{"The id information must not be empty."};
324
325 if (options.truncate_ids)
326 id = id | detail::take_until_and_consume(is_space) | ranges::to<id_type>();
327}
328
330template <typename stream_type, // constraints checked by file
331 typename seq_type, // other constraints checked inside function
332 typename id_type,
333 typename qual_type>
334inline void format_sam::write_sequence_record(stream_type & stream,
335 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
336 seq_type && sequence,
337 id_type && id,
338 qual_type && qualities)
339{
341 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
342
343 sam_file_output_options output_options;
344
346 output_options,
347 /*header*/ std::ignore,
348 /*seq*/ default_or(sequence),
349 /*qual*/ default_or(qualities),
350 /*id*/ default_or(id),
351 /*offset*/ 0,
352 /*ref_seq*/ std::string_view{},
353 /*ref_id*/ std::string_view{},
354 /*ref_offset*/ -1,
355 /*align*/ default_align_t{},
356 /*cigar_vector*/ std::vector<cigar>{},
357 /*flag*/ sam_flag::none,
358 /*mapq*/ 0,
359 /*mate*/ default_mate_t{},
360 /*tag_dict*/ sam_tag_dictionary{},
361 /*e_value*/ 0,
362 /*bit_score*/ 0);
363}
364
366template <typename stream_type, // constraints checked by file
367 typename seq_legal_alph_type,
368 typename ref_seqs_type,
369 typename ref_ids_type,
370 typename stream_pos_type,
371 typename seq_type,
372 typename id_type,
373 typename offset_type,
374 typename ref_seq_type,
375 typename ref_id_type,
376 typename ref_offset_type,
377 typename align_type,
378 typename cigar_type,
379 typename flag_type,
380 typename mapq_type,
381 typename qual_type,
382 typename mate_type,
383 typename tag_dict_type,
384 typename e_value_type,
385 typename bit_score_type>
386inline void
388 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
389 ref_seqs_type & ref_seqs,
391 stream_pos_type & position_buffer,
392 seq_type & seq,
393 qual_type & qual,
394 id_type & id,
395 offset_type & offset,
396 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
397 ref_id_type & ref_id,
398 ref_offset_type & ref_offset,
399 align_type & align,
400 cigar_type & cigar_vector,
401 flag_type & flag,
402 mapq_type & mapq,
403 mate_type & mate,
404 tag_dict_type & tag_dict,
405 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
406 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
407{
408 static_assert(detail::decays_to_ignore_v<ref_offset_type>
409 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
410 "The ref_offset must be a specialisation of std::optional.");
411
412 auto stream_view = detail::istreambuf(stream);
413 auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
414
415 // these variables need to be stored to compute the ALIGNMENT
416 int32_t ref_offset_tmp{};
417 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
418 [[maybe_unused]] int32_t offset_tmp{};
419 [[maybe_unused]] int32_t soft_clipping_end{};
420 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
421 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
422
423 // Header
424 // -------------------------------------------------------------------------------------------------------------
425 if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
426 {
427 read_header(stream_view, header, ref_seqs);
428
429 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
430 return;
431 }
432
433 // Store the current file position in the buffer.
434 position_buffer = stream.tellg();
435
436 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
437 // -------------------------------------------------------------------------------------------------------------
438 if constexpr (!detail::decays_to_ignore_v<id_type>)
439 read_forward_range_field(field_view, id);
440 else
441 detail::consume(field_view);
442
443 uint16_t flag_integral{};
444 read_arithmetic_field(field_view, flag_integral);
445 flag = sam_flag{flag_integral};
446
447 read_forward_range_field(field_view, ref_id_tmp);
448 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
449
450 read_arithmetic_field(field_view, ref_offset_tmp);
451 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
452
453 if (ref_offset_tmp == -1)
454 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
455 else if (ref_offset_tmp > -1)
456 ref_offset = ref_offset_tmp;
457 else if (ref_offset_tmp < -1)
458 throw format_error{"No negative values are allowed for field::ref_offset."};
459
460 if constexpr (!detail::decays_to_ignore_v<mapq_type>)
461 read_arithmetic_field(field_view, mapq);
462 else
463 detail::consume(field_view);
464
465 // Field 6: CIGAR
466 // -------------------------------------------------------------------------------------------------------------
467 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
468 {
469 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
470 {
471 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
472 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
473 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
474 }
475 else
476 {
477 ++std::ranges::begin(field_view); // skip '*'
478 }
479 }
480 else
481 {
482 detail::consume(field_view);
483 }
484
485 offset = offset_tmp;
486
487 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
488 // -------------------------------------------------------------------------------------------------------------
489 if constexpr (!detail::decays_to_ignore_v<mate_type>)
490 {
491 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
492 read_forward_range_field(field_view, tmp_mate_ref_id); // RNEXT
493
494 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
495 {
496 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
497 get<0>(mate) = ref_id;
498 else
499 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
500 }
501 else
502 {
503 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
504 }
505
506 int32_t tmp_pnext{};
507 read_arithmetic_field(field_view, tmp_pnext); // PNEXT
508
509 if (tmp_pnext > 0)
510 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
511 else if (tmp_pnext < 0)
512 throw format_error{"No negative values are allowed at the mate mapping position."};
513 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
514
515 read_arithmetic_field(field_view, get<2>(mate)); // TLEN
516 }
517 else
518 {
519 for (size_t i = 0; i < 3u; ++i)
520 {
521 detail::consume(field_view);
522 }
523 }
524
525 // Field 10: Sequence
526 // -------------------------------------------------------------------------------------------------------------
527 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
528 {
529 constexpr auto is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
530 auto seq_stream =
531 field_view
533 [is_legal_alph](char const c) // enforce legal alphabet
534 {
535 if (!is_legal_alph(c))
536 throw parse_error{std::string{"Encountered an unexpected letter: "} + "char_is_valid_for<"
537 + detail::type_name_as_string<seq_legal_alph_type>
538 + "> evaluated to false on " + detail::make_printable(c)};
539 return c;
540 });
541
542 if constexpr (detail::decays_to_ignore_v<seq_type>)
543 {
544 if constexpr (!detail::decays_to_ignore_v<align_type>)
545 {
547 "If you want to read ALIGNMENT but not SEQ, the alignment"
548 " object must store a sequence container at the second (query) position.");
549
550 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
551 {
552
553 auto tmp_iter = std::ranges::begin(seq_stream);
554 std::ranges::advance(tmp_iter, offset_tmp);
555
556 for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
557 {
558 get<1>(align).push_back(
559 std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
560 ++tmp_iter;
561 }
562
563 std::ranges::advance(tmp_iter, soft_clipping_end);
564 }
565 else
566 {
567 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
568 }
569 }
570 else
571 {
572 detail::consume(seq_stream);
573 }
574 }
575 else
576 {
577 read_forward_range_field(seq_stream, seq);
578
579 if constexpr (!detail::decays_to_ignore_v<align_type>)
580 {
581 if (!tmp_cigar_vector
582 .empty()) // if no alignment info is given, the field::alignment should remain empty
583 {
584 assign_unaligned(get<1>(align),
585 seq
586 | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
587 std::ranges::size(seq) - soft_clipping_end));
588 }
589 }
590 }
591 }
592 else
593 {
594 ++std::ranges::begin(field_view); // skip '*'
595 }
596
597 // Field 11: Quality
598 // -------------------------------------------------------------------------------------------------------------
599 auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
600 auto qual_view = stream_view | detail::take_until_or_throw(tab_or_end);
601 if constexpr (!detail::decays_to_ignore_v<qual_type>)
602 read_forward_range_field(qual_view, qual);
603 else
604 detail::consume(qual_view);
605
606 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
607 {
608 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0
609 && std::ranges::distance(seq) != std::ranges::distance(qual))
610 {
611 throw format_error{detail::to_string("Sequence length (",
612 std::ranges::distance(seq),
613 ") and quality length (",
614 std::ranges::distance(qual),
615 ") must be the same.")};
616 }
617 }
618
619 // All remaining optional fields if any: SAM tags dictionary
620 // -------------------------------------------------------------------------------------------------------------
621 while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
622 {
623 ++std::ranges::begin(stream_view); // skip tab
624 auto stream_until_tab_or_end = stream_view | detail::take_until_or_throw(tab_or_end);
625 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
626 read_sam_dict_field(stream_until_tab_or_end, tag_dict);
627 else
628 detail::consume(stream_until_tab_or_end);
629 }
630
631 detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
632
633 // DONE READING - wrap up
634 // -------------------------------------------------------------------------------------------------------------
635 // Alignment object construction
636 // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
637 if constexpr (!detail::decays_to_ignore_v<align_type>)
638 {
639 int32_t ref_idx{(ref_id_tmp.empty() /*unmapped read?*/) ? -1 : 0};
640
641 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
642 {
643 if (!ref_id_tmp.empty())
644 {
645 assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
646 ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
647 }
648 }
649
650 construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
651 }
652
653 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
654 std::swap(cigar_vector, tmp_cigar_vector);
655}
656
658template <typename stream_type,
659 typename header_type,
660 typename seq_type,
661 typename id_type,
662 typename ref_seq_type,
663 typename ref_id_type,
664 typename align_type,
665 typename qual_type,
666 typename mate_type,
667 typename tag_dict_type,
668 typename e_value_type,
669 typename bit_score_type>
670inline void format_sam::write_alignment_record(stream_type & stream,
671 sam_file_output_options const & options,
672 header_type && header,
673 seq_type && seq,
674 qual_type && qual,
675 id_type && id,
676 int32_t const offset,
677 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
678 ref_id_type && ref_id,
679 std::optional<int32_t> ref_offset,
680 align_type && align,
681 std::vector<cigar> const & cigar_vector,
682 sam_flag const flag,
683 uint8_t const mapq,
684 mate_type && mate,
685 tag_dict_type && tag_dict,
686 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
687 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
688{
689 /* Note the following general things:
690 *
691 * - Given the SAM specifications, all fields may be empty
692 *
693 * - arithmetic values default to 0 while all others default to '*'
694 *
695 * - Because of the former, arithmetic values can be directly streamed
696 * into 'stream' as operator<< is defined for all arithmetic types
697 * and the default value (0) is also the SAM default.
698 *
699 * - All other non-arithmetic values need to be checked for emptiness
700 */
701
702 // ---------------------------------------------------------------------
703 // Type Requirements (as static asserts for user friendliness)
704 // ---------------------------------------------------------------------
705 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
706 "The seq object must be a std::ranges::forward_range over "
707 "letters that model seqan3::alphabet.");
708
709 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
710 "The id object must be a std::ranges::forward_range over "
711 "letters that model seqan3::alphabet.");
712
713 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
714 {
715 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
716 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
717 "The ref_id object must be a std::ranges::forward_range "
718 "over letters that model seqan3::alphabet.");
719
720 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>>
721 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
722 static_assert(!detail::decays_to_ignore_v<header_type>,
723 "If you give indices as reference id information the header must also be present.");
724 }
725
727 "The align object must be a std::pair of two ranges whose "
728 "value_type is comparable to seqan3::gap");
729
730 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2
731 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>>
732 && std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
733 "The align object must be a std::pair of two ranges whose "
734 "value_type is comparable to seqan3::gap");
735
736 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
737 "The qual object must be a std::ranges::forward_range "
738 "over letters that model seqan3::alphabet.");
739
741 "The mate object must be a std::tuple of size 3 with "
742 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
743 "2) a std::integral or std::optional<std::integral>, and "
744 "3) a std::integral.");
745
746 static_assert(
747 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
748 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
749 || detail::is_type_specialisation_of_v<
750 std::remove_cvref_t<decltype(std::get<0>(mate))>,
751 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
752 || detail::is_type_specialisation_of_v<
753 std::remove_cvref_t<decltype(std::get<1>(mate))>,
754 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
755 "The mate object must be a std::tuple of size 3 with "
756 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
757 "2) a std::integral or std::optional<std::integral>, and "
758 "3) a std::integral.");
759
760 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
761 || detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>,
763 static_assert(!detail::decays_to_ignore_v<header_type>,
764 "If you give indices as mate reference id information the header must also be present.");
765
766 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
767 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
768
769 // ---------------------------------------------------------------------
770 // logical Requirements
771 // ---------------------------------------------------------------------
772 if constexpr (!detail::decays_to_ignore_v<header_type> && !detail::decays_to_ignore_v<ref_id_type>
773 && !std::integral<std::remove_reference_t<ref_id_type>>
774 && !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
775 {
776
777 if (options.sam_require_header && !std::ranges::empty(ref_id))
778 {
779 auto id_it = header.ref_dict.end();
780
781 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> && std::ranges::sized_range<decltype(ref_id)>
782 && std::ranges::borrowed_range<decltype(ref_id)>)
783 {
784 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
785 }
786 else
787 {
788 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
789
791 "The ref_id type is not convertible to the reference id information stored in the "
792 "reference dictionary of the header object.");
793
794 id_it = header.ref_dict.find(ref_id);
795 }
796
797 if (id_it == header.ref_dict.end()) // no reference id matched
798 throw format_error{detail::to_string("The ref_id '",
799 ref_id,
800 "' was not in the list of references:",
801 header.ref_ids())};
802 }
803 }
804
805 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
806 throw format_error{"The ref_offset object must be a std::integral >= 0."};
807
808 // ---------------------------------------------------------------------
809 // Writing the Header on first call
810 // ---------------------------------------------------------------------
811 if constexpr (!detail::decays_to_ignore_v<header_type>)
812 {
813 if (options.sam_require_header && !header_was_written)
814 {
815 write_header(stream, options, header);
816 header_was_written = true;
817 }
818 }
819
820 // ---------------------------------------------------------------------
821 // Writing the Record
822 // ---------------------------------------------------------------------
823
824 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
825 constexpr char separator{'\t'};
826
827 write_range_or_asterisk(stream_it, id);
828 *stream_it = separator;
829
830 stream_it.write_number(static_cast<uint16_t>(flag));
831 *stream_it = separator;
832
833 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
834 {
835 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
836 {
837 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
838 }
839 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
840 {
841 if (ref_id.has_value())
842 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
843 else
844 *stream_it = '*';
845 }
846 else
847 {
848 write_range_or_asterisk(stream_it, ref_id);
849 }
850 }
851 else
852 {
853 *stream_it = '*';
854 }
855
856 *stream_it = separator;
857
858 // SAM is 1 based, 0 indicates unmapped read if optional is not set
859 stream_it.write_number(ref_offset.value_or(-1) + 1);
860 *stream_it = separator;
861
862 stream_it.write_number(static_cast<unsigned>(mapq));
863 *stream_it = separator;
864
865 if (!std::ranges::empty(cigar_vector))
866 {
867 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
868 stream_it.write_range(c.to_string());
869 }
870 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
871 {
872 // compute possible distance from alignment end to sequence end
873 // which indicates soft clipping at the end.
874 // This should be replace by a free count_gaps function for
875 // aligned sequences which is more efficient if possible.
876 size_t off_end{std::ranges::size(seq) - offset};
877 for (auto chr : get<1>(align))
878 if (chr == gap{})
879 ++off_end;
880
881 // Might happen if get<1>(align) doesn't correspond to the reference.
882 assert(off_end >= std::ranges::size(get<1>(align)));
883 off_end -= std::ranges::size(get<1>(align));
884
885 write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
886 }
887 else
888 {
889 *stream_it = '*';
890 }
891
892 *stream_it = separator;
893
894 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
895 {
896 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
897 }
898 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>,
900 {
901 if (get<0>(mate).has_value())
902 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value()]);
903 else
904 *stream_it = '*';
905 }
906 else
907 {
908 write_range_or_asterisk(stream_it, get<0>(mate));
909 }
910
911 *stream_it = separator;
912
913 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
914 {
915 // SAM is 1 based, 0 indicates unmapped read if optional is not set
916 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
917 *stream_it = separator;
918 }
919 else
920 {
921 stream_it.write_number(get<1>(mate));
922 *stream_it = separator;
923 }
924
925 stream_it.write_number(get<2>(mate));
926 *stream_it = separator;
927
928 write_range_or_asterisk(stream_it, seq);
929 *stream_it = separator;
930
931 write_range_or_asterisk(stream_it, qual);
932
933 write_tag_fields(stream_it, tag_dict, separator);
934
935 stream_it.write_end_of_line(options.add_carriage_return);
936}
937
955template <typename stream_view_type, arithmetic value_type>
956inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
957 stream_view_type && stream_view,
958 value_type value)
959{
960 std::vector<value_type> tmp_vector;
961 while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
962 {
963 read_arithmetic_field(stream_view | detail::take_until(is_char<','>), value);
964 tmp_vector.push_back(value);
965
966 if (is_char<','>(*std::ranges::begin(stream_view)))
967 ++std::ranges::begin(stream_view); // skip ','
968 }
969 variant = std::move(tmp_vector);
970}
971
985template <typename stream_view_type>
986inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant, stream_view_type && stream_view)
987{
988 std::vector<std::byte> tmp_vector;
989 std::byte value;
990
991 while (std::ranges::begin(stream_view) != std::ranges::end(stream_view)) // not fully consumed yet
992 {
993 try
994 {
995 read_byte_field(stream_view | detail::take_exactly_or_throw(2), value);
996 }
997 catch (std::exception const & e)
998 {
999 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1000 }
1001
1002 tmp_vector.push_back(value);
1003 }
1004
1005 variant = std::move(tmp_vector);
1006}
1007
1025template <typename stream_view_type>
1026inline void format_sam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
1027{
1028 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
1029 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
1030 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
1031 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
1032 */
1033 uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
1034 ++std::ranges::begin(stream_view); // skip char read before
1035 tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
1036 ++std::ranges::begin(stream_view); // skip char read before
1037 ++std::ranges::begin(stream_view); // skip ':'
1038 char type_id = *std::ranges::begin(stream_view);
1039 ++std::ranges::begin(stream_view); // skip char read before
1040 ++std::ranges::begin(stream_view); // skip ':'
1041
1042 switch (type_id)
1043 {
1044 case 'A': // char
1045 {
1046 target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1047 ++std::ranges::begin(stream_view); // skip char that has been read
1048 break;
1049 }
1050 case 'i': // int32_t
1051 {
1052 int32_t tmp;
1053 read_arithmetic_field(stream_view, tmp);
1054 target[tag] = tmp;
1055 break;
1056 }
1057 case 'f': // float
1058 {
1059 float tmp;
1060 read_arithmetic_field(stream_view, tmp);
1061 target[tag] = tmp;
1062 break;
1063 }
1064 case 'Z': // string
1065 {
1066 target[tag] = stream_view | ranges::to<std::string>();
1067 break;
1068 }
1069 case 'H':
1070 {
1071 read_sam_byte_vector(target[tag], stream_view);
1072 break;
1073 }
1074 case 'B': // Array. Value type depends on second char [cCsSiIf]
1075 {
1076 char array_value_type_id = *std::ranges::begin(stream_view);
1077 ++std::ranges::begin(stream_view); // skip char read before
1078 ++std::ranges::begin(stream_view); // skip first ','
1079
1080 switch (array_value_type_id)
1081 {
1082 case 'c': // int8_t
1083 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1084 break;
1085 case 'C': // uint8_t
1086 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1087 break;
1088 case 's': // int16_t
1089 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1090 break;
1091 case 'S': // uint16_t
1092 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1093 break;
1094 case 'i': // int32_t
1095 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1096 break;
1097 case 'I': // uint32_t
1098 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1099 break;
1100 case 'f': // float
1101 read_sam_dict_vector(target[tag], stream_view, float{});
1102 break;
1103 default:
1104 throw format_error{std::string("The first character in the numerical ")
1105 + "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id
1106 + "' was given."};
1107 }
1108 break;
1109 }
1110 default:
1111 throw format_error{std::string("The second character in the numerical id of a "
1112 "SAM tag must be one of [A,i,Z,H,B,f] but '")
1113 + type_id + "' was given."};
1114 }
1115}
1116
1124template <typename stream_it_t, std::ranges::forward_range field_type>
1125inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1126{
1127 if (std::ranges::empty(field_value))
1128 {
1129 *stream_it = '*';
1130 }
1131 else
1132 {
1133 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1134 stream_it.write_range(field_value);
1135 else // convert from alphabets to their character representation
1136 stream_it.write_range(field_value | views::to_char);
1137 }
1138}
1139
1146template <typename stream_it_t>
1147inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1148{
1149 write_range_or_asterisk(stream_it, std::string_view{field_value});
1150}
1151
1159template <typename stream_it_t>
1160inline void
1161format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1162{
1163 auto const stream_variant_fn = [&stream_it](auto && arg) // helper to print a std::variant
1164 {
1165 using T = std::remove_cvref_t<decltype(arg)>;
1166
1167 if constexpr (std::ranges::input_range<T>)
1168 {
1169 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1170 {
1171 stream_it.write_range(arg);
1172 }
1173 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1174 {
1175 if (!std::ranges::empty(arg))
1176 {
1177 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1178
1179 for (auto && elem : arg | std::views::drop(1))
1180 {
1181 *stream_it = ',';
1182 stream_it.write_number(std::to_integer<uint8_t>(elem));
1183 }
1184 }
1185 }
1186 else
1187 {
1188 if (!std::ranges::empty(arg))
1189 {
1190 stream_it.write_number(*std::ranges::begin(arg));
1191
1192 for (auto && elem : arg | std::views::drop(1))
1193 {
1194 *stream_it = ',';
1195 stream_it.write_number(elem);
1196 }
1197 }
1198 }
1199 }
1200 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1201 {
1202 *stream_it = arg;
1203 }
1204 else // number
1205 {
1206 stream_it.write_number(arg);
1207 }
1208 };
1209
1210 for (auto & [tag, variant] : tag_dict)
1211 {
1212 *stream_it = separator;
1213
1214 char const char0 = tag / 256;
1215 char const char1 = tag % 256;
1216
1217 *stream_it = char0;
1218 *stream_it = char1;
1219 *stream_it = ':';
1220 *stream_it = detail::sam_tag_type_char[variant.index()];
1221 *stream_it = ':';
1222
1223 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1224 {
1225 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1226 *stream_it = ',';
1227 }
1228
1229 std::visit(stream_variant_fn, variant);
1230 }
1231}
1232
1233} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
The SAM format (tag).
Definition: format_sam.hpp:117
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=default
Defaulted.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:133
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:334
format_sam(format_sam &&)=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:286
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:670
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:387
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:125
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:395
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:470
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:146
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:178
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
T push_back(T... args)
Provides seqan3::ranges::to.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)