SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
format_sam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <iterator>
16#include <seqan3/std/ranges>
17#include <string>
18#include <vector>
19
38
39namespace seqan3
40{
41
116class format_sam : private detail::format_sam_base
117{
118public:
122 // construction cannot be noexcept because this class has a std::string variable as a quality string buffer.
123 format_sam() = default;
124 format_sam(format_sam const &) = default;
125 format_sam & operator=(format_sam const &) = default;
126 format_sam(format_sam &&) = default;
128 ~format_sam() = default;
129
131
134 {
135 { "sam" },
136 };
137
138protected:
139 template <typename stream_type, // constraints checked by file
140 typename seq_legal_alph_type,
141 typename seq_type, // other constraints checked inside function
142 typename id_type,
143 typename qual_type>
144 void read_sequence_record(stream_type & stream,
146 seq_type & sequence,
147 id_type & id,
148 qual_type & qualities);
149
150 template <typename stream_type, // constraints checked by file
151 typename seq_type, // other constraints checked inside function
152 typename id_type,
153 typename qual_type>
154 void write_sequence_record(stream_type & stream,
155 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
156 seq_type && sequence,
157 id_type && id,
158 qual_type && qualities);
159
160 template <typename stream_type, // constraints checked by file
161 typename seq_legal_alph_type,
162 typename ref_seqs_type,
163 typename ref_ids_type,
164 typename seq_type,
165 typename id_type,
166 typename offset_type,
167 typename ref_seq_type,
168 typename ref_id_type,
169 typename ref_offset_type,
170 typename align_type,
171 typename cigar_type,
172 typename flag_type,
173 typename mapq_type,
174 typename qual_type,
175 typename mate_type,
176 typename tag_dict_type,
177 typename e_value_type,
178 typename bit_score_type>
179 void read_alignment_record(stream_type & stream,
180 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
181 ref_seqs_type & ref_seqs,
183 seq_type & seq,
184 qual_type & qual,
185 id_type & id,
186 offset_type & offset,
187 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
188 ref_id_type & ref_id,
189 ref_offset_type & ref_offset,
190 align_type & align,
191 cigar_type & cigar_vector,
192 flag_type & flag,
193 mapq_type & mapq,
194 mate_type & mate,
195 tag_dict_type & tag_dict,
196 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
197 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
198
199 template <typename stream_type,
200 typename header_type,
201 typename seq_type,
202 typename id_type,
203 typename ref_seq_type,
204 typename ref_id_type,
205 typename align_type,
206 typename qual_type,
207 typename mate_type,
208 typename tag_dict_type,
209 typename e_value_type,
210 typename bit_score_type>
211 void write_alignment_record(stream_type & stream,
212 sam_file_output_options const & options,
213 header_type && header,
214 seq_type && seq,
215 qual_type && qual,
216 id_type && id,
217 int32_t const offset,
218 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
219 ref_id_type && ref_id,
220 std::optional<int32_t> ref_offset,
221 align_type && align,
222 std::vector<cigar> const & cigar_vector,
223 sam_flag const flag,
224 uint8_t const mapq,
225 mate_type && mate,
226 tag_dict_type && tag_dict,
227 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
228 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score));
229
230private:
232 std::string tmp_qual{};
233
235 static constexpr std::string_view dummy{};
236
238 sam_file_header<> default_header{};
239
241 bool ref_info_present_in_header{false};
242
244 std::string_view const & default_or(detail::ignore_t) const noexcept
245 {
246 return dummy;
247 }
248
250 template <typename t>
251 decltype(auto) default_or(t && v) const noexcept
252 {
253 return std::forward<t>(v);
254 }
255
256 using format_sam_base::read_field; // inherit read_field functions from format_base explicitly
257
258 template <typename stream_view_type, typename value_type>
259 void read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
260 stream_view_type && stream_view,
261 value_type value);
262
263 template <typename stream_view_type>
264 void read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
265 stream_view_type && stream_view);
266
267 template <typename stream_view_type>
268 void read_field(stream_view_type && stream_view, sam_tag_dictionary & target);
269
270 template <typename stream_it_t, std::ranges::forward_range field_type>
271 void write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value);
272
273 template <typename stream_it_t>
274 void write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value);
275
276 template <typename stream_it_t>
277 void write_tag_fields(stream_it_t & stream, sam_tag_dictionary const & tag_dict, char const separator);
278};
279
281template <typename stream_type, // constraints checked by file
282 typename seq_legal_alph_type,
283 typename seq_type, // other constraints checked inside function
284 typename id_type,
285 typename qual_type>
286inline void format_sam::read_sequence_record(stream_type & stream,
288 seq_type & sequence,
289 id_type & id,
290 qual_type & qualities)
291{
293
294 {
295 read_alignment_record(stream, align_options, std::ignore, default_header, sequence, qualities, id,
296 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore,
297 std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore);
298 }
299
300 if constexpr (!detail::decays_to_ignore_v<seq_type>)
301 if (std::ranges::distance(sequence) == 0)
302 throw parse_error{"The sequence information must not be empty."};
303 if constexpr (!detail::decays_to_ignore_v<id_type>)
304 if (std::ranges::distance(id) == 0)
305 throw parse_error{"The id information must not be empty."};
306
307 if (options.truncate_ids)
308 id = id | detail::take_until_and_consume(is_space) | views::to<id_type>;
309}
310
312template <typename stream_type, // constraints checked by file
313 typename seq_type, // other constraints checked inside function
314 typename id_type,
315 typename qual_type>
316inline void format_sam::write_sequence_record(stream_type & stream,
317 sequence_file_output_options const & SEQAN3_DOXYGEN_ONLY(options),
318 seq_type && sequence,
319 id_type && id,
320 qual_type && qualities)
321{
323 using default_mate_t = std::tuple<std::string_view, std::optional<int32_t>, int32_t>;
324
325 sam_file_output_options output_options;
326
328 output_options,
329 /*header*/ std::ignore,
330 /*seq*/ default_or(sequence),
331 /*qual*/ default_or(qualities),
332 /*id*/ default_or(id),
333 /*offset*/ 0,
334 /*ref_seq*/ std::string_view{},
335 /*ref_id*/ std::string_view{},
336 /*ref_offset*/ -1,
337 /*align*/ default_align_t{},
338 /*cigar_vector*/ std::vector<cigar>{},
339 /*flag*/ sam_flag::none,
340 /*mapq*/ 0,
341 /*mate*/ default_mate_t{},
342 /*tag_dict*/ sam_tag_dictionary{},
343 /*e_value*/ 0,
344 /*bit_score*/ 0);
345}
346
348template <typename stream_type, // constraints checked by file
349 typename seq_legal_alph_type,
350 typename ref_seqs_type,
351 typename ref_ids_type,
352 typename seq_type,
353 typename id_type,
354 typename offset_type,
355 typename ref_seq_type,
356 typename ref_id_type,
357 typename ref_offset_type,
358 typename align_type,
359 typename cigar_type,
360 typename flag_type,
361 typename mapq_type,
362 typename qual_type,
363 typename mate_type,
364 typename tag_dict_type,
365 typename e_value_type,
366 typename bit_score_type>
367inline void format_sam::read_alignment_record(stream_type & stream,
368 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
369 ref_seqs_type & ref_seqs,
371 seq_type & seq,
372 qual_type & qual,
373 id_type & id,
374 offset_type & offset,
375 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
376 ref_id_type & ref_id,
377 ref_offset_type & ref_offset,
378 align_type & align,
379 cigar_type & cigar_vector,
380 flag_type & flag,
381 mapq_type & mapq,
382 mate_type & mate,
383 tag_dict_type & tag_dict,
384 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
385 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
386{
387 static_assert(detail::decays_to_ignore_v<ref_offset_type> ||
388 detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
389 "The ref_offset must be a specialisation of std::optional.");
390
391 auto stream_view = detail::istreambuf(stream);
392 auto field_view = stream_view | detail::take_until_or_throw_and_consume(is_char<'\t'>);
393
394 // these variables need to be stored to compute the ALIGNMENT
395 int32_t ref_offset_tmp{};
396 std::ranges::range_value_t<decltype(header.ref_ids())> ref_id_tmp{};
397 [[maybe_unused]] int32_t offset_tmp{};
398 [[maybe_unused]] int32_t soft_clipping_end{};
399 [[maybe_unused]] std::vector<cigar> tmp_cigar_vector{};
400 [[maybe_unused]] int32_t ref_length{0}, seq_length{0}; // length of aligned part for ref and query
401
402 // Header
403 // -------------------------------------------------------------------------------------------------------------
404 if (is_char<'@'>(*std::ranges::begin(stream_view))) // we always read the header if present
405 {
406 read_header(stream_view, header, ref_seqs);
407
408 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // file has no records
409 return;
410 }
411
412 // Fields 1-5: ID FLAG REF_ID REF_OFFSET MAPQ
413 // -------------------------------------------------------------------------------------------------------------
414 read_field(field_view, id);
415
416 uint16_t flag_integral{};
417 read_field(field_view, flag_integral);
418 flag = sam_flag{flag_integral};
419
420 read_field(field_view, ref_id_tmp);
421 check_and_assign_ref_id(ref_id, ref_id_tmp, header, ref_seqs);
422
423 read_field(field_view, ref_offset_tmp);
424 --ref_offset_tmp; // SAM format is 1-based but SeqAn operates 0-based
425
426 if (ref_offset_tmp == -1)
427 ref_offset = std::nullopt; // indicates an unmapped read -> ref_offset is not set
428 else if (ref_offset_tmp > -1)
429 ref_offset = ref_offset_tmp;
430 else if (ref_offset_tmp < -1)
431 throw format_error{"No negative values are allowed for field::ref_offset."};
432
433 read_field(field_view, mapq);
434
435 // Field 6: CIGAR
436 // -------------------------------------------------------------------------------------------------------------
437 if constexpr (!detail::decays_to_ignore_v<align_type> || !detail::decays_to_ignore_v<cigar_type>)
438 {
439 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // no cigar information given
440 {
441 std::tie(tmp_cigar_vector, ref_length, seq_length) = detail::parse_cigar(field_view);
442 transfer_soft_clipping_to(tmp_cigar_vector, offset_tmp, soft_clipping_end);
443 // the actual cigar_vector is swapped with tmp_cigar_vector at the end to avoid copying
444 }
445 else
446 {
447 std::ranges::next(std::ranges::begin(field_view)); // skip '*'
448 }
449 }
450 else
451 {
452 detail::consume(field_view);
453 }
454
455 offset = offset_tmp;
456
457 // Field 7-9: (RNEXT PNEXT TLEN) = MATE
458 // -------------------------------------------------------------------------------------------------------------
459 if constexpr (!detail::decays_to_ignore_v<mate_type>)
460 {
461 std::ranges::range_value_t<decltype(header.ref_ids())> tmp_mate_ref_id{};
462 read_field(field_view, tmp_mate_ref_id); // RNEXT
463
464 if (tmp_mate_ref_id == "=") // indicates "same as ref id"
465 {
466 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
467 get<0>(mate) = ref_id;
468 else
469 check_and_assign_ref_id(get<0>(mate), ref_id_tmp, header, ref_seqs);
470 }
471 else
472 {
473 check_and_assign_ref_id(get<0>(mate), tmp_mate_ref_id, header, ref_seqs);
474 }
475
476 int32_t tmp_pnext{};
477 read_field(field_view, tmp_pnext); // PNEXT
478
479 if (tmp_pnext > 0)
480 get<1>(mate) = --tmp_pnext; // SAM format is 1-based but SeqAn operates 0-based.
481 else if (tmp_pnext < 0)
482 throw format_error{"No negative values are allowed at the mate mapping position."};
483 // tmp_pnext == 0 indicates an unmapped mate -> do not fill std::optional get<1>(mate)
484
485 read_field(field_view, get<2>(mate)); // TLEN
486 }
487 else
488 {
489 for (size_t i = 0; i < 3u; ++i)
490 {
491 detail::consume(field_view);
492 }
493 }
494
495 // Field 10: Sequence
496 // -------------------------------------------------------------------------------------------------------------
497 if (!is_char<'*'>(*std::ranges::begin(stream_view))) // sequence information is given
498 {
499 auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
500 auto seq_stream = field_view | std::views::transform([is_legal_alph] (char const c) // enforce legal alphabet
501 {
502 if (!is_legal_alph(c))
503 throw parse_error{std::string{"Encountered an unexpected letter: "} +
504 "char_is_valid_for<" +
505 detail::type_name_as_string<seq_legal_alph_type> +
506 "> evaluated to false on " +
507 detail::make_printable(c)};
508 return c;
509 });
510
511 if constexpr (detail::decays_to_ignore_v<seq_type>)
512 {
513 if constexpr (!detail::decays_to_ignore_v<align_type>)
514 {
516 "If you want to read ALIGNMENT but not SEQ, the alignment"
517 " object must store a sequence container at the second (query) position.");
518
519 if (!tmp_cigar_vector.empty()) // only parse alignment if cigar information was given
520 {
521
522 auto tmp_iter = std::ranges::begin(seq_stream);
523 std::ranges::advance(tmp_iter, offset_tmp);
524
525 for (; seq_length > 0; --seq_length) // seq_length is not needed anymore
526 {
527 get<1>(align).push_back(std::ranges::range_value_t<decltype(get<1>(align))>{}.assign_char(*tmp_iter));
528 ++tmp_iter;
529 }
530
531 std::ranges::advance(tmp_iter, soft_clipping_end);
532 }
533 else
534 {
535 get<1>(align) = std::remove_reference_t<decltype(get<1>(align))>{}; // empty container
536 }
537 }
538 else
539 {
540 detail::consume(seq_stream);
541 }
542 }
543 else
544 {
545 read_field(seq_stream, seq);
546
547 if constexpr (!detail::decays_to_ignore_v<align_type>)
548 {
549 if (!tmp_cigar_vector.empty()) // if no alignment info is given, the field::alignment should remain empty
550 {
551 assign_unaligned(get<1>(align),
552 seq | views::slice(static_cast<decltype(std::ranges::size(seq))>(offset_tmp),
553 std::ranges::size(seq) - soft_clipping_end));
554 }
555 }
556 }
557 }
558 else
559 {
560 std::ranges::next(std::ranges::begin(field_view)); // skip '*'
561 }
562
563 // Field 11: Quality
564 // -------------------------------------------------------------------------------------------------------------
565 auto const tab_or_end = is_char<'\t'> || is_char<'\r'> || is_char<'\n'>;
566 read_field(stream_view | detail::take_until_or_throw(tab_or_end), qual);
567
568 if constexpr (!detail::decays_to_ignore_v<seq_type> && !detail::decays_to_ignore_v<qual_type>)
569 {
570 if (std::ranges::distance(seq) != 0 && std::ranges::distance(qual) != 0 &&
571 std::ranges::distance(seq) != std::ranges::distance(qual))
572 {
573 throw format_error{detail::to_string("Sequence length (", std::ranges::distance(seq),
574 ") and quality length (", std::ranges::distance(qual),
575 ") must be the same.")};
576 }
577 }
578
579 // All remaining optional fields if any: SAM tags dictionary
580 // -------------------------------------------------------------------------------------------------------------
581 while (is_char<'\t'>(*std::ranges::begin(stream_view))) // read all tags if present
582 {
583 std::ranges::next(std::ranges::begin(stream_view)); // skip tab
584 read_field(stream_view | detail::take_until_or_throw(tab_or_end), tag_dict);
585 }
586
587 detail::consume(stream_view | detail::take_until(!(is_char<'\r'> || is_char<'\n'>))); // consume new line
588
589 // DONE READING - wrap up
590 // -------------------------------------------------------------------------------------------------------------
591 // Alignment object construction
592 // Note that the query sequence in get<1>(align) has already been filled while reading Field 10.
593 if constexpr (!detail::decays_to_ignore_v<align_type>)
594 {
595 int32_t ref_idx{(ref_id_tmp.empty()/*unmapped read?*/) ? -1 : 0};
596
597 if constexpr (!detail::decays_to_ignore_v<ref_seqs_type>)
598 {
599 if (!ref_id_tmp.empty())
600 {
601 assert(header.ref_dict.count(ref_id_tmp) != 0); // taken care of in check_and_assign_ref_id()
602 ref_idx = header.ref_dict[ref_id_tmp]; // get index for reference sequence
603 }
604 }
605
606 construct_alignment(align, tmp_cigar_vector, ref_idx, ref_seqs, ref_offset_tmp, ref_length);
607 }
608
609 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
610 std::swap(cigar_vector, tmp_cigar_vector);
611}
612
614template <typename stream_type,
615 typename header_type,
616 typename seq_type,
617 typename id_type,
618 typename ref_seq_type,
619 typename ref_id_type,
620 typename align_type,
621 typename qual_type,
622 typename mate_type,
623 typename tag_dict_type,
624 typename e_value_type,
625 typename bit_score_type>
626inline void format_sam::write_alignment_record(stream_type & stream,
627 sam_file_output_options const & options,
628 header_type && header,
629 seq_type && seq,
630 qual_type && qual,
631 id_type && id,
632 int32_t const offset,
633 ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
634 ref_id_type && ref_id,
635 std::optional<int32_t> ref_offset,
636 align_type && align,
637 std::vector<cigar> const & cigar_vector,
638 sam_flag const flag,
639 uint8_t const mapq,
640 mate_type && mate,
641 tag_dict_type && tag_dict,
642 e_value_type && SEQAN3_DOXYGEN_ONLY(e_value),
643 bit_score_type && SEQAN3_DOXYGEN_ONLY(bit_score))
644{
645 /* Note the following general things:
646 *
647 * - Given the SAM specifications, all fields may be empty
648 *
649 * - arithmetic values default to 0 while all others default to '*'
650 *
651 * - Because of the former, arithmetic values can be directly streamed
652 * into 'stream' as operator<< is defined for all arithmetic types
653 * and the default value (0) is also the SAM default.
654 *
655 * - All other non-arithmetic values need to be checked for emptiness
656 */
657
658 // ---------------------------------------------------------------------
659 // Type Requirements (as static asserts for user friendliness)
660 // ---------------------------------------------------------------------
661 static_assert((std::ranges::forward_range<seq_type> &&
663 "The seq object must be a std::ranges::forward_range over "
664 "letters that model seqan3::alphabet.");
665
666 static_assert((std::ranges::forward_range<id_type> &&
668 "The id object must be a std::ranges::forward_range over "
669 "letters that model seqan3::alphabet.");
670
671 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
672 {
673 static_assert((std::ranges::forward_range<ref_id_type> ||
674 std::integral<std::remove_reference_t<ref_id_type>> ||
675 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
676 "The ref_id object must be a std::ranges::forward_range "
677 "over letters that model seqan3::alphabet.");
678
679 if constexpr (std::integral<std::remove_cvref_t<ref_id_type>> ||
680 detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>)
681 static_assert(!detail::decays_to_ignore_v<header_type>,
682 "If you give indices as reference id information the header must also be present.");
683 }
684
686 "The align object must be a std::pair of two ranges whose "
687 "value_type is comparable to seqan3::gap");
688
689 static_assert((std::tuple_size_v<std::remove_cvref_t<align_type>> == 2 &&
690 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<0>(align))>> &&
691 std::equality_comparable_with<gap, std::ranges::range_reference_t<decltype(std::get<1>(align))>>),
692 "The align object must be a std::pair of two ranges whose "
693 "value_type is comparable to seqan3::gap");
694
695 static_assert((std::ranges::forward_range<qual_type> &&
697 "The qual object must be a std::ranges::forward_range "
698 "over letters that model seqan3::alphabet.");
699
701 "The mate object must be a std::tuple of size 3 with "
702 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
703 "2) a std::integral or std::optional<std::integral>, and "
704 "3) a std::integral.");
705
706 static_assert(((std::ranges::forward_range<decltype(std::get<0>(mate))> ||
707 std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
708 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>) &&
709 (std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>> ||
710 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<1>(mate))>, std::optional>) &&
711 std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
712 "The mate object must be a std::tuple of size 3 with "
713 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
714 "2) a std::integral or std::optional<std::integral>, and "
715 "3) a std::integral.");
716
717 if constexpr (std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>> ||
718 detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(std::get<0>(mate))>, std::optional>)
719 static_assert(!detail::decays_to_ignore_v<header_type>,
720 "If you give indices as mate reference id information the header must also be present.");
721
722 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
723 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
724
725 // ---------------------------------------------------------------------
726 // logical Requirements
727 // ---------------------------------------------------------------------
728 if constexpr (!detail::decays_to_ignore_v<header_type> &&
729 !detail::decays_to_ignore_v<ref_id_type> &&
730 !std::integral<std::remove_reference_t<ref_id_type>> &&
731 !detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
732 {
733
734 if (options.sam_require_header && !std::ranges::empty(ref_id))
735 {
736 auto id_it = header.ref_dict.end();
737
738 if constexpr (std::ranges::contiguous_range<decltype(ref_id)> &&
739 std::ranges::sized_range<decltype(ref_id)> &&
740 std::ranges::borrowed_range<decltype(ref_id)>)
741 {
742 id_it = header.ref_dict.find(std::span{std::ranges::data(ref_id), std::ranges::size(ref_id)});
743 }
744 else
745 {
746 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
747
749 "The ref_id type is not convertible to the reference id information stored in the "
750 "reference dictionary of the header object.");
751
752 id_it = header.ref_dict.find(ref_id);
753 }
754
755 if (id_it == header.ref_dict.end()) // no reference id matched
756 throw format_error{detail::to_string("The ref_id '", ref_id, "' was not in the list of references:",
757 header.ref_ids())};
758 }
759 }
760
761 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
762 throw format_error{"The ref_offset object must be a std::integral >= 0."};
763
764 // ---------------------------------------------------------------------
765 // Writing the Header on first call
766 // ---------------------------------------------------------------------
767 if constexpr (!detail::decays_to_ignore_v<header_type>)
768 {
769 if (options.sam_require_header && !header_was_written)
770 {
771 write_header(stream, options, header);
772 header_was_written = true;
773 }
774 }
775
776 // ---------------------------------------------------------------------
777 // Writing the Record
778 // ---------------------------------------------------------------------
779
780 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
781 constexpr char separator{'\t'};
782
783 write_range_or_asterisk(stream_it, id);
784 *stream_it = separator;
785
786 stream_it.write_number(static_cast<uint16_t>(flag));
787 *stream_it = separator;
788
789 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
790 {
791 if constexpr (std::integral<std::remove_reference_t<ref_id_type>>)
792 {
793 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id]);
794 }
795 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<ref_id_type>, std::optional>)
796 {
797 if (ref_id.has_value())
798 write_range_or_asterisk(stream_it, (header.ref_ids())[ref_id.value()]);
799 else
800 *stream_it = '*';
801 }
802 else
803 {
804 write_range_or_asterisk(stream_it, ref_id);
805 }
806 }
807 else
808 {
809 *stream_it = '*';
810 }
811
812 *stream_it = separator;
813
814 // SAM is 1 based, 0 indicates unmapped read if optional is not set
815 stream_it.write_number(ref_offset.value_or(-1) + 1);
816 *stream_it = separator;
817
818 stream_it.write_number(static_cast<unsigned>(mapq));
819 *stream_it = separator;
820
821 if (!std::ranges::empty(cigar_vector))
822 {
823 for (auto & c : cigar_vector) //TODO THIS IS PROBABLY TERRIBLE PERFORMANCE_WISE
824 stream_it.write_range(c.to_string());
825 }
826 else if (!std::ranges::empty(get<0>(align)) && !std::ranges::empty(get<1>(align)))
827 {
828 // compute possible distance from alignment end to sequence end
829 // which indicates soft clipping at the end.
830 // This should be replace by a free count_gaps function for
831 // aligned sequences which is more efficient if possible.
832 size_t off_end{std::ranges::size(seq) - offset};
833 for (auto chr : get<1>(align))
834 if (chr == gap{})
835 ++off_end;
836
837 // Might happen if get<1>(align) doesn't correspond to the reference.
838 assert(off_end >= std::ranges::size(get<1>(align)));
839 off_end -= std::ranges::size(get<1>(align));
840
841 write_range_or_asterisk(stream_it, detail::get_cigar_string(align, offset, off_end));
842 }
843 else
844 {
845 *stream_it = '*';
846 }
847
848 *stream_it = separator;
849
850 if constexpr (std::integral<std::remove_reference_t<decltype(get<0>(mate))>>)
851 {
852 write_range_or_asterisk(stream_it, (header.ref_ids())[get<0>(mate)]);
853 }
854 else if constexpr (detail::is_type_specialisation_of_v<std::remove_reference_t<decltype(get<0>(mate))>, std::optional>)
855 {
856 if (get<0>(mate).has_value())
857 // value_or(0) instead of value() (which is equivalent here) as a
858 // workaround for a ubsan false-positive in GCC8: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90058
859 write_range_or_asterisk(stream_it, header.ref_ids()[get<0>(mate).value_or(0)]);
860 else
861 *stream_it = '*';
862 }
863 else
864 {
865 write_range_or_asterisk(stream_it, get<0>(mate));
866 }
867
868 *stream_it = separator;
869
870 if constexpr (detail::is_type_specialisation_of_v<std::remove_cvref_t<decltype(get<1>(mate))>, std::optional>)
871 {
872 // SAM is 1 based, 0 indicates unmapped read if optional is not set
873 stream_it.write_number(get<1>(mate).value_or(-1) + 1);
874 *stream_it = separator;
875 }
876 else
877 {
878 stream_it.write_number(get<1>(mate));
879 *stream_it = separator;
880 }
881
882 stream_it.write_number(get<2>(mate));
883 *stream_it = separator;
884
885 write_range_or_asterisk(stream_it, seq);
886 *stream_it = separator;
887
888 write_range_or_asterisk(stream_it, qual);
889
890 write_tag_fields(stream_it, tag_dict, separator);
891
892 stream_it.write_end_of_line(options.add_carriage_return);
893}
894
895
913template <typename stream_view_type, typename value_type>
914inline void format_sam::read_sam_dict_vector(seqan3::detail::sam_tag_variant & variant,
915 stream_view_type && stream_view,
916 value_type value)
917{
918 std::vector<value_type> tmp_vector;
919 while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
920 {
921 read_field(stream_view | detail::take_until(is_char<','>), value);
922 tmp_vector.push_back(value);
923
924 if (is_char<','>(*std::ranges::begin(stream_view)))
925 std::ranges::next(std::ranges::begin(stream_view)); // skip ','
926 }
927 variant = std::move(tmp_vector);
928}
929
943template <typename stream_view_type>
944inline void format_sam::read_sam_byte_vector(seqan3::detail::sam_tag_variant & variant,
945 stream_view_type && stream_view)
946{
947 std::vector<std::byte> tmp_vector;
948 std::byte value;
949
950 while (std::ranges::begin(stream_view) != ranges::end(stream_view)) // not fully consumed yet
951 {
952 try
953 {
954 read_field(stream_view | detail::take_exactly_or_throw(2), value);
955 }
956 catch (std::exception const & e)
957 {
958 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
959 }
960
961 tmp_vector.push_back(value);
962 }
963
964 variant = std::move(tmp_vector);
965}
966
984template <typename stream_view_type>
985inline void format_sam::read_field(stream_view_type && stream_view, sam_tag_dictionary & target)
986{
987 /* Every SAM tag has the format "[TAG]:[TYPE_ID]:[VALUE]", where TAG is a two letter
988 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
989 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of comma separated
990 VALUE's and the inner value type is identified by the character following ':', one of [cCsSiIf].
991 */
992 uint16_t tag = static_cast<uint16_t>(*std::ranges::begin(stream_view)) << 8;
993 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
994 tag += static_cast<uint16_t>(*std::ranges::begin(stream_view));
995 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
996 std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
997 char type_id = *std::ranges::begin(stream_view);
998 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
999 std::ranges::next(std::ranges::begin(stream_view)); // skip ':'
1000
1001 switch (type_id)
1002 {
1003 case 'A' : // char
1004 {
1005 target[tag] = static_cast<char>(*std::ranges::begin(stream_view));
1006 std::ranges::next(std::ranges::begin(stream_view)); // skip char that has been read
1007 break;
1008 }
1009 case 'i' : // int32_t
1010 {
1011 int32_t tmp;
1012 read_field(stream_view, tmp);
1013 target[tag] = tmp;
1014 break;
1015 }
1016 case 'f' : // float
1017 {
1018 float tmp;
1019 read_field(stream_view, tmp);
1020 target[tag] = tmp;
1021 break;
1022 }
1023 case 'Z' : // string
1024 {
1025 target[tag] = stream_view | views::to<std::string>;
1026 break;
1027 }
1028 case 'H' :
1029 {
1030 read_sam_byte_vector(target[tag], stream_view);
1031 break;
1032 }
1033 case 'B' : // Array. Value type depends on second char [cCsSiIf]
1034 {
1035 char array_value_type_id = *std::ranges::begin(stream_view);
1036 std::ranges::next(std::ranges::begin(stream_view)); // skip char read before
1037 std::ranges::next(std::ranges::begin(stream_view)); // skip first ','
1038
1039 switch (array_value_type_id)
1040 {
1041 case 'c' : // int8_t
1042 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1043 break;
1044 case 'C' : // uint8_t
1045 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1046 break;
1047 case 's' : // int16_t
1048 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1049 break;
1050 case 'S' : // uint16_t
1051 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1052 break;
1053 case 'i' : // int32_t
1054 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1055 break;
1056 case 'I' : // uint32_t
1057 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1058 break;
1059 case 'f' : // float
1060 read_sam_dict_vector(target[tag], stream_view, float{});
1061 break;
1062 default:
1063 throw format_error{std::string("The first character in the numerical ") +
1064 "id of a SAM tag must be one of [cCsSiIf] but '" + array_value_type_id +
1065 "' was given."};
1066 }
1067 break;
1068 }
1069 default:
1070 throw format_error{std::string("The second character in the numerical id of a "
1071 "SAM tag must be one of [A,i,Z,H,B,f] but '") + type_id + "' was given."};
1072 }
1073}
1074
1082template <typename stream_it_t, std::ranges::forward_range field_type>
1083inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, field_type && field_value)
1084{
1085 if (std::ranges::empty(field_value))
1086 {
1087 *stream_it = '*';
1088 }
1089 else
1090 {
1091 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<field_type>>, char>)
1092 stream_it.write_range(field_value);
1093 else // convert from alphabets to their character representation
1094 stream_it.write_range(field_value | views::to_char);
1095 }
1096}
1097
1104template <typename stream_it_t>
1105inline void format_sam::write_range_or_asterisk(stream_it_t & stream_it, char const * const field_value)
1106{
1107 write_range_or_asterisk(stream_it, std::string_view{field_value});
1108}
1109
1117template <typename stream_it_t>
1118inline void format_sam::write_tag_fields(stream_it_t & stream_it, sam_tag_dictionary const & tag_dict, char const separator)
1119{
1120 auto const stream_variant_fn = [&stream_it] (auto && arg) // helper to print a std::variant
1121 {
1122 using T = std::remove_cvref_t<decltype(arg)>;
1123
1124 if constexpr (std::ranges::input_range<T>)
1125 {
1126 if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, char>)
1127 {
1128 stream_it.write_range(arg);
1129 }
1130 else if constexpr (std::same_as<std::remove_cvref_t<std::ranges::range_reference_t<T>>, std::byte>)
1131 {
1132 if (!std::ranges::empty(arg))
1133 {
1134 stream_it.write_number(std::to_integer<uint8_t>(*std::ranges::begin(arg)));
1135
1136 for (auto && elem : arg | std::views::drop(1))
1137 {
1138 *stream_it = ',';
1139 stream_it.write_number(std::to_integer<uint8_t>(elem));
1140 }
1141 }
1142 }
1143 else
1144 {
1145 if (!std::ranges::empty(arg))
1146 {
1147 stream_it.write_number(*std::ranges::begin(arg));
1148
1149 for (auto && elem : arg | std::views::drop(1))
1150 {
1151 *stream_it = ',';
1152 stream_it.write_number(elem);
1153 }
1154 }
1155 }
1156 }
1157 else if constexpr (std::same_as<std::remove_cvref_t<T>, char>)
1158 {
1159 *stream_it = arg;
1160 }
1161 else // number
1162 {
1163 stream_it.write_number(arg);
1164 }
1165 };
1166
1167 for (auto & [tag, variant] : tag_dict)
1168 {
1169 *stream_it = separator;
1170
1171 char const char0 = tag / 256;
1172 char const char1 = tag % 256;
1173
1174 *stream_it = char0;
1175 *stream_it = char1;
1176 *stream_it = ':';
1177 *stream_it = detail::sam_tag_type_char[variant.index()];
1178 *stream_it = ':';
1179
1180 if (detail::sam_tag_type_char_extra[variant.index()] != '\0')
1181 {
1182 *stream_it = detail::sam_tag_type_char_extra[variant.index()];
1183 *stream_it = ',';
1184 }
1185
1186 std::visit(stream_variant_fn, variant);
1187 }
1188}
1189
1190} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
T begin(T... args)
The SAM format (tag).
Definition: format_sam.hpp:117
~format_sam()=default
Defaulted.
format_sam & operator=(format_sam const &)=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, align_type &align, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:367
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_sam.hpp:134
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_sam.hpp:316
format_sam(format_sam &&)=default
Defaulted.
format_sam & operator=(format_sam &&)=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_sam.hpp:286
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, align_type &&align, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score)
Write the given fields to the specified stream.
Definition: format_sam.hpp:626
format_sam()=default
Defaulted.
format_sam(format_sam const &)=default
Defaulted.
The alphabet of a gap character '-'.
Definition: gap.hpp:39
Stores the header information of alignment files.
Definition: header.hpp:35
std::unordered_map< key_type, int32_t, std::hash< key_type >, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:160
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:121
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:337
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides the seqan3::format_sam_base that can be inherited from.
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:63
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
@ none
None of the flags below are set.
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr auto is_space
Checks whether c is a space character.
Definition: predicate.hpp:128
typename decltype(detail::split_after< i >(list_t{}))::second_type drop
Return a seqan3::type_list of the types in the input type list, except the first n.
Definition: traits.hpp:388
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: traits.hpp:471
constexpr size_t size
The size of a type pack.
Definition: traits.hpp:151
constexpr auto slice
A view adaptor that returns a half-open interval on the underlying range.
Definition: slice.hpp:183
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Resolves to std::ranges::implicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
The generic concept for a (biological) sequence.
Whether a type behaves like a tuple.
Auxiliary functions for the alignment IO.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
T push_back(T... args)
The <ranges> header from C++20's standard library.
Provides seqan3::sam_file_input_format and auxiliary classes.
Provides seqan3::sam_file_output_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
Provides seqan3::views::slice.
Thrown if information given to output format didn't match expectations.
Definition: exception.hpp:91
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition: exception.hpp:48
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: output_options.hpp:26
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommend...
Definition: output_options.hpp:30
bool sam_require_header
Whether to require a header for SAM files.
Definition: output_options.hpp:44
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition: input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:26
T swap(T... args)
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
T tie(T... args)
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides seqan3::tuple_like.
T visit(T... args)