SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ > Class Template Reference

A class for reading SAM files, both SAM and its binary representation BAM are supported. More...

#include <seqan3/io/sam_file/input.hpp>

Public Types

Template arguments

Exposed as member types for public access.

using traits_type = traits_type_
 A traits type that defines aliases and template for storage of the fields.
 
using selected_field_ids = selected_field_ids_
 A seqan3::fields list with the fields selected for the record.
 
using valid_formats = valid_formats_
 A seqan3::type_list with the possible formats.
 
using stream_char_type = char
 Character type of the stream(s).
 
Field types and record type

These types are relevant for record/row-based reading; they may be manipulated via the traits_type to achieve different storage behaviour.

using sequence_type = typename traits_type::template sequence_container< typename traits_type::sequence_alphabet >
 The type of field::seq (default std::vector<seqan3::dna5>).
 
using id_type = typename traits_type::template id_container< char >
 The type of field::id (std::string by default).
 
using ref_sequence_type = std::conditional_t< std::same_as< typename traits_type::ref_sequences, ref_info_not_given >, dummy_ref_type, ref_sequence_sliced_type >
 The type of field::ref_seq (default depends on construction).
 
using ref_id_type = std::optional< int32_t >
 The type of field::ref_id is fixed to std::optional<int32_t>.
 
using ref_offset_type = std::optional< int32_t >
 The type of field::ref_offset is fixed to a std::optional<int32_t>.
 
using mapq_type = uint8_t
 The type of field::mapq is fixed to uint8_t.
 
using quality_type = typename traits_type::template quality_container< typename traits_type::quality_alphabet >
 The type of field::qual (default std::vector<seqan3::phred42>).
 
using flag_type = sam_flag
 The type of field::flag is fixed to seqan3::sam_flag.
 
using cigar_type = std::vector< cigar >
 The type of field::cigar is fixed to std::vector<cigar>.
 
using mate_type = std::tuple< ref_id_type, ref_offset_type, int32_t >
 The type of field::mate is fixed to std::tuple<ref_id_type, ref_offset_type, int32_t>.
 
using header_type = sam_file_header< typename traits_type::ref_ids >
 The type of field::header_ptr (default: sam_file_header<typename traits_type::ref_ids>).
 
using field_types = type_list< sequence_type, id_type, ref_id_type, ref_offset_type, std::vector< cigar >, mapq_type, quality_type, flag_type, mate_type, sam_tag_dictionary, header_type * >
 The previously defined types aggregated in a seqan3::type_list.
 
using field_ids = fields< field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr >
 The subset of seqan3::field tags valid for this file; order corresponds to the types in field_types.
 
using record_type = sam_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids >
 The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field types.
 
Range associated types

The types necessary to facilitate the behaviour of an input range (used in record-wise reading).

using value_type = record_type
 The value_type is the record_type.
 
using reference = record_type &
 The reference type.
 
using const_reference = void
 The const_reference type is void because files are not const-iterable.
 
using size_type = size_t
 An unsigned integer type, usually std::size_t.
 
using difference_type = std::make_signed_t< size_t >
 A signed integer type, usually std::ptrdiff_t.
 
using iterator = detail::in_file_iterator< sam_file_input >
 The iterator type of this view (an input iterator).
 
using const_iterator = void
 The const iterator type is void because files are not const-iterable.
 
using sentinel = std::default_sentinel_t
 The type returned by end().
 

Public Member Functions

header_type & header ()
 Access the file's header.
 
Constructors, destructor and assignment
 sam_file_input ()=delete
 Default constructor is explicitly deleted, you need to give a stream or file name.
 
 sam_file_input (sam_file_input const &)=delete
 Copy construction is explicitly deleted because you cannot have multiple access to the same file.
 
sam_file_input & operator= (sam_file_input const &)=delete
 Copy assignment is explicitly deleted because you cannot have multiple access to the same file.
 
 sam_file_input (sam_file_input &&)=default
 Move construction is defaulted.
 
sam_file_input & operator= (sam_file_input &&)=default
 Move assignment is defaulted.
 
 ~sam_file_input ()=default
 Destructor is defaulted.
 
 sam_file_input (std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from filename.
 
template<input_stream stream_t, sam_file_input_format file_format>
requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
 sam_file_input (stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from an existing stream and with specified format.
 
template<input_stream stream_t, sam_file_input_format file_format>
requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
 sam_file_input (stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
 
 sam_file_input (std::filesystem::path filename, typename traits_type::ref_ids &ref_ids, typename traits_type::ref_sequences &ref_sequences, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from filename and given additional reference information.
 
template<input_stream stream_t, sam_file_input_format file_format>
 sam_file_input (stream_t &stream, typename traits_type::ref_ids &ref_ids, typename traits_type::ref_sequences &ref_sequences, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 Construct from an existing stream and with specified format.
 
template<input_stream stream_t, sam_file_input_format file_format>
 sam_file_input (stream_t &&stream, typename traits_type::ref_ids &ref_ids, typename traits_type::ref_sequences &ref_sequences, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
 This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
 
Range interface

Provides functions for record based reading of the file.

iterator begin ()
 Returns an iterator to the current position in the file.
 
sentinel end () noexcept
 Returns a sentinel for comparison with iterator.
 
reference front () noexcept
 Return the record we are currently at in the file.
 

Public Attributes

sam_file_input_options< typename traits_type::sequence_legal_alphabet > options
 The options are public and their members can be set directly.
 

Related Symbols

(Note that these are not member symbols.)

Type deduction guides
template<input_stream stream_type, sam_file_input_format file_format, detail::fields_specialisation selected_field_ids>
 sam_file_input (stream_type &&stream, file_format const &, selected_field_ids const &) -> sam_file_input< typename sam_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
 Deduce selected fields, file_format, and default the rest.
 
template<input_stream stream_type, sam_file_input_format file_format, detail::fields_specialisation selected_field_ids>
 sam_file_input (stream_type &stream, file_format const &, selected_field_ids const &) -> sam_file_input< typename sam_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
 Deduce selected fields, file_format, and default the rest.
 
template<input_stream stream_type, sam_file_input_format file_format>
 sam_file_input (stream_type &&stream, file_format const &) -> sam_file_input< typename sam_file_input<>::traits_type, typename sam_file_input<>::selected_field_ids, type_list< file_format > >
 Deduce file_format, and default the rest.
 
template<input_stream stream_type, sam_file_input_format file_format>
 sam_file_input (stream_type &stream, file_format const &) -> sam_file_input< typename sam_file_input<>::traits_type, typename sam_file_input<>::selected_field_ids, type_list< file_format > >
 Deduce file_format, and default the rest.
 
template<std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t, detail::fields_specialisation selected_field_ids>
 sam_file_input (std::filesystem::path path, ref_ids_t &, ref_sequences_t &, selected_field_ids const &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, selected_field_ids, typename sam_file_input<>::valid_formats >
 Deduce selected fields, ref_sequences_t and ref_ids_t, default the rest.
 
template<std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t>
 sam_file_input (std::filesystem::path path, ref_ids_t &, ref_sequences_t &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, typename sam_file_input<>::selected_field_ids, typename sam_file_input<>::valid_formats >
 Deduce ref_sequences_t and ref_ids_t, default the rest.
 
template<input_stream stream_type, std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t, sam_file_input_format file_format, detail::fields_specialisation selected_field_ids>
 sam_file_input (stream_type &&stream, ref_ids_t &, ref_sequences_t &, file_format const &, selected_field_ids const &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, selected_field_ids, type_list< file_format > >
 Deduce selected fields, ref_sequences_t and ref_ids_t, and file format.
 
template<input_stream stream_type, std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t, sam_file_input_format file_format, detail::fields_specialisation selected_field_ids>
 sam_file_input (stream_type &stream, ref_ids_t &, ref_sequences_t &, file_format const &, selected_field_ids const &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, selected_field_ids, type_list< file_format > >
 Deduce selected fields, ref_sequences_t and ref_ids_t, and file format.
 
template<input_stream stream_type, std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t, sam_file_input_format file_format>
 sam_file_input (stream_type &&stream, ref_ids_t &, ref_sequences_t &, file_format const &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, typename sam_file_input<>::selected_field_ids, type_list< file_format > >
 Deduce ref_sequences_t and ref_ids_t, and file format.
 
template<input_stream stream_type, std::ranges::forward_range ref_ids_t, std::ranges::forward_range ref_sequences_t, sam_file_input_format file_format>
 sam_file_input (stream_type &stream, ref_ids_t &, ref_sequences_t &, file_format const &) -> sam_file_input< sam_file_input_default_traits< std::remove_reference_t< ref_sequences_t >, std::remove_reference_t< ref_ids_t > >, typename sam_file_input<>::selected_field_ids, type_list< file_format > >
 Deduce ref_sequences_t and ref_ids_t, and file format.
 

Detailed Description

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
class seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >

A class for reading SAM files, both SAM and its binary representation BAM are supported.

Template Parameters
traits_type: An auxiliary type that defines certain member types and constants, must model seqan3::sam_file_input_traits.
selected_field_ids: A seqan3::fields type with the list and order of desired record entries; all fields must be in seqan3::sam_file_input::field_ids.
valid_formats: A seqan3::type_list of the selectable formats (each must meet seqan3::sam_file_input_format).

Reading SAM files

Construction and specialisation

The seqan3::sam_file_input class comes with four constructors: one for construction from a file name and one for construction from an existing stream with a known format, each available with or without additional reference information. Constructing from a file name automatically picks the format based on the extension of the file name. Constructing from a stream can be used if you have a non-file stream, like std::cin or std::istringstream. It also comes in handy if you cannot use file-extension based detection, but know that your input file has a certain format.

Passing reference information, e.g.
  • ref_ids: The name of the references, e.g. "chr1", "chr2", ...
  • ref_sequences: The reference sequence information in the same order as the ref_ids.
comes in handy once you want to convert the CIGAR string, read from your file, into an actual alignment. This will be covered in the section "Transforming the CIGAR information into an actual alignment". In most cases the template parameters are deduced automatically:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <filesystem>
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
// Create the temporary file.
auto tmp_file = std::filesystem::temp_directory_path() / "my.sam";
std::ofstream tmp_stream{tmp_file};
tmp_stream << sam_file_raw;
tmp_stream.close();
seqan3::sam_file_input fin{tmp_file}; // SAM format assumed, regular std::ifstream taken as stream
}
A class for reading SAM files, both SAM and its binary representation BAM are supported.
Definition sam_file/input.hpp:236
T close(T... args)
T remove(T... args)
Provides seqan3::sam_file_input and corresponding traits classes.
T temp_directory_path(T... args)
Reading from a std::istringstream:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto input = R"(@HD VN:1.6 SO:coordinate
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *)";
int main()
{
// ^ no need to specify the template arguments
}
The SAM format (tag).
Definition format_sam.hpp:105
Note that this is not the same as writing sam_file_input<> (with angle brackets). In the latter case they are explicitly set to their default values, in the former case automatic deduction happens which chooses different parameters depending on the constructor arguments. For opening from file, sam_file_input<> would have also worked, but for opening from stream it would not have.

You can define your own traits type to further customise the types used by and returned by this class, see seqan3::sam_file_input_default_traits for more details. As mentioned above, specifying at least one template parameter yourself means that you lose automatic deduction. The following is equivalent to the automatic type deduction example with a stream from above:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto input = R"(@HD VN:1.6 SO:coordinate
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *)";
int main()
{
// The default types; you can adjust this list if you don't want to read all this data.
using default_fields = seqan3::fields<seqan3::field::seq,
// The expected format:
default_fields,
// Which formats are allowed:
sam_file_input_t fin{std::istringstream{input}, seqan3::format_sam{}};
}
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ cigar
The cigar vector (std::vector<seqan3::cigar>) representing the alignment in SAM/BAM format.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ header_ptr
A pointer to the seqan3::sam_file_header object storing header information.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ id
The identifier, usually a string.
@ tags
The optional tags in the SAM format, stored in a dictionary.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
A class template that holds a choice of seqan3::field.
Definition record.hpp:125
Type that contains multiple types.
Definition type_list.hpp:26
Provides seqan3::type_list.

Reading record-wise

You can iterate over this file record-wise:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
for (auto & rec : fin)
{
seqan3::debug_stream << "id: " << rec.id() << '\n';
seqan3::debug_stream << "read sequence: " << rec.sequence() << '\n';
seqan3::debug_stream << "mapping position: " << rec.reference_position() << '\n';
seqan3::debug_stream << "mapping quality: " << rec.mapping_quality() << '\n';
// there are more fields read on default
}
}
Provides seqan3::debug_stream and related types.
debug_stream_type debug_stream
A global instance of seqan3::debug_stream_type.
Definition debug_stream.hpp:37
In the above example, rec has the type seqan3::sam_file_input::record_type which is a specialisation of seqan3::record and behaves like a std::tuple (that's why we can access it via get). Instead of using the seqan3::field based interface on the record, you could also use std::get<0> or even std::get<dna5_vector> to retrieve the sequence, but it is not recommended, because it is more error-prone.
Note
It is important to write auto & and not just auto, otherwise you will copy the record on every iteration. Since the buffer gets "refilled" on every iteration, you can also move the data out of the record if you want to store it somewhere without copying:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
#include <utility>
#include <vector>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
using record_type = typename decltype(fin)::record_type;
std::vector<record_type> records{}; // store all my records in a vector
for (auto & rec : fin)
records.push_back(std::move(rec));
}
sam_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition sam_file/input.hpp:379
T move(T... args)
SeqAn specific customisations in the standard namespace.

Reading record-wise (custom fields)

If you want to skip specific fields from the record you can pass a non-empty fields trait object to the seqan3::sam_file_input constructor to select the fields that should be read from the input. For example, you may only be interested in the mapping flag and mapping quality of your SAM data to get some statistics. The following snippets demonstrate the usage of such a fields trait object.
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
for (auto & rec : fin)
{
seqan3::debug_stream << "flag: " << rec.flag() << '\n';
seqan3::debug_stream << "mapping quality: " << rec.mapping_quality() << '\n';
}
}
When reading a file, all fields not present in the file (but requested implicitly or via the selected_field_ids parameter) are ignored and the respective value in the record stays empty.

Reading record-wise (decomposed records)

Instead of using get on the record, you can also use structured bindings to decompose the record into its elements. Considering the example of reading only the flag and mapping quality like before you can also write:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
for (auto & [flag, mapq] : fin) // the order is the same as specified in fields!
{
seqan3::debug_stream << "flag: " << flag << '\n';
seqan3::debug_stream << "mapping quality: " << mapq << '\n';
}
}
In this case you immediately get the two elements of the tuple: flag of seqan3::sam_file_input::flag_type and mapq of seqan3::sam_file_input::mapq_type.
Note
But beware: with structured bindings you do need to get the order of elements correct!

Transforming the CIGAR information into an actual alignment

In SeqAn, we represent an alignment as a tuple of two seqan3::aligned_sequences. The conversion from a CIGAR string to an alignment can be done with the function seqan3::alignment_from_cigar. You need to pass the reference sequence with the position the read was aligned to and the read sequence. All of it is already in the record when reading a SAM file:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
using namespace seqan3::literals;
auto sam_file_raw = R"(@HD VN:1.6
@SQ SN:ref LN:34
read1 41 ref 1 61 1S1M1D1M1I ref 10 300 ACGT !##$ AS:i:2 NM:i:7
read2 42 ref 2 62 1H7M1D1M1S2H ref 10 300 AGGCTGNAG !##$&'()* xy:B:S,3,4,5
read3 43 ref 3 63 1S1M1P1M1I1M1I1D1M1S ref 10 300 GGAGTATA !!*+,-./
)";
int main()
{
// The reference sequence might be read from a different file.
seqan3::dna5_vector reference = "ACTGATCGAGAGGATCTAGAGGAGATCGTAGGAC"_dna5;
// You will probably read it from a file, e.g., like this:
// seqan3::sam_file_input fin{"test.sam"};
for (auto && rec : fin)
{
auto alignment =
alignment_from_cigar(rec.cigar_sequence(), reference, rec.reference_position().value(), rec.sequence());
}
// prints:
// (ACT-,C-GT)
// (CTGATCGAG,AGGCTGN-A)
// (T-G-A-TC,G-AGTA-T)
}
Provides the function seqan3::alignment_from_cigar.
record_type & reference
The reference type.
Definition sam_file/input.hpp:389
decltype(auto) reference_position() &&
(Reference) Sequence (seqan3::sam_record::reference_sequence) relative start position (0-based),...
Definition sam_file/record.hpp:202
auto alignment_from_cigar(std::vector< cigar > const &cigar_vector, reference_type const &reference, uint32_t const zero_based_reference_start_position, sequence_type const &query)
Construct an alignment from a CIGAR string and the corresponding sequences.
Definition alignment_from_cigar.hpp:81
@ alignment
The (pairwise) alignment stored in an object that models seqan3::detail::pairwise_alignment.
Meta-header for the IO / SAM File submodule.
The SeqAn namespace for literals.
The code will print the following:
(ACT-,C-GT)
(CTGATCGAG,AGGCTGN-A)
(T-G-A-TC,G-AGTA-T)

Views on files

Since SeqAn files are ranges, you can also create views over files. A useful example is to filter the records based on certain criteria, e.g. minimum length of the sequence field:
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <ranges>
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
auto minimum_length10_filter = std::views::filter(
[](auto const & rec)
{
return std::ranges::size(rec.sequence()) >= 10;
});
for (auto & rec : fin | minimum_length10_filter) // only records with sequence length >= 10 will "appear"
seqan3::debug_stream << rec.id() << '\n';
}
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26

End of file

You can check whether a file is at its end by comparing begin() and end() (if they are the same, the file is at its end).

Formats

We currently support reading the following formats:

Remarks
For a complete overview, take a look at SAM File

Member Typedef Documentation

◆ field_ids

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
using seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::field_ids = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>

The subset of seqan3::field tags valid for this file; order corresponds to the types in field_types.

The SAM file abstraction supports reading 10 different fields:

  1. seqan3::field::seq
  2. seqan3::field::id
  3. seqan3::field::ref_id
  4. seqan3::field::ref_offset
  5. seqan3::field::cigar
  6. seqan3::field::mapq
  7. seqan3::field::qual
  8. seqan3::field::flag
  9. seqan3::field::mate
  10. seqan3::field::tags

There exists one more field for SAM files, the seqan3::field::header_ptr, but this field is mostly used internally. Please see the seqan3::sam_file_input::header member function for details on how to access the seqan3::sam_file_header of the file.

◆ ref_id_type

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
using seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::ref_id_type = std::optional<int32_t>

The type of field::ref_id is fixed to std::optional<int32_t>.

To be consistent with the BAM format, the field::ref_id will hold the index to the actual reference information stored in the header. If a read is unmapped, the optional will remain valueless.

Attention
SeqAn3 transforms the 1-based SAM format position into a 0-based position.

◆ ref_offset_type

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
using seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::ref_offset_type = std::optional<int32_t>

The type of field::ref_offset is fixed to a std::optional<int32_t>.

The SAM format is 1-based and a 0 in the ref_offset field indicates an unmapped read. Since we convert 1-based positions to 0-based positions when reading the SAM format, we model the ref_offset_type as a std::optional. If the input value is 0, the std::optional will remain valueless.

◆ ref_sequence_type

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
using seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::ref_sequence_type = std::conditional_t<std::same_as<typename traits_type::ref_sequences, ref_info_not_given>, dummy_ref_type, ref_sequence_sliced_type>

The type of field::ref_seq (default depends on construction).

If no reference information is given on construction, this type deduces to a sized view that throws on access (since there is nothing to access anyway). If the reference information is given, the type is deduced to a view over the given input reference sequence type such that no sequence information is copied.

Constructor & Destructor Documentation

◆ sam_file_input() [1/4]

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::sam_file_input ( std::filesystem::path  filename,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from filename.

Parameters
[in] filename: Path to the file you wish to open.
[in] fields_tag: A seqan3::fields tag. [optional]
Exceptions
seqan3::file_open_error: If the file could not be opened, e.g. non-existent, non-readable, unknown format.

In addition to the file name, you may specify a custom seqan3::fields object (e.g. seqan3::fields<seqan3::field::seq>{}) which may be easier than defining all the template parameters.

Decompression

This constructor transparently applies a decompression stream on top of the file stream in case the file is detected as being compressed. See the section on compression and decompression for more information.

◆ sam_file_input() [2/4]

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
template<input_stream stream_t, sam_file_input_format file_format>
requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::sam_file_input ( stream_t &  stream,
file_format const &  format_tag,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from an existing stream and with specified format.

Template Parameters
stream_tThe stream type; must model seqan3::input_stream.
file_formatThe format of the file in the stream; must model seqan3::sam_file_input_format.
Parameters
[in]streamThe stream to operate on; must be derived from std::basic_istream.
[in]format_tagThe file format tag.
[in]fields_tagA seqan3::fields tag. [optional]

In addition to the stream and the format, you may specify a custom seqan3::fields object (e.g. seqan3::fields<seqan3::field::seq>{}) which may be easier than defining all the template parameters.

Decompression

This constructor transparently applies a decompression stream on top of the stream in case it is detected as being compressed. See the section on compression and decompression for more information.

◆ sam_file_input() [3/4]

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::sam_file_input ( std::filesystem::path  filename,
typename traits_type::ref_ids &  ref_ids,
typename traits_type::ref_sequences &  ref_sequences,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from filename and given additional reference information.

Parameters
[in]filenamePath to the file you wish to open.
[in]ref_idsA range containing the reference ids that correspond to the SAM/BAM file.
[in]ref_sequencesA range containing the reference sequences that correspond to the SAM/BAM file.
[in]fields_tagA seqan3::fields tag. [optional]
Exceptions
seqan3::file_open_errorIf the file could not be opened, e.g. non-existent, non-readable, unknown format.

Reference information

The reference information given by the IDs (names) and sequences will be used to keep the record entry seqan3::sam_file_input::record_type::reference_id() consistent with the order imposed by ref_ids. This way, you can use the value of seqan3::sam_file_input::record_type::reference_id() to access the lists ref_ids and ref_sequences to retrieve the correct information for the current record.

Selecting custom fields

In addition to the file name and reference information, you may specify a custom seqan3::fields object (e.g. seqan3::fields<seqan3::field::seq>{}) which may be easier than defining all the template parameters.

Decompression

This constructor transparently applies a decompression stream on top of the file stream in case the file is detected as being compressed. See the section on compression and decompression for more information.

◆ sam_file_input() [4/4]

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
template<input_stream stream_t, sam_file_input_format file_format>
seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::sam_file_input ( stream_t &  stream,
typename traits_type::ref_ids &  ref_ids,
typename traits_type::ref_sequences &  ref_sequences,
file_format const &  format_tag,
selected_field_ids const &  fields_tag = selected_field_ids{} 
)
inline

Construct from an existing stream with specified format and given additional reference information.

Template Parameters
stream_tThe stream type; must model seqan3::input_stream.
file_formatThe format of the file in the stream; must model seqan3::sam_file_input_format.
Parameters
[in]streamThe stream to operate on; must be derived from std::basic_istream.
[in]ref_idsA range containing the reference ids that correspond to the SAM/BAM file.
[in]ref_sequencesA range containing the reference sequences that correspond to the SAM/BAM file.
[in]format_tagThe file format tag.
[in]fields_tagA seqan3::fields tag. [optional]

Reference information

The reference information given by the IDs (names) and sequences will be used to keep the record entry seqan3::sam_file_input::record_type::reference_id() consistent with the order imposed by ref_ids. This way, you can use the value of seqan3::sam_file_input::record_type::reference_id() to access the lists ref_ids and ref_sequences to retrieve the correct information for the current record.

Selecting custom fields

In addition to the stream, reference information and format, you may specify a custom seqan3::fields object (e.g. seqan3::fields<seqan3::field::seq>{}) which may be easier than defining all the template parameters.

Decompression

This constructor transparently applies a decompression stream on top of the stream in case it is detected as being compressed. See the section on compression and decompression for more information.

Member Function Documentation

◆ begin()

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
iterator seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::begin ( )
inline

Returns an iterator to the current position in the file.

Returns
An iterator pointing to the current position in the file.
Exceptions
seqan3::format_error

Equals end() if the file is at end.

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
auto it = fin.begin();
// the following are equivalent:
auto & rec0 = *it;
auto & rec1 = fin.front();
std::cout << std::boolalpha << (rec0.id() == rec1.id()) << '\n'; // true
// Note: both become invalid after incrementing "it"!
}
T boolalpha(T... args)

Complexity

Constant.

Exceptions

Throws seqan3::format_error if the first record could not be read into the buffer.

◆ end()

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
sentinel seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::end ( )
inlinenoexcept

Returns a sentinel for comparison with iterator.

Returns
A sentinel that compares equal to an iterator once the file is at its end.

This element acts as a placeholder; attempting to dereference it results in undefined behaviour.

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ front()

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
reference seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::front ( )
inlinenoexcept

Return the record we are currently at in the file.

Returns
A reference to the currently buffered record.

This function returns a reference to the currently buffered record. It is identical to dereferencing begin(), and begin() always points to the current record on single-pass input ranges:

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
auto it = fin.begin();
// the following are equivalent:
auto & rec0 = *it;
auto & rec1 = fin.front();
std::cout << std::boolalpha << (rec0.id() == rec1.id()) << '\n'; // true
// Note: both become invalid after incrementing "it"!
}

In most situations, using the iterator interface or a range-based for-loop is preferable to using front(), because you can only move to the next record via the iterator.

In any case, don't forget the reference! If you want to save the data from the record elsewhere, use move:

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
auto rec = std::move(fin.front()); // rec now stores the data permanently
}

Complexity

Constant.

Exceptions

No-throw guarantee.

◆ header()

template<sam_file_input_traits traits_type_ = sam_file_input_default_traits<>, detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::ref_id, field::ref_offset, field::cigar, field::mapq, field::qual, field::flag, field::mate, field::tags, field::header_ptr>, detail::type_list_of_sam_file_input_formats valid_formats_ = type_list<format_sam, format_bam>>
header_type & seqan3::sam_file_input< traits_type_, selected_field_ids_, valid_formats_ >::header ( )
inline

Access the file's header.

You can access the header directly after the construction with reference information of the file object.

Example

// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: CC0-1.0
#include <sstream>
auto sam_file_raw = R"(@HD VN:1.6 SO:coordinate GO:none
@SQ SN:ref LN:45
r001 99 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTG *
r003 0 ref 29 30 5S6M * 0 0 GCCTAAGCTAA * SA:Z:ref,29,-,6H5M,17,0;
r003 2064 ref 29 17 6H5M * 0 0 TAGGC * SA:Z:ref,9,+,5S6M,30,1;
r001 147 ref 237 30 9M = 7 -39 CAGCGGCAT * NM:i:1
)";
int main()
{
// access the header information
seqan3::debug_stream << fin.header().format_version << '\n'; // 1.6
seqan3::debug_stream << fin.header().ref_dict << '\n'; // [(ref,(45,))] (this only works with seqan3::debug_stream!)
}
See also
seqan3::sam_file_header

The documentation for this class was generated from the following file:
Hide me