SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
|
The SAM format (tag). More...
#include <seqan3/io/sam_file/format_sam.hpp>
Public Member Functions | |
Constructors, destructor and assignment | |
format_sam ()=default | |
Defaulted. | |
format_sam (format_sam const &)=delete | |
Deleted. Header holds a unique_ptr. | |
format_sam & | operator= (format_sam const &)=delete |
Deleted. Header holds a unique_ptr. | |
format_sam (format_sam &&)=default | |
Defaulted. | |
format_sam & | operator= (format_sam &&)=default |
Defaulted. | |
~format_sam ()=default | |
Defaulted. | |
Static Public Attributes | |
static std::vector< std::string > | file_extensions |
The valid file extensions for this format; note that you can modify this value. | |
Protected Member Functions | |
template<typename stream_type , typename seq_legal_alph_type , typename ref_seqs_type , typename ref_ids_type , typename stream_pos_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename ref_offset_type , typename cigar_type , typename flag_type , typename mapq_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type > | |
void | read_alignment_record (stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score) |
Read from the specified stream and back-insert into the given field buffers. | |
template<typename stream_type , typename seq_legal_alph_type , typename stream_pos_type , typename seq_type , typename id_type , typename qual_type > | |
void | read_sequence_record (stream_type &stream, sequence_file_input_options< seq_legal_alph_type > const &options, stream_pos_type &position_buffer, seq_type &sequence, id_type &id, qual_type &qualities) |
Read from the specified stream and back-insert into the given field buffers. | |
template<typename stream_type , typename header_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type > | |
void | write_alignment_record (stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, std::vector< cigar > const &cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, e_value_type &&e_value, bit_score_type &&bit_score) |
Write the given fields to the specified stream. | |
template<typename stream_type , typename seq_type , typename id_type , typename qual_type > | |
void | write_sequence_record (stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities) |
Write the given fields to the specified stream. | |
Protected Member Functions inherited from seqan3::detail::format_sam_base | |
template<typename ref_id_type , typename ref_id_tmp_type , typename header_type , typename ref_seqs_type > | |
void | check_and_assign_ref_id (ref_id_type &ref_id, ref_id_tmp_type &ref_id_tmp, header_type &header, ref_seqs_type &) |
Checks for known reference ids or adds a new reference id and assigns a reference id to ref_id. | |
template<arithmetic arithmetic_target_type> | |
void | read_arithmetic_field (std::string_view const &str, arithmetic_target_type &arithmetic_target) |
Reads arithmetic fields using std::from_chars. | |
template<std::ranges::forward_range target_range_type> | |
void | read_forward_range_field (std::string_view const str, target_range_type &target) |
Reads from str to target, converting values with seqan3::views::char_to. | |
template<typename stream_view_type , std::ranges::forward_range target_range_type> | |
void | read_forward_range_field (stream_view_type &&stream_view, target_range_type &target) |
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to. | |
template<typename stream_view_type , typename ref_ids_type , typename ref_seqs_type , typename seq_legal_alph_type > | |
void | read_header (stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &, sam_file_input_options< seq_legal_alph_type > const &options) |
Reads the SAM header. | |
int32_t | soft_clipping_at_front (std::vector< cigar > const &cigar_vector) const |
Returns the soft clipping value at the front of the cigar_vector or 0 if none present. | |
template<typename stream_t , typename header_type > | |
void | write_header (stream_t &stream, sam_file_output_options const &options, header_type &header) |
Writes the SAM header. | |
format_sam_base ()=default | |
Defaulted. | |
format_sam_base (format_sam_base const &)=default | |
Defaulted. | |
format_sam_base & | operator= (format_sam_base const &)=default |
Defaulted. | |
format_sam_base (format_sam_base &&)=default | |
Defaulted. | |
format_sam_base & | operator= (format_sam_base &&)=default |
Defaulted. | |
~format_sam_base ()=default | |
Defaulted. | |
Private Member Functions | |
std::string_view const & | default_or (detail::ignore_t) const noexcept |
Returns a reference to dummy if passed a std::ignore. | |
template<typename t > | |
decltype(auto) | default_or (t &&v) const noexcept |
Returns the input unchanged. | |
void | read_sam_byte_vector (seqan3::detail::sam_tag_variant &variant, std::string_view const str) |
Reads a list of byte pairs as it is the case for SAM tag byte arrays. | |
void | read_sam_dict (std::string_view const tag_str, sam_tag_dictionary &target) |
Reads the optional tag fields into the seqan3::sam_tag_dictionary. | |
template<arithmetic value_type> | |
void | read_sam_dict_vector (seqan3::detail::sam_tag_variant &variant, std::string_view const str, value_type value) |
Reads a list of values separated by comma as it is the case for SAM tag arrays. | |
template<typename stream_it_t > | |
void | write_range_or_asterisk (stream_it_t &stream_it, char const *const field_value) |
Writes a field value to the stream. | |
template<typename stream_it_t , std::ranges::forward_range field_type> | |
void | write_range_or_asterisk (stream_it_t &stream_it, field_type &&field_value) |
Writes a field value to the stream. | |
template<typename stream_it_t > | |
void | write_tag_fields (stream_it_t &stream, sam_tag_dictionary const &tag_dict, char const separator) |
Writes the optional fields of the seqan3::sam_tag_dictionary. | |
Private Attributes | |
sam_file_header | default_header {} |
The default header for the alignment format. | |
std::array< std::string_view, 11 > | raw_record {} |
A buffer to store a raw record pointing into the stream buffer of the input. | |
std::string | tmp_qual {} |
Stores quality values temporarily if seq and qual information are combined (not supported by SAM yet). | |
Static Private Attributes | |
static constexpr std::string_view | dummy {} |
An empty dummy container to pass to align_format.write() such that an empty field is written. | |
Additional Inherited Members | |
Protected Attributes inherited from seqan3::detail::format_sam_base | |
std::array< char, 316 > | arithmetic_buffer {} |
A buffer used when parsing arithmetic values with std::from_chars. | |
bool | header_was_written {false} |
A variable that tracks whether the content of header has been written or not. | |
bool | ref_info_present_in_header {false} |
Tracks whether reference information (@SQ tag) was found in the SAM header. | |
Static Protected Attributes inherited from seqan3::detail::format_sam_base | |
static constexpr std::array | format_version {'1', '.', '6'} |
The format version string. | |
The SAM format (tag).
SAM is often used for storing alignments of several read sequences against one or more reference sequences. See the article on Wikipedia for an introduction to the format or look into the official SAM format specifications. SeqAn implements version 1.6 of the SAM specification.
Take a look at our tutorial SAM Input and Output in SeqAn for a walk through of how to read SAM/BAM files.
The SAM format provides the following fields: seqan3::field::seq, seqan3::field::qual, seqan3::field::id, seqan3::field::ref_seq, seqan3::field::ref_id, seqan3::field::ref_offset, seqan3::field::offset, seqan3::field::flag, seqan3::field::mapq and seqan3::field::mate. In addition there is the seqan3::field::header_ptr, which is usually only used internally to provide the range-based functionality of the file.
None of the fields are required when writing. If they are not given, a default value of '0' for numeric fields and '*' for other fields is used.
Since many users will be accustomed to the columns of the SAM format, here is a mapping of the common SAM format columns to the SeqAn record fields:
# | SAM Column ID | FIELD name |
---|---|---|
1 | QNAME | seqan3::field::id |
2 | FLAG | seqan3::field::flag |
3 | RNAME | seqan3::field::ref_id |
4 | POS | seqan3::field::ref_offset |
5 | MAPQ | seqan3::field::mapq |
6 | CIGAR | seqan3::field::cigar |
7 | RNEXT | seqan3::field::mate (tuple pos 0) |
8 | PNEXT | seqan3::field::mate (tuple pos 1) |
9 | TLEN | seqan3::field::mate (tuple pos 2) |
10 | SEQ | seqan3::field::seq |
11 | QUAL | seqan3::field::qual |
The format checks are implemented according to the official SAM format specifications in order to ensure correct SAM file output.
If a non-recoverable format violation is encountered on reading, or you specify invalid values/combinations when writing, seqan3::format_error is thrown.
The SAM header (if present) is read/written once in the beginning before the first record is read/written.
|
inline protected |
Read from the specified stream and back-insert into the given field buffers.
stream_type | The input stream type; must be derived from std::istream. |
stream_pos_type | Type of the position buffer, aka, the std::streampos of the current record. |
ref_seqs_type | e.g. std::deque<ref_sequence_type> or decltype(std::ignore). |
seq_type | Type of the seqan3::field::seq input (see seqan3::sam_file_input_traits). |
qual_type | Type of the seqan3::field::qual input (see seqan3::sam_file_input_traits). |
id_type | Type of the seqan3::field::id input (see seqan3::sam_file_input_traits). |
ref_seq_type | Type of the seqan3::field::ref_seq input (see seqan3::sam_file_input_traits). |
ref_id_type | Type of the seqan3::field::ref_id input (see seqan3::sam_file_input_traits). |
ref_offset_type | Type of the seqan3::field::ref_offset input (see seqan3::sam_file_input_traits). |
cigar_type | Type of the seqan3::field::cigar input (a std::vector<cigar> or std::ignore). |
flag_type | Type of the seqan3::field::flag input (see seqan3::sam_file_input_traits). |
mapq_type | Type of the seqan3::field::mapq input (see seqan3::sam_file_input_traits). |
mate_type | std::tuple<ref_id_type, ref_offset_type, int32_t> or decltype(std::ignore). |
tag_dict_type | seqan3::sam_tag_dictionary or decltype(std::ignore). |
e_value_type | Type of the seqan3::field::evalue input (see seqan3::sam_file_input_traits). |
bit_score_type | Type of the seqan3::field::bit_score input (see seqan3::sam_file_input_traits). |
[in,out] | stream | The input stream to read from. |
[in,out] | position_buffer | The buffer to store the current record's position. |
[in] | options | File specific options passed to the format. |
[out] | ref_seqs | The reference sequences to the corresponding alignments. |
[out] | header | A reference to the seqan3::sam_file_header object. |
[out] | seq | The buffer for seqan3::field::seq input. |
[out] | qual | The buffer for seqan3::field::qual input. |
[out] | id | The buffer for seqan3::field::id input. |
[out] | ref_seq | The buffer for seqan3::field::ref_seq input. |
[out] | ref_id | The buffer for seqan3::field::ref_id input. |
[out] | ref_offset | The buffer for seqan3::field::ref_offset input. |
[out] | cigar_vector | The buffer for seqan3::field::cigar input. |
[out] | flag | The buffer for seqan3::field::flag input. |
[out] | mapq | The buffer for seqan3::field::mapq input. |
[out] | mate | The buffer for seqan3::field::mate input. |
[out] | tag_dict | The buffer for seqan3::field::tags input. |
[out] | e_value | The buffer for seqan3::field::evalue input. |
[out] | bit_score | The buffer for seqan3::field::bit_score input. |
|
inline private |
Reads a list of byte pairs as it is the case for SAM tag byte arrays.
[in,out] | variant | A std::variant object to store the tag arrays. |
[in,out] | str | The string_view to parse. |
Reading the byte tags is done according to the official SAM format specifications.
The function throws a seqan3::format_error if there was an uneven number of bytes.
|
inline private |
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
[in,out] | tag_str | The string_view to parse for the sam_tag_dictionary entries. |
[in,out] | target | The seqan3::sam_tag_dictionary to store the tag information. |
seqan3::format_error | if any unexpected character or format is encountered. |
Reading the tags is done according to the official SAM format specifications.
The function throws a seqan3::format_error if any unknown tag type was encountered. It will also fail if the format is not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual error.
|
inline private |
Reads a list of values separated by comma as it is the case for SAM tag arrays.
value_type | The type of values to be stored in the tag array. |
[in,out] | variant | A std::variant object to store the tag arrays. |
[in,out] | str | The string_view to parse. |
[in] | value | A temporary value that determines the underlying type of the tag array. |
Reading the tags is done according to the official SAM format specifications.
The function throws a seqan3::format_error if any unknown tag type was encountered. It will also fail if the format is not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual error.
|
inline protected |
Read from the specified stream and back-insert into the given field buffers.
stream_type | Input stream, must satisfy seqan3::input_stream_over with char . |
stream_pos_type | Buffer for storing the current record's file position. |
seq_type | Type of the seqan3::field::seq input; must satisfy std::ranges::output_range over a seqan3::alphabet. |
id_type | Type of the seqan3::field::id input; must satisfy std::ranges::output_range over a seqan3::alphabet. |
qual_type | Type of the seqan3::field::qual input; must satisfy std::ranges::output_range over a seqan3::writable_quality_alphabet. |
[in,out] | stream | The input stream to read from. |
[in,out] | position_buffer | The buffer to store the current record's file position. |
[in] | options | File specific options passed to the format. |
[out] | sequence | The buffer for seqan3::field::seq input, i.e. the "sequence". |
[out] | id | The buffer for seqan3::field::id input, e.g. the header line in FASTA . |
[out] | qualities | The buffer for seqan3::field::qual input. |
|
inline protected |
Write the given fields to the specified stream.
stream_type | Output stream, must model seqan3::output_stream_over with char . |
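seq_type | Type of the seqan3::field::seq data. |
id_type | Type of the seqan3::field::id data. |
ref_seq_type | Type of the seqan3::field::ref_seq data. |
ref_id_type | Type of the seqan3::field::ref_id data. |
ref_offset_type | Type of the seqan3::field::ref_offset data. |
flag_type | Type of the seqan3::field::flag data. |
mapq_type | Type of the seqan3::field::mapq data. |
qual_type | Type of the seqan3::field::qual data. |
mate_type | Type of the seqan3::field::mate data. |
tag_dict_type | Type of the seqan3::field::tags data. |
e_value_type | Type of the seqan3::field::evalue data. |
bit_score_type | Type of the seqan3::field::bit_score data. |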
[in,out] | stream | The output stream to write into. |
[in] | options | File specific options passed to the format. |
[in] | header | A pointer to the header object of the file. |
[in] | seq | The data for seqan3::field::seq, i.e. the query sequence. |
[in] | qual | The data for seqan3::field::qual, e.g. the query quality sequence. |
[in] | id | The data for seqan3::field::id, e.g. the read id. |
[in] | ref_seq | The data for seqan3::field::ref_seq, i.e. the reference sequence. |
[in] | ref_id | The data for seqan3::field::ref_id, e.g. the reference id. |
[in] | ref_offset | The data for seqan3::field::ref_offset, i.e. the start position of the alignment in ref_seq . |
[in] | cigar_vector | The data for seqan3::field::cigar, e.g. representing the alignment between query and ref. |
[in] | flag | The data for seqan3::field::flag, e.g. the SAM mapping flag value. |
[in] | mapq | The data for seqan3::field::mapq, e.g. the mapping quality value. |
[in] | mate | The data for seqan3::field::mate, e.g. the mate information of paired reads. |
[in] | tag_dict | The data for seqan3::field::tags, e.g. the optional SAM field tag dictionary. |
[in] | e_value | The data for seqan3::field::evalue, e.g. the e-value of the alignment (BLAST). |
[in] | bit_score | The data for seqan3::field::bit_score, e.g. the bit score of the alignment (BLAST). |
|
inline private |
Writes a field value to the stream.
stream_it_t | The stream iterator type. |
[in,out] | stream_it | The stream iterator to print to. |
[in] | field_value | The value to print; a null-terminated CString. |
|
inline private |
Writes a field value to the stream.
stream_it_t | The stream iterator type. |
field_type | The type of the field value. Must model std::ranges::forward_range. |
[in,out] | stream_it | The stream iterator to print to. |
[in] | field_value | The value to print. |
|
inline protected |
Write the given fields to the specified stream.
stream_type | Output stream, must satisfy seqan3::output_stream_over with char . |
seq_type | Type of the seqan3::field::seq output; must satisfy std::ranges::output_range over a seqan3::alphabet. |
id_type | Type of the seqan3::field::id output; must satisfy std::ranges::output_range over a seqan3::alphabet. |
qual_type | Type of the seqan3::field::qual output; must satisfy std::ranges::output_range over a seqan3::quality_alphabet. |
[in,out] | stream | The output stream to write into. |
[in] | options | File specific options passed to the format. |
[in] | sequence | The data for seqan3::field::seq, i.e. the "sequence". |
[in] | id | The data for seqan3::field::id, e.g. the header line in FASTA. |
[in] | qualities | The data for seqan3::field::qual. |
|
inline private |
Writes the optional fields of the seqan3::sam_tag_dictionary.
stream_it_t | The stream iterator's type. |
[in,out] | stream_it | The stream iterator to print to. |
[in] | tag_dict | The tag dictionary to print. |
[in] | separator | The field separator to append. |
|
inline static |
The valid file extensions for this format; note that you can modify this value.