SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
seqan3::format_bam Class Reference

The BAM format. More...

#include <seqan3/io/sam_file/format_bam.hpp>

+ Inheritance diagram for seqan3::format_bam:

Classes

struct  alignment_record_core
 Stores all fixed length variables which can be read/written directly by reinterpreting the binary stream. More...
 

Public Member Functions

Constructors, destructor and assignment
 format_bam ()=default
 Defaulted.
 
 format_bam (format_bam const &)=default
 Defaulted.
 
format_bam & operator= (format_bam const &)=default
 Defaulted.
 
 format_bam (format_bam &&)=default
 Defaulted.
 
format_bam & operator= (format_bam &&)=default
 Defaulted.
 
 ~format_bam ()=default
 Defaulted.
 

Static Public Attributes

static std::vector< std::string > file_extensions {{"bam"}}
 The valid file extensions for this format; note that you can modify this value.
 

Protected Member Functions

template<typename stream_type , typename seq_legal_alph_type , typename ref_seqs_type , typename ref_ids_type , typename stream_pos_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename ref_offset_type , typename cigar_type , typename flag_type , typename mapq_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type >
void read_alignment_record (stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
 Read from the specified stream and back-insert into the given field buffers.
 
template<typename stream_type , typename header_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename cigar_type , typename qual_type , typename mate_type , typename tag_dict_type >
void write_alignment_record (stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
 Write the given fields to the specified stream.
 

Private Member Functions

std::vector< cigar > parse_binary_cigar (std::string_view const cigar_str) const
 Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
 
template<typename stream_view_type >
void read_float_byte_field (stream_view_type &&stream_view, float &target)
 Reads a float field from binary stream by directly reinterpreting the bits.
 
template<std::integral number_type>
void read_integral_byte_field (std::string_view const str, number_type &target)
 This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
 
template<typename stream_view_type , std::integral number_type>
void read_integral_byte_field (stream_view_type &&stream_view, number_type &target)
 Reads an arithmetic field from binary stream by directly reinterpreting the bits.
 
void read_sam_dict (std::string_view const tag_str, sam_tag_dictionary &target)
 Reads the optional tag fields into the seqan3::sam_tag_dictionary.
 
template<typename value_type >
int32_t read_sam_dict_vector (seqan3::detail::sam_tag_variant &variant, std::string_view const str, value_type const &value)
 Reads a list of values separated by comma as it is the case for SAM tag arrays.
 
template<typename stream_t , typename header_type >
void write_header (stream_t &stream, sam_file_output_options const &options, header_type &header)
 Writes the SAM header.
 
- Private Member Functions inherited from seqan3::detail::format_sam_base
template<typename ref_id_type , typename ref_id_tmp_type , typename header_type , typename ref_seqs_type >
void check_and_assign_ref_id (ref_id_type &ref_id, ref_id_tmp_type &ref_id_tmp, header_type &header, ref_seqs_type &)
 Checks for known reference ids or adds a new reference id and assigns a reference id to ref_id.
 
template<arithmetic arithmetic_target_type>
void read_arithmetic_field (std::string_view const &str, arithmetic_target_type &arithmetic_target)
 Reads arithmetic fields using std::from_chars.
 
template<std::ranges::forward_range target_range_type>
void read_forward_range_field (std::string_view const str, target_range_type &target)
 Reads from str to target, converting values with seqan3::views::char_to.
 
template<typename stream_view_type , std::ranges::forward_range target_range_type>
void read_forward_range_field (stream_view_type &&stream_view, target_range_type &target)
 Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
 
template<typename stream_view_type , typename ref_ids_type , typename ref_seqs_type , typename seq_legal_alph_type >
void read_header (stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &, sam_file_input_options< seq_legal_alph_type > const &options)
 Reads the SAM header.
 
int32_t soft_clipping_at_front (std::vector< cigar > const &cigar_vector) const
 Returns the soft clipping value at the front of the cigar_vector or 0 if none present.
 
template<typename stream_t , typename header_type >
void write_header (stream_t &stream, sam_file_output_options const &options, header_type &header)
 Writes the SAM header.
 
 format_sam_base ()=default
 Defaulted.
 
 format_sam_base (format_sam_base const &)=default
 Defaulted.
 
format_sam_base & operator= (format_sam_base const &)=default
 Defaulted.
 
 format_sam_base (format_sam_base &&)=default
 Defaulted.
 
format_sam_base & operator= (format_sam_base &&)=default
 Defaulted.
 
 ~format_sam_base ()=default
 Defaulted.
 

Static Private Member Functions

static std::string get_tag_dict_str (sam_tag_dictionary const &tag_dict)
 Writes the optional fields of the seqan3::sam_tag_dictionary.
 
static uint16_t reg2bin (int32_t beg, int32_t end) noexcept
 Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
 

Private Attributes

bool header_was_read {false}
 A variable that tracks whether the content of header has been read or not.
 
std::string string_buffer {}
 Local buffer to read into while avoiding reallocation.
 
- Private Attributes inherited from seqan3::detail::format_sam_base
std::array< char, 316 > arithmetic_buffer {}
 A buffer used when parsing arithmetic values with std::from_chars.
 
bool header_was_written {false}
 A variable that tracks whether the content of header has been written or not.
 
bool ref_info_present_in_header {false}
 Tracks whether reference information (@SQ tag) was found in the SAM header.
 

Static Private Attributes

static constexpr std::array< uint8_t, 256 > char_to_sam_rank
 Converts a cigar op character to the rank according to the official BAM specifications.
 
- Static Private Attributes inherited from seqan3::detail::format_sam_base
static constexpr std::array format_version {'1', '.', '6'}
 The format version string.
 

Detailed Description

The BAM format.

The BAM format is the binary version of the SAM format:

Introduction

SAM is often used for storing alignments of several read sequences against one or more reference sequences. See the article on wikipedia for an introduction of the format or look into the official SAM format specifications. SeqAn implements version 1.6 of the SAM specification.

Take a look at our tutorial SAM Input and Output in SeqAn for a walk through of how to read SAM/BAM files.

fields_specialisation

The SAM format provides the following fields: seqan3::field::seq, seqan3::field::qual, seqan3::field::id, seqan3::field::ref_seq, seqan3::field::ref_id, seqan3::field::ref_offset, seqan3::field::offset, seqan3::field::flag, seqan3::field::mapq and seqan3::field::mate. In addition there is the seqan3::field::header_ptr, which is usually only used internally to provide the range-based functionality of the file.

None of the fields are required when writing. If they are not given, a default value of '0' for numeric fields and '*' for other fields is used.

SAM format columns -> fields

Since many users will be accustomed to the columns of the SAM format, here is a mapping of the common SAM format columns to the SeqAn record fields:

# SAM Column ID FIELD name
1 QNAME seqan3::field::id
2 FLAG seqan3::field::flag
3 RNAME seqan3::field::ref_id
4 POS seqan3::field::ref_offset
5 MAPQ seqan3::field::mapq
6 CIGAR seqan3::field::cigar
7 RNEXT seqan3::field::mate (tuple pos 0)
8 PNEXT seqan3::field::mate (tuple pos 1)
9 TLEN seqan3::field::mate (tuple pos 2)
10 SEQ seqan3::field::seq
11 QUAL seqan3::field::qual

Format Check

The format checks are implemented according to the official SAM format specifications in order to ensure correct SAM file output.

If a non-recoverable format violation is encountered on reading, or you specify invalid values/combinations when writing, seqan3::format_error is thrown.

Header implementation

The SAM header (if present) is read/written once in the beginning before the first record is read/written.

Remarks
For a complete overview, take a look at SAM File
For a complete overview, take a look at SAM File

Member Function Documentation

◆ get_tag_dict_str()

std::string seqan3::format_bam::get_tag_dict_str ( sam_tag_dictionary const &  tag_dict)
inline static private

Writes the optional fields of the seqan3::sam_tag_dictionary.

Parameters
[in] tag_dict: The tag dictionary to print.

◆ parse_binary_cigar()

std::vector< cigar > seqan3::format_bam::parse_binary_cigar ( std::string_view const  cigar_str) const
inline private

Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).

Parameters
[in] cigar_str: A std::string_view that points to the information of the CIGAR string in the BAM file.
Returns
A std::vector over seqan3::cigar, that describes the alignment.

◆ read_alignment_record()

template<typename stream_type , typename seq_legal_alph_type , typename ref_seqs_type , typename ref_ids_type , typename stream_pos_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename ref_offset_type , typename cigar_type , typename flag_type , typename mapq_type , typename qual_type , typename mate_type , typename tag_dict_type , typename e_value_type , typename bit_score_type >
void seqan3::format_bam::read_alignment_record ( stream_type &  stream,
sam_file_input_options< seq_legal_alph_type > const &  options,
ref_seqs_type &  ref_seqs,
sam_file_header< ref_ids_type > &  header,
stream_pos_type &  position_buffer,
seq_type &  seq,
qual_type &  qual,
id_type &  id,
ref_seq_type &  ref_seq,
ref_id_type &  ref_id,
ref_offset_type &  ref_offset,
cigar_type &  cigar_vector,
flag_type &  flag,
mapq_type &  mapq,
mate_type &  mate,
tag_dict_type &  tag_dict,
e_value_type &  e_value,
bit_score_type &  bit_score 
)
inline protected

Read from the specified stream and back-insert into the given field buffers.

Template Parameters
stream_type: The input stream type; must be derived from std::istream.
stream_pos_type: Type of the position buffer, aka, the std::streampos of the current record.
ref_seqs_type: e.g. std::deque<ref_sequence_type> or decltype(std::ignore).
seq_type: Type of the seqan3::field::seq input (see seqan3::sam_file_input_traits).
qual_type: Type of the seqan3::field::qual input (see seqan3::sam_file_input_traits).
id_type: Type of the seqan3::field::id input (see seqan3::sam_file_input_traits).
ref_seq_type: Type of the seqan3::field::ref_seq input (see seqan3::sam_file_input_traits).
ref_id_type: Type of the seqan3::field::ref_id input (see seqan3::sam_file_input_traits).
ref_offset_type: Type of the seqan3::field::ref_offset input (see seqan3::sam_file_input_traits).
cigar_type: Type of the seqan3::field::cigar input (a std::vector<cigar> or std::ignore).
flag_type: Type of the seqan3::field::flag input (see seqan3::sam_file_input_traits).
mapq_type: Type of the seqan3::field::mapq input (see seqan3::sam_file_input_traits).
mate_type: std::tuple<ref_id_type, ref_offset_type, int32_t> or decltype(std::ignore).
tag_dict_type: seqan3::sam_tag_dictionary or decltype(std::ignore).
e_value_type: Type of the seqan3::field::evalue input (see seqan3::sam_file_input_traits).
bit_score_type: Type of the seqan3::field::bit_score input (see seqan3::sam_file_input_traits).
Parameters
[in,out] stream: The input stream to read from.
[in,out] position_buffer: The buffer to store the current record's position.
[in] options: File specific options passed to the format.
[out] ref_seqs: The reference sequences to the corresponding alignments.
[out] header: A pointer to the seqan3::sam_file_header object.
[out] seq: The buffer for seqan3::field::seq input.
[out] qual: The buffer for seqan3::field::qual input.
[out] id: The buffer for seqan3::field::id input.
[out] ref_seq: The buffer for seqan3::field::ref_seq input.
[out] ref_id: The buffer for seqan3::field::ref_id input.
[out] ref_offset: The buffer for seqan3::field::ref_offset input.
[out] cigar_vector: The buffer for seqan3::field::cigar input.
[out] flag: The buffer for seqan3::field::flag input.
[out] mapq: The buffer for seqan3::field::mapq input.
[out] mate: The buffer for seqan3::field::mate input.
[out] tag_dict: The buffer for seqan3::field::tags input.
[out] e_value: The buffer for seqan3::field::evalue input.
[out] bit_score: The buffer for seqan3::field::bit_score input.

Additional requirements

  • The function must also accept std::ignore as parameter for any of the fields, except stream, options and header. [This is enforced by the concept checker!]
  • In this case the data read for that field shall be discarded by the format.

◆ read_float_byte_field()

template<typename stream_view_type >
void seqan3::format_bam::read_float_byte_field ( stream_view_type &&  stream_view,
float &  target 
)
inline private

Reads a float field from binary stream by directly reinterpreting the bits.

Template Parameters
stream_view_type: The type of the stream as a view.
Parameters
[in,out] stream_view: The stream view to read from.
[out] target: A float value to store the parsed value in.

◆ read_integral_byte_field()

template<typename stream_view_type , std::integral number_type>
void seqan3::format_bam::read_integral_byte_field ( stream_view_type &&  stream_view,
number_type &  target 
)
inline private

Reads an arithmetic field from binary stream by directly reinterpreting the bits.

Template Parameters
stream_view_type: The type of the stream as a view.
number_type: The type of number to parse; must model std::integral.
Parameters
[in,out] stream_view: The stream view to read from.
[out] target: An integral value to store the parsed value in.

◆ read_sam_dict()

void seqan3::format_bam::read_sam_dict ( std::string_view const  tag_str,
sam_tag_dictionary &  target 
)
inline private

Reads the optional tag fields into the seqan3::sam_tag_dictionary.

Parameters
[in,out] tag_str: The string_view to parse.
[out] target: The seqan3::sam_tag_dictionary to store the tag information.
Exceptions
seqan3::format_error: if any unexpected character or format is encountered.

Reading the tags is done according to the official SAM format specifications.

The function throws a seqan3::format_error if any unknown tag type was encountered. It will also fail if the format is not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual error.

◆ read_sam_dict_vector()

template<typename value_type >
int32_t seqan3::format_bam::read_sam_dict_vector ( seqan3::detail::sam_tag_variant &  variant,
std::string_view const  str,
value_type const &  value 
)
inline private

Reads a list of values separated by comma as it is the case for SAM tag arrays.

Template Parameters
value_type: The type of values to be stored in the tag array.
Parameters
[in,out] variant: A std::variant object to store the tag arrays.
[in,out] str: The string_view to parse.
[in] value: A temporary value that determines the underlying type of the tag array.
Returns
The length of the vector processed.

Reading the tags is done according to the official SAM format specifications.

The function throws a seqan3::format_error if any unknown tag type was encountered. It will also fail if the format is not in a correct state (e.g. required fields are not given), but throwing might occur downstream of the actual error.

◆ write_alignment_record()

template<typename stream_type , typename header_type , typename seq_type , typename id_type , typename ref_seq_type , typename ref_id_type , typename cigar_type , typename qual_type , typename mate_type , typename tag_dict_type >
void seqan3::format_bam::write_alignment_record ( stream_type &  stream,
sam_file_output_options const &  options,
header_type &&  header,
seq_type &&  seq,
qual_type &&  qual,
id_type &&  id,
ref_seq_type &&  ref_seq,
ref_id_type &&  ref_id,
std::optional< int32_t >  ref_offset,
cigar_type &&  cigar_vector,
sam_flag const  flag,
uint8_t const  mapq,
mate_type &&  mate,
tag_dict_type &&  tag_dict,
double  e_value,
double  bit_score 
)
inline protected

Write the given fields to the specified stream.

Template Parameters
stream_type: Output stream, must model seqan3::output_stream_over with char.
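seq_type: Type of the seqan3::field::seq data.
id_type: Type of the seqan3::field::id data.
ref_seq_type: Type of the seqan3::field::ref_seq data.
ref_id_type: Type of the seqan3::field::ref_id data.
ref_offset_type: Type of the seqan3::field::ref_offset data.
flag_type: Type of the seqan3::field::flag data.
mapq_type: Type of the seqan3::field::mapq data.
qual_type: Type of the seqan3::field::qual data.
mate_type: Type of the seqan3::field::mate data.
tag_dict_type: Type of the seqan3::field::tags data.
e_value_type: Type of the seqan3::field::e_value data.
bit_score_type: Type of the seqan3::field::bit_score data.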
Parameters
[in,out] stream: The output stream to write into.
[in] options: File specific options passed to the format.
[in] header: A pointer to the header object of the file.
[in] seq: The data for seqan3::field::seq, i.e. the query sequence.
[in] qual: The data for seqan3::field::qual, e.g. the query quality sequence.
[in] id: The data for seqan3::field::id, e.g. the read id.
[in] ref_seq: The data for seqan3::field::ref_seq, i.e. the reference sequence.
[in] ref_id: The data for seqan3::field::ref_id, e.g. the reference id.
[in] ref_offset: The data for seqan3::field::ref_offset, i.e. the start position of the alignment in ref_seq.
[in] cigar_vector: The data for seqan3::field::cigar, e.g. representing the alignment between query and ref.
[in] flag: The data for seqan3::field::flag, e.g. the SAM mapping flag value.
[in] mapq: The data for seqan3::field::mapq, e.g. the mapping quality value.
[in] mate: The data for seqan3::field::mate, e.g. the mate information of paired reads.
[in] tag_dict: The data for seqan3::field::tags, e.g. the optional SAM field tag dictionary.
[in] e_value: The data for seqan3::field::e_value, e.g. the e-value of the alignment (BLAST).
[in] bit_score: The data for seqan3::field::bit_score, e.g. the bit score of the alignment (BLAST).

◆ write_header()

template<typename stream_t , typename header_type >
void seqan3::format_bam::write_header ( stream_t &  stream,
sam_file_output_options const &  options,
header_type &  header 
)
inline private

Writes the SAM header.

Template Parameters
stream_t: The stream type.
Parameters
[in,out] stream: The stream to print to.
[in] options: The options to alter printing.
[in] header: The header to print.
Exceptions
seqan3::format_error: if the header object contains the wrong information or the contents are ill-formed.

Before writing the header, the contents are checked for correctness according to the rules of the official SAM format specifications.

Member Data Documentation

◆ char_to_sam_rank

constexpr std::array<uint8_t, 256> seqan3::format_bam::char_to_sam_rank
static constexpr private
Initial value:
{
[]() constexpr {
using index_t = std::make_unsigned_t<char>;
ret[static_cast<index_t>('I')] = 1;
ret[static_cast<index_t>('D')] = 2;
ret[static_cast<index_t>('N')] = 3;
ret[static_cast<index_t>('S')] = 4;
ret[static_cast<index_t>('H')] = 5;
ret[static_cast<index_t>('P')] = 6;
ret[static_cast<index_t>('=')] = 7;
ret[static_cast<index_t>('X')] = 8;
return ret;
}()
}

Converts a cigar op character to the rank according to the official BAM specifications.


The documentation for this class was generated from the following file:
Hide me