SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <fstream>
17 #include <string>
18 #include <variant>
19 #include <vector>
29 #include <seqan3/io/exception.hpp>
30 #include <seqan3/std/filesystem>
31 #include <seqan3/io/record.hpp>
34 #include <seqan3/io/detail/record.hpp>
41 
42 namespace seqan3
43 {
44 
45 // ----------------------------------------------------------------------------
46 // sequence_file_input_traits
47 // ----------------------------------------------------------------------------
48 
97 template <typename t>
100 SEQAN3_CONCEPT sequence_file_input_traits = requires (t v)
101 {
106 
109 
112 };
114 
115 // ----------------------------------------------------------------------------
116 // sequence_file_input_default_traits
117 // ----------------------------------------------------------------------------
118 
133 {
139  using sequence_alphabet = dna5;
141 
144 
146  template <typename _sequence_alphabet>
148 
150  using id_alphabet = char;
151 
153  template <typename _id_alphabet>
155 
158 
160  template <typename _quality_alphabet>
162 
164 };
165 
169 {
175  using sequence_alphabet = aa27;
177 
181 };
182 
183 // ----------------------------------------------------------------------------
184 // sequence_file_input
185 // ----------------------------------------------------------------------------
186 
307 template <
309  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::qual>,
310  detail::type_list_of_sequence_file_input_formats valid_formats_ = type_list<format_embl,
311  format_fasta,
312  format_fastq,
314  format_sam>>
316 {
317 public:
322  using traits_type = traits_type_;
325  using selected_field_ids = selected_field_ids_;
327  using valid_formats = valid_formats_;
329  using stream_char_type = char;
331 
336 
337  static_assert([] () constexpr
338  {
339  for (field f : selected_field_ids::as_array)
340  if (!field_ids::contains(f))
341  return false;
342  return true;
343  }(),
344  "You selected a field that is not valid for sequence files, please refer to the documentation "
345  "of sequence_file_input::field_ids for the accepted values.");
346 
347  static_assert([] () constexpr
348  {
349  return !(selected_field_ids::contains(field::seq_qual) &&
351  (selected_field_ids::contains(field::qual))));
352  }(),
353  "You may not select field::seq_qual and either of field::seq and field::qual at the same time.");
354 
360  using sequence_type = typename traits_type::template sequence_container<
362  typename traits_type::sequence_alphabet>;
364  using id_type = typename traits_type::template id_container<
365  typename traits_type::id_alphabet>;
367  using quality_type = typename traits_type::template quality_container<
368  typename traits_type::quality_alphabet>;
370  using sequence_quality_type = typename traits_type::
371  template sequence_container<qualified<typename traits_type::sequence_alphabet,
372  typename traits_type::quality_alphabet>>;
373 
376 
381 
386  using value_type = record_type;
391  using const_reference = void;
393  using size_type = size_t;
397  using iterator = detail::in_file_iterator<sequence_file_input>;
399  using const_iterator = void;
401  using sentinel = std::default_sentinel_t;
403 
407  sequence_file_input() = delete;
418  ~sequence_file_input() = default;
419 
437  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
438  primary_stream{new std::ifstream{}, stream_deleter_default}
439  {
440  primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
441  static_cast<std::basic_ifstream<char> *>(primary_stream.get())->open(filename,
442  std::ios_base::in | std::ios::binary);
443 
444  if (!primary_stream->good())
445  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
446 
447  // possibly add intermediate compression stream
448  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
449 
450  // initialise format handler or throw if format is not found
451  detail::set_format(format, filename);
452  }
453  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
454  * A combination of default template parameters and auto-deduction guides works as expected,
455  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
456  * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
457  * is specified and use the default otherwise.
458  */
459 
// Construct from an existing stream (lvalue) and an explicitly given format tag.
//
// The stream is NOT owned: primary_stream stores its address together with
// stream_deleter_noop, so the caller's stream is left alive on destruction.
// The requires-clause restricts stream_t to streams whose char_type equals stream_char_type.
// format_tag and fields_tag are wrapped in SEQAN3_DOXYGEN_ONLY; only their types are used here
// (presumably the macro hides the names outside of documentation builds -- confirm).
474  template <input_stream stream_t,
475  sequence_file_input_format file_format>
477  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
479  sequence_file_input(stream_t & stream,
480  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
481  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
482  primary_stream{&stream, stream_deleter_noop},
483  format{detail::sequence_file_input_format_exposer<file_format>{}}
484  {
// Compile-time check: the requested format must be one of this file's valid_formats.
485  static_assert(list_traits::contains<file_format, valid_formats>,
486  "You selected a format that is not in the valid_formats of this file.");
487 
488  // possibly add intermediate compression stream
// No filename is available here, so only the stream itself is handed to the helper.
489  secondary_stream = detail::make_secondary_istream(*primary_stream);
490  }
491 
// Construct from a temporary (rvalue) stream and an explicitly given format tag.
//
// The stream IS owned: it is moved into a heap-allocated stream_t that primary_stream
// releases through stream_deleter_default (which calls delete).
// The requires-clause restricts stream_t to streams whose char_type equals stream_char_type.
493  template <input_stream stream_t,
494  sequence_file_input_format file_format>
496  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
498  sequence_file_input(stream_t && stream,
499  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
500  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
501  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
502  format{detail::sequence_file_input_format_exposer<file_format>{}}
503  {
// Compile-time check: the requested format must be one of this file's valid_formats.
504  static_assert(list_traits::contains<file_format, valid_formats>,
505  "You selected a format that is not in the valid_formats of this file.");
506 
507  // possibly add intermediate compression stream
508  secondary_stream = detail::make_secondary_istream(*primary_stream);
509  }
511 
531  {
532  // buffer first record
533  if (!first_record_was_read)
534  {
535  read_next_record();
536  first_record_was_read = true;
537  }
538 
539  return {*this};
540  }
541 
// Returns a value-initialised sentinel (std::default_sentinel_t) to compare iterators against.
// It carries no state and reads nothing from the file.
555  sentinel end() noexcept
556  {
557  return {};
558  }
559 
// Returns the record the file is currently positioned at, by dereferencing begin().
// Because it goes through begin(), the first call may trigger buffering of the first record.
// NOTE(review): this is declared noexcept, yet begin() can call read_next_record();
// confirm that path cannot throw, otherwise an exception here would terminate the program.
583  reference front() noexcept
584  {
585  return *begin();
586  }
588 
590  sequence_file_input_options<typename traits_type::sequence_legal_alphabet,
592 
593 protected:
595 
598  record_type record_buffer;
601  std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
603 
// Deleter that intentionally does nothing; used for streams this object does not own.
611  static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {}
// Deleter that frees the stream with delete; used for streams allocated with new by the constructors.
613  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; }
614 
616  stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
618  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
619 
621  bool first_record_was_read{false};
623  bool at_end{false};
624 
626  using format_type = typename detail::variant_from_tags<valid_formats,
627  detail::sequence_file_input_format_exposer>::type;
629  format_type format;
631 
// Reads the next record from secondary_stream into record_buffer.
// Sets at_end and returns early when the end-of-input check below succeeds.
// Otherwise the active format handler is visited and asked to parse one record.
633  void read_next_record()
634  {
635  // clear the record
636  record_buffer.clear();
637 
638  // at end if we could not read further
// NOTE(review): the right-hand side of this comparison (listing line 640) is elided in this view;
// presumably it is a default-constructed end-of-stream iterator -- confirm against the full file.
639  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
641  {
642  at_end = true;
643  return;
644  }
645 
// std::visit requires the variant to hold a value.
646  assert(!format.valueless_by_exception());
647  std::visit([&] (auto & f)
648  {
649  // read new record
// With field::seq_qual selected, the same seq_qual field is passed in both the sequence
// and the quality argument positions.
650  if constexpr (selected_field_ids::contains(field::seq_qual))
651  {
652  f.read_sequence_record(*secondary_stream,
653  options,
654  detail::get_or_ignore<field::seq_qual>(record_buffer),
655  detail::get_or_ignore<field::id>(record_buffer),
656  detail::get_or_ignore<field::seq_qual>(record_buffer));
657  }
658  else
659  {
// Separate seq / id / qual fields are passed; get_or_ignore is applied to each
// (presumably yielding a placeholder for fields that were not selected -- confirm).
660  f.read_sequence_record(*secondary_stream,
661  options,
662  detail::get_or_ignore<field::seq>(record_buffer),
663  detail::get_or_ignore<field::id>(record_buffer),
664  detail::get_or_ignore<field::qual>(record_buffer));
665  }
666  }, format);
667  }
668 
670  friend iterator;
671 };
672 
678 template <input_stream stream_type,
680  sequence_file_input_format file_format>
681 sequence_file_input(stream_type & stream,
682  file_format const &)
684  typename sequence_file_input<>::selected_field_ids, // default field ids.
686 
688 template <input_stream stream_type,
689  sequence_file_input_format file_format>
690 sequence_file_input(stream_type && stream,
691  file_format const &)
693  typename sequence_file_input<>::selected_field_ids, // default field ids.
695 
697 template <input_stream stream_type,
698  sequence_file_input_format file_format,
699  detail::fields_specialisation selected_field_ids>
700 sequence_file_input(stream_type && stream,
701  file_format const &,
702  selected_field_ids const &)
706 
708 template <input_stream stream_type,
709  sequence_file_input_format file_format,
710  detail::fields_specialisation selected_field_ids>
711 sequence_file_input(stream_type & stream,
712  file_format const &,
713  selected_field_ids const &)
718 
719 } // namespace seqan3
seqan3::sequence_file_input::end
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:555
seqan3::sequence_file_input::id_type
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: input.hpp:365
seqan3::sequence_file_input::operator=
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
qualified.hpp
Provides quality alphabet composites.
input_format_concept.hpp
Provides seqan3::sequence_file_input_format and auxiliary classes.
fstream
std::basic_string
pack_algorithm.hpp
Provides algorithms for meta programming, parameter packs and seqan3::type_list.
sequence_container
A more refined container concept than seqan3::container.
seqan3::type_list
meta::list< types... > type_list
Type that contains multiple types, an alias for meta::list.
Definition: type_list.hpp:31
dna15.hpp
Provides seqan3::dna15, container aliases and string literals.
concept.hpp
Stream concepts.
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
seqan3::sequence_file_input::const_reference
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:391
vector
seqan3::sequence_file_input::options
sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::seq_qual)> options
The options are public and their members can be set directly.
Definition: input.hpp:591
std::vector::size
T size(T... args)
explicitly_convertible_to
Resolves to std::ranges::explicitly_convertible_to<type1, type2>().
seqan3::format_genbank
The GenBank format.
Definition: format_genbank.hpp:73
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream, the format and the field ids.
std::unique_ptr::get
T get(T... args)
format_sam.hpp
Provides the seqan3::format_sam.
seqan3::dna15
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:49
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
seqan3::sequence_file_input::sequence_file_input
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
seqan3::sequence_file_input::sentinel
std::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:401
record.hpp
Provides the seqan3::record template and the seqan3::field enum.
std::function
seqan3::sequence_file_input::record_type
record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:379
filesystem
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
seqan3::format_fasta
The FastA format.
Definition: format_fasta.hpp:80
seqan3::sequence_file_input::field_types
type_list< sequence_type, id_type, quality_type, sequence_quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:375
seqan3::sequence_file_input::quality_type
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition: input.hpp:368
std::filesystem::path
seqan3::sequence_file_input::traits_type
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:323
seqan3::pack_traits::contains
constexpr bool contains
Whether a type occurs in a pack or not.
Definition: traits.hpp:193
seqan3::fields
A class template that holds a choice of seqan3::field.
Definition: record.hpp:166
seqan3::sequence_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:26
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
seqan3::sequence_file_input::valid_formats
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:327
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:436
format_fastq.hpp
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
aa27.hpp
Provides seqan3::aa27, container aliases and string literals.
seqan3::views::move
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
seqan3::sequence_file_input::begin
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:530
seqan3::sequence_file_input::operator=
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
seqan3::sequence_file_input::sequence_type
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition: input.hpp:362
exception.hpp
Provides exceptions used in the I/O module.
dna5.hpp
Provides seqan3::dna5, container aliases and string literals.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
seqan3::sequence_file_input::size_type
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:393
seqan3::sequence_file_input::selected_field_ids
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:325
std::istreambuf_iterator
std::make_signed_t
seqan3::phred42
Quality type for traditional Sanger and modern Illumina Phred scores (typical range).
Definition: phred42.hpp:44
format_fasta.hpp
std::format
T format(T... args)
seqan3::sequence_file_input::front
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:583
misc_input.hpp
Provides various utility functions required only for input.
seqan3::sequence_file_input::iterator
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:397
char.hpp
Provides alphabet adaptations for standard char types.
seqan3::sequence_file_input_default_traits_aa
A traits type that specifies input as amino acids.
Definition: input.hpp:169
seqan3::sequence_file_input::const_iterator
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:399
seqan3::sequence_file_input::stream_char_type
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:329
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
seqan3::sequence_file_input
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:316
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:498
seqan3::format_embl
The EMBL format.
Definition: format_embl.hpp:71
seqan3::format_fastq
The FastQ format.
Definition: format_fastq.hpp:78
cassert
seqan3::field
field
An enumerator for the fields used in file formats.
Definition: record.hpp:65
seqan3::sequence_file_input_default_traits_dna::id_alphabet
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:150
format_embl.hpp
seqan3::record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids >
sequence_file_input_format
The generic concept for sequence file input formats.
seqan3::qualified
Joins an arbitrary alphabet with a quality alphabet.
Definition: qualified.hpp:59
seqan3::format_sam
The SAM format (tag).
Definition: format_sam.hpp:126
std::visit
T visit(T... args)
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:479
seqan3::dna5
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:49
in_file_iterator.hpp
Provides the seqan3::detail::in_file_iterator class template.
sequence_file_input_traits
The requirements a traits_type for seqan3::sequence_file_input must meet.
seqan3::sequence_file_input::~sequence_file_input
~sequence_file_input()=default
Destructor is defaulted.
traits.hpp
Provides traits for seqan3::type_list.
std::basic_istream
writable_alphabet
Refines seqan3::alphabet and adds assignability.
std::unique_ptr
seqan3::sequence_file_input::sequence_file_input
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream and the format.
writable_quality_alphabet
A concept that indicates whether a writable alphabet represents quality scores.
seqan3::aa27
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:44
phred42.hpp
Provides seqan3::phred42 quality scores.
seqan3::sequence_file_input_default_traits_dna
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:133
std::vector::data
T data(T... args)
seqan3::sequence_file_input::sequence_quality_type
typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > > sequence_quality_type
The type of field::seq_qual (std::vector <seqan3::dna5q> by default).
Definition: input.hpp:372
format_genbank.hpp
Provides the seqan3::sequence_file_format_genbank class.
variant
std::ifstream
string