SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <seqan3/std/filesystem>
17 #include <fstream>
18 #include <string>
19 #include <variant>
20 #include <vector>
21 
31 #include <seqan3/io/detail/record.hpp>
32 #include <seqan3/io/exception.hpp>
42 
43 namespace seqan3
44 {
45 
46 // ----------------------------------------------------------------------------
47 // sequence_file_input_traits
48 // ----------------------------------------------------------------------------
49 
96 template <typename t>
97 SEQAN3_CONCEPT sequence_file_input_traits = requires (t v)
98 {
103 
106 
109 };
111 
112 // ----------------------------------------------------------------------------
113 // sequence_file_input_default_traits
114 // ----------------------------------------------------------------------------
115 
130 {
138 
141 
143  template <typename _sequence_alphabet>
145 
147  using id_alphabet = char;
148 
150  template <typename _id_alphabet>
152 
155 
157  template <typename _quality_alphabet>
159 
161 };
162 
166 {
174 
178 };
179 
180 // ----------------------------------------------------------------------------
181 // sequence_file_input
182 // ----------------------------------------------------------------------------
183 
300 template <
302  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::qual>,
303  detail::type_list_of_sequence_file_input_formats valid_formats_ = type_list<format_embl,
304  format_fasta,
305  format_fastq,
307  format_sam>>
309 {
310 public:
316  using traits_type = traits_type_;
318  using selected_field_ids = selected_field_ids_;
320  using valid_formats = valid_formats_;
322  using stream_char_type = char;
324 
328 #ifdef SEQAN3_DEPRECATED_310
330 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
332 #endif // SEQAN3_DEPRECATED_310
333 
// Compile-time validation of the selected fields: the immediately-invoked constexpr lambda walks
// selected_field_ids::as_array and yields false as soon as one field is not contained in field_ids,
// which makes the static_assert fire with the message below.
334  static_assert([] () constexpr
335  {
336  for (field f : selected_field_ids::as_array)
337  if (!field_ids::contains(f))
338  return false;
339  return true;
340  }(),
341  "You selected a field that is not valid for sequence files, please refer to the documentation "
342  "of sequence_file_input::field_ids for the accepted values.");
343 
344 #ifdef SEQAN3_DEPRECATED_310
345  static_assert([] () constexpr
346  {
350  }(),
351  "You may not select field::seq_qual and either of field::seq and field::qual at the same time.");
352 #endif // SEQAN3_DEPRECATED_310
353 
360  using sequence_type = typename traits_type::template sequence_container<
361  typename traits_type::sequence_alphabet>;
363  using id_type = typename traits_type::template id_container<
364  typename traits_type::id_alphabet>;
366  using quality_type = typename traits_type::template quality_container<
367  typename traits_type::quality_alphabet>;
368 #ifdef SEQAN3_DEPRECATED_310
372  using sequence_quality_type = typename traits_type::
373  template sequence_container<qualified<typename traits_type::sequence_alphabet,
374  typename traits_type::quality_alphabet>>;
375 
378 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
380  using field_types = type_list<sequence_type, id_type, quality_type>;
381 #endif // SEQAN3_DEPRECATED_310
382 
384  using record_type = sequence_record<detail::select_types_with_ids_t<field_types,
385  field_ids,
389 
399  using const_reference = void;
401  using size_type = size_t;
405  using iterator = detail::in_file_iterator<sequence_file_input>;
407  using const_iterator = void;
409  using sentinel = std::default_sentinel_t;
411 
426  ~sequence_file_input() = default;
427 
445  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
446  primary_stream{new std::ifstream{}, stream_deleter_default}
447  {
448  primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
449  static_cast<std::basic_ifstream<char> *>(primary_stream.get())->open(filename,
450  std::ios_base::in | std::ios::binary);
451 
452  if (!primary_stream->good())
453  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
454 
455  // possibly add intermediate compression stream
456  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
457 
458  // initialise format handler or throw if format is not found
459  using format_variant_t = typename detail::variant_from_tags<valid_formats,
460  detail::sequence_file_input_format_exposer>::type;
461  format_variant_t format_variant{};
462  detail::set_format(format_variant, filename);
463 
464  std::visit([&] (auto && selected_format)
465  {
466  using format_t = std::remove_cvref_t<decltype(selected_format)>;
467  format = std::make_unique<selected_sequence_format<format_t>>();
468  }, format_variant);
469  }
470  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
471  * A combination of default template parameters and auto-deduction guides works as expected,
472  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
473  * to auto-deduct and overwrite a single template parameter out of the four if the optional parameter
474  * is specified and use the default otherwise.
475  */
476 
// Construct from an existing stream (lvalue) and with specified format.
//
// stream     - the stream to read from; only its address is stored, paired with stream_deleter_noop,
//              so this object does not delete it.
// format_tag - selects file_format, which must be one of valid_formats (checked by the static_assert).
// fields_tag - carries the selected_field_ids type; defaults to selected_field_ids{}.
//
// The constraint requires the stream's char_type to be stream_char_type.
// NOTE(review): the tag parameter names are wrapped in SEQAN3_DOXYGEN_ONLY, so they are presumably
// unnamed outside documentation builds and used for type deduction only - confirm against the macro.
491  template <input_stream stream_t,
492  sequence_file_input_format file_format>
494  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
496  sequence_file_input(stream_t & stream,
497  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
498  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
// non-owning: the no-op deleter leaves the caller's stream alone
499  primary_stream{&stream, stream_deleter_noop},
// the format handler is fixed at compile time by file_format
500  format{std::make_unique<selected_sequence_format<file_format>>()}
501  {
502  static_assert(list_traits::contains<file_format, valid_formats>,
503  "You selected a format that is not in the valid_formats of this file.");
504 
505  // possibly add intermediate compression stream
506  secondary_stream = detail::make_secondary_istream(*primary_stream);
507  }
508 
// Construct from a stream passed as rvalue and with specified format.
//
// stream     - the stream to read from; it is moved into a heap-allocated stream_t that this object
//              owns and releases through stream_deleter_default.
// format_tag - selects file_format, which must be one of valid_formats (checked by the static_assert).
// fields_tag - carries the selected_field_ids type; defaults to selected_field_ids{}.
//
// The constraint requires the stream's char_type to be stream_char_type.
// NOTE(review): the tag parameter names are wrapped in SEQAN3_DOXYGEN_ONLY, so they are presumably
// unnamed outside documentation builds and used for type deduction only - confirm against the macro.
510  template <input_stream stream_t,
511  sequence_file_input_format file_format>
513  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
515  sequence_file_input(stream_t && stream,
516  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
517  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
// owning: the moved-from argument is replaced by a heap copy deleted via stream_deleter_default
518  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
// the format handler is fixed at compile time by file_format
519  format{std::make_unique<selected_sequence_format<file_format>>()}
520  {
521  static_assert(list_traits::contains<file_format, valid_formats>,
522  "You selected a format that is not in the valid_formats of this file.");
523 
524  // possibly add intermediate compression stream
525  secondary_stream = detail::make_secondary_istream(*primary_stream);
526  }
528 
548  {
549  // buffer first record
550  if (!first_record_was_read)
551  {
552  read_next_record();
553  first_record_was_read = true;
554  }
555 
556  return {*this};
557  }
558 
// Returns a sentinel for comparison with the iterator.
// The sentinel type is std::default_sentinel_t, so a value-initialised object is returned.
572  sentinel end() noexcept
573  {
574  return {};
575  }
576 
// Returns the record we are currently at in the file, obtained by dereferencing begin().
// NOTE(review): this function is declared noexcept, yet begin() calls read_next_record() when no
// record has been buffered so far - confirm that path cannot throw, since an exception escaping a
// noexcept function terminates the program.
600  reference front() noexcept
601  {
602  return *begin();
603  }
605 
607 #ifdef SEQAN3_DEPRECATED_310
609  = sequence_file_input_options<typename traits_type::sequence_legal_alphabet,
611 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
613 #endif // SEQAN3_DEPRECATED_310
615  sequence_file_input_options_type options;
616 
617 protected:
619 
623  record_type record_buffer;
625  std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
627 
// Stream deleter that does nothing; used for stream pointers this object does not own
// (e.g. the address of a stream taken by lvalue reference, and the nullptr defaults).
635  static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {}
// Stream deleter that calls delete on ptr; used for streams this object allocated with new.
637  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; }
638 
640  stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
642  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
643 
645  bool first_record_was_read{false};
647  bool at_end{false};
649 
650 private:
652  void read_next_record()
653  {
654  // clear the record
655  record_buffer.clear();
656 
657  // at end if we could not read further
658  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
660  {
661  at_end = true;
662  return;
663  }
664 
665  format->read_sequence_record(*secondary_stream, record_buffer, options);
666  }
667 
678  struct sequence_format_base
679  {
683  sequence_format_base() = default;
684  sequence_format_base(sequence_format_base const &) = default;
685  sequence_format_base(sequence_format_base &&) = default;
686  sequence_format_base & operator=(sequence_format_base const &) = default;
687  sequence_format_base & operator=(sequence_format_base &&) = default;
688  virtual ~sequence_format_base() = default;
690 
701  virtual void read_sequence_record(std::istream & instream,
702  record_type & record_buffer,
704  };
705 
717  template <typename format_t>
718  struct selected_sequence_format final : public sequence_format_base
719  {
723  selected_sequence_format() = default;
724  selected_sequence_format(selected_sequence_format const &) = default;
725  selected_sequence_format(selected_sequence_format &&) = default;
726  selected_sequence_format & operator=(selected_sequence_format const &) = default;
727  selected_sequence_format & operator=(selected_sequence_format &&) = default;
728  ~selected_sequence_format() = default;
730 
732  void read_sequence_record(std::istream & instream,
733  record_type & record_buffer,
734  sequence_file_input_options_type const & options) override
735  {
736  // read new record
737 #ifdef SEQAN3_DEPRECATED_310
739  {
740  _format.read_sequence_record(instream,
741  options,
742  detail::get_or_ignore<field::_seq_qual_deprecated>(record_buffer),
743  detail::get_or_ignore<field::id>(record_buffer),
744  detail::get_or_ignore<field::_seq_qual_deprecated>(record_buffer));
745  }
746  else
747 #endif // SEQAN3_DEPRECATED_310
748  {
749  _format.read_sequence_record(instream,
750  options,
751  detail::get_or_ignore<field::seq>(record_buffer),
752  detail::get_or_ignore<field::id>(record_buffer),
753  detail::get_or_ignore<field::qual>(record_buffer));
754  }
755  };
756 
758  detail::sequence_file_input_format_exposer<format_t> _format{};
759  };
760 
763 
765  friend iterator;
766 };
767 
774 template <input_stream stream_type,
775  sequence_file_input_format file_format>
776 sequence_file_input(stream_type & stream,
777  file_format const &)
779  typename sequence_file_input<>::selected_field_ids, // default field ids.
781 
783 template <input_stream stream_type,
784  sequence_file_input_format file_format>
785 sequence_file_input(stream_type && stream,
786  file_format const &)
788  typename sequence_file_input<>::selected_field_ids, // default field ids.
790 
792 template <input_stream stream_type,
793  sequence_file_input_format file_format,
794  detail::fields_specialisation selected_field_ids>
795 sequence_file_input(stream_type && stream,
796  file_format const &,
797  selected_field_ids const &)
801 
803 template <input_stream stream_type,
804  sequence_file_input_format file_format,
805  detail::fields_specialisation selected_field_ids>
806 sequence_file_input(stream_type & stream,
807  file_format const &,
808  selected_field_ids const &)
813 
814 } // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:51
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:51
The EMBL format.
Definition: format_embl.hpp:73
The FastA format.
Definition: format_fasta.hpp:81
The FastQ format.
Definition: format_fastq.hpp:79
The GenBank format.
Definition: format_genbank.hpp:73
The SAM format (tag).
Definition: format_sam.hpp:115
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition: phred42.hpp:47
Joins an arbitrary alphabet with a quality alphabet.
Definition: qualified.hpp:61
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:309
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream and the format.
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: input.hpp:364
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:399
std::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:409
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:444
type_list< sequence_type, id_type, quality_type, sequence_quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:377
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:600
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:547
sequence_file_input_options_type options
The options are public and its members can be set directly.
Definition: input.hpp:615
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:572
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:322
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:401
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:496
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:405
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:515
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:407
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition: input.hpp:367
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition: input.hpp:361
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:316
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:318
sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::_seq_qual_deprecated)> sequence_file_input_options_type
The input file options type.
Definition: input.hpp:610
fields< field::seq, field::id, field::qual, field::_seq_qual_deprecated > field_ids
The subset of seqan3::field IDs that are valid for this file; order corresponds to the types in field...
Definition: input.hpp:329
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:320
typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > > sequence_quality_type
[DEPRECATED] The type of field::seq_qual (std::vector <seqan3::dna5q> by default).
Definition: input.hpp:374
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
Deduces the sequence input file type from the stream, the format and the field ids.
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:387
T data(T... args)
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
Provides the seqan3::sequence_file_format_genbank class.
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
@ _seq_qual_deprecated
[DEPRECATED] Sequence and qualities combined in one range. Use field::seq and field::qual instead.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
constexpr bool contains
Whether a type occurs in a type list or not.
Definition: traits.hpp:231
auto const move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:74
Provides the seqan3::detail::in_file_iterator class template.
Resolves to std::ranges::explicitly_convertible_to<type1, type2>(). This entity i...
A more refined container concept than seqan3::container.
The generic concept for sequence file in formats.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides algorithms for meta programming, parameter packs and seqan3::type_list.
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides the seqan3::format_sam.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
T size(T... args)
A class template that holds a choice of seqan3::field.
Definition: record.hpp:172
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:279
A traits type that specifies input as amino acids.
Definition: input.hpp:166
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:130
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:147
Type that contains multiple types.
Definition: type_list.hpp:29
Provides traits for seqan3::type_list.
T visit(T... args)