SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <fstream>
17 #include <string>
18 #include <variant>
19 #include <vector>
20 
21 #include <range/v3/algorithm/equal.hpp>
22 
30 #include <seqan3/io/exception.hpp>
31 #include <seqan3/std/filesystem>
32 #include <seqan3/io/record.hpp>
35 #include <seqan3/io/detail/record.hpp>
43 
44 namespace seqan3
45 {
46 
47 // ----------------------------------------------------------------------------
48 // SequenceFileInputTraits
49 // ----------------------------------------------------------------------------
50 
// Requirements that a traits type `t` for seqan3::sequence_file_input must meet.
// For each of sequence, id and quality it demands an alphabet alias plus a container
// template and a container-of-containers template that satisfy SequenceContainer.
111 template <typename t>
114 SEQAN3_CONCEPT SequenceFileInputTraits = requires (t v)
115 {
     // Sequence: two writable alphabets; the "legal" one must be explicitly convertible to the stored one.
116  requires WritableAlphabet<typename t::sequence_alphabet>;
117  requires WritableAlphabet<typename t::sequence_legal_alphabet>;
118  requires ExplicitlyConvertibleTo<typename t::sequence_legal_alphabet, typename t::sequence_alphabet>;
119  requires SequenceContainer<typename t::template sequence_container<typename t::sequence_alphabet>>;
120  requires SequenceContainer<typename t::template sequence_container_container<
121  typename t::template sequence_container<typename t::sequence_alphabet>>>;
122 
     // Identifier: writable alphabet, container and container-of-containers.
123  requires WritableAlphabet<typename t::id_alphabet>;
124  requires SequenceContainer<typename t::template id_container<typename t::id_alphabet>>;
125  requires SequenceContainer<typename t::template id_container_container<typename t::template id_container<
126  typename t::id_alphabet>>>;
127 
     // Quality: writable quality alphabet, container and container-of-containers.
128  requires WritableQualityAlphabet<typename t::quality_alphabet>;
129  requires SequenceContainer<typename t::template quality_container<typename t::quality_alphabet>>;
130  requires SequenceContainer<typename t::template quality_container_container<
131  typename t::template quality_container<typename t::quality_alphabet>>>;
132 };
134 
135 // ----------------------------------------------------------------------------
136 // sequence_file_input_default_traits
137 // ----------------------------------------------------------------------------
138 
153 {
159  using sequence_alphabet = dna5;
161 
164 
166  template <typename _sequence_alphabet>
168 
170  template <typename _sequence_container>
172 
174  using id_alphabet = char;
175 
177  template <typename _id_alphabet>
179 
181  template <typename _id_container>
183 
186 
188  template <typename _quality_alphabet>
190 
192  template <typename _quality_container>
195 };
196 
200 {
206  using sequence_alphabet = aa27;
208 
212 };
213 
214 // ----------------------------------------------------------------------------
215 // sequence_file_input
216 // ----------------------------------------------------------------------------
217 
353 template <
355  detail::Fields selected_field_ids_ = fields<field::SEQ,
356  field::ID,
357  field::QUAL>,
358  detail::TypeListOfSequenceFileInputFormats valid_formats_ = type_list<format_embl,
359  format_fasta,
360  format_fastq,
362  format_sam>,
363  Char stream_char_type_ = char>
365 {
366 public:
371  using traits_type = traits_type_;
374  using selected_field_ids = selected_field_ids_;
376  using valid_formats = valid_formats_;
378  using stream_char_type = stream_char_type_;
380 
385 
// Compile-time check: every field in selected_field_ids must also be contained in field_ids.
// The immediately-invoked constexpr lambda returns false on the first field that is not contained.
386  static_assert([] () constexpr
387  {
388  for (field f : selected_field_ids::as_array)
389  if (!field_ids::contains(f))
390  return false;
391  return true;
392  }(),
393  "You selected a field that is not valid for sequence files, please refer to the documentation "
394  "of sequence_file_input::field_ids for the accepted values.");
395 
// Compile-time check: field::SEQ_QUAL may not be selected together with field::SEQ or field::QUAL.
396  static_assert([] () constexpr
397  {
398  return !(selected_field_ids::contains(field::SEQ_QUAL) &&
399  (selected_field_ids::contains(field::SEQ) ||
400  (selected_field_ids::contains(field::QUAL))));
401  }(),
402  "You may not select field::SEQ_QUAL and either of field::SEQ and field::QUAL at the same time.");
403 
// Types of the individual record fields, each built by instantiating a container template
// of the traits type with the matching alphabet of the traits type.
     // Type of field::SEQ.
409  using sequence_type = typename traits_type::template sequence_container<
411  typename traits_type::sequence_alphabet>;
     // Type of field::ID.
413  using id_type = typename traits_type::template id_container<
414  typename traits_type::id_alphabet>;
     // Type of field::QUAL.
416  using quality_type = typename traits_type::template quality_container<
417  typename traits_type::quality_alphabet>;
     // Type of field::SEQ_QUAL: the sequence container over qualified<sequence_alphabet, quality_alphabet>.
419  using sequence_quality_type = typename traits_type::
420  template sequence_container<qualified<typename traits_type::sequence_alphabet,
421  typename traits_type::quality_alphabet>>;
422 
425 
430 
// Column types: the container-of-containers templates of the traits type applied to the field types.
// Note that the SEQ_QUAL column reuses sequence_container_container.
436  using sequence_column_type = typename traits_type::template sequence_container_container<sequence_type>;
439  using id_column_type = typename traits_type::template id_container_container<id_type>;
441  using quality_column_type = typename traits_type::template quality_container_container<quality_type>;
443  using sequence_quality_column_type = typename traits_type::template sequence_container_container<sequence_quality_type>;
450  using file_as_tuple_type = record<detail::select_types_with_ids_t<field_column_types,
451  field_ids,
455 
// Range associated types. The const variants are void because the file is not const-iterable.
460  using value_type = record_type;
465  using const_reference = void;
467  using size_type = size_t;
471  using iterator = detail::in_file_iterator<sequence_file_input>;
473  using const_iterator = void;
477 
// No default construction: a file object must be created from a file name or a stream.
481  sequence_file_input() = delete;
// No copy construction.
484  sequence_file_input(sequence_file_input const &) = delete;
// Destructor is defaulted.
492  ~sequence_file_input() = default;
493 
511  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
512  primary_stream{new std::ifstream{filename, std::ios_base::in | std::ios::binary}, stream_deleter_default}
513  {
514  if (!primary_stream->good())
515  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
516 
517  // possibly add intermediate compression stream
518  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
519 
520  // initialise format handler or throw if format is not found
521  detail::set_format(format, filename);
522 
523  // buffer first record
524  read_next_record();
525  }
526  /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
527  * A combination of default template parameters and auto-deduction guides works as expected,
528  * independent of whether the second/optional parameter is specified or not, i.e. it is possible
529  * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
530  * is specified and use the default otherwise.
531  */
532 
// Construct from an existing stream (lvalue) and with a specified format.
// The two tag parameters are unnamed outside of documentation builds (SEQAN3_DOXYGEN_ONLY);
// the format is taken from the type `file_format`, which must be one of valid_formats.
547  template <IStream2 stream_t,
548  SequenceFileInputFormat file_format>
549  sequence_file_input(stream_t & stream,
550  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
551  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
552  primary_stream{&stream, stream_deleter_noop}, // stores the address of the caller's stream with the no-op deleter
553  format{detail::sequence_file_input_format<file_format>{}}
554  {
555  static_assert(meta::in<valid_formats, file_format>::value,
556  "You selected a format that is not in the valid_formats of this file.");
557 
558  // possibly add intermediate compression stream
559  secondary_stream = detail::make_secondary_istream(*primary_stream);
560 
561  // buffer first record
562  read_next_record();
563  }
564 
// Construct from a temporary stream and with a specified format.
// Same as the lvalue-stream constructor, except that the stream is moved into a heap-allocated
// stream_t which is handed to primary_stream together with the deleting deleter.
566  template <IStream2 stream_t,
567  SequenceFileInputFormat file_format>
568  sequence_file_input(stream_t && stream,
569  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
570  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
571  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default}, // new'ed copy, released via stream_deleter_default
572  format{detail::sequence_file_input_format<file_format>{}}
573  {
574  static_assert(meta::in<valid_formats, file_format>::value,
575  "You selected a format that is not in the valid_formats of this file.");
576 
577  // possibly add intermediate compression stream
578  secondary_stream = detail::make_secondary_istream(*primary_stream);
579 
580  // buffer first record
581  read_next_record();
582  }
584 
// Returns an iterator to the current position in the file; the iterator is constructed from *this.
602  iterator begin() noexcept
603  {
604  return {*this};
605  }
606 
// Returns a value-initialised sentinel for comparison with the iterator.
620  sentinel end() noexcept
621  {
622  return {};
623  }
624 
// Returns the record we are currently at in the file, i.e. the internal record_buffer.
648  reference front() noexcept
649  {
650  return record_buffer;
651  }
653 
// Column access by field id: returns a reference to the column of field `f` in file.columns_buffer.
// Fails to compile if `f` is not among the selected fields.
658  template <field f>
660  friend auto & get(sequence_file_input & file)
661  {
662  static_assert(sequence_file_input::selected_field_ids::contains(f),
663  "You requested a field via get that was not selected for the file.");
664 
     // Pull the remaining records of the file into the column buffers before handing out the column.
     // NOTE(review): this runs on every call; presumably later calls find no records left - confirm.
665  file.read_columns();
666 
667  return seqan3::get<f>(file.columns_buffer);
668  }
669 
// Rvalue overload of the field-id access. `file` is a named parameter and therefore an lvalue here,
// so get<f>(file) selects the lvalue overload; its result is then cast to an rvalue reference.
671  template <field f>
672  friend auto && get(sequence_file_input && file)
673  {
674  return std::move(get<f>(file));
675  }
676 
// Column access by position: returns a reference to the i-th element of file.columns_buffer.
// Fails to compile if `i` is not smaller than the number of selected fields.
678  template <size_t i>
679  friend auto & get(sequence_file_input & file)
680  {
681  static_assert(i < sequence_file_input::selected_field_ids::as_array.size(),
682  "You requested a field number larger than the number of selected fields for the file.");
     // Pull the remaining records of the file into the column buffers first.
683  file.read_columns();
684 
685  return std::get<i>(file.columns_buffer);
686  }
687 
// Rvalue overload of the positional access; forwards to the lvalue overload and casts the result
// to an rvalue reference.
689  template <size_t i>
690  friend auto && get(sequence_file_input && file)
691  {
692  return std::move(get<i>(file));
693  }
694 
// Column access by type: returns a reference to the element of type `t` in file.columns_buffer.
696  template <typename t>
697  friend auto & get(sequence_file_input & file)
698  {
     // Pull the remaining records of the file into the column buffers first.
699  file.read_columns();
700 
701  return std::get<t>(file.columns_buffer);
702  }
703 
// Rvalue overload of the type-based access; forwards to the lvalue overload and casts the result
// to an rvalue reference.
705  template <typename t>
706  friend auto && get(sequence_file_input && file)
707  {
708  return std::move(get<t>(file));
709  }
711 
// The options are public and their members can be set directly; the object is passed to the
// format's read() call. The second template argument is true iff field::SEQ_QUAL is selected.
713  sequence_file_input_options<typename traits_type::sequence_legal_alphabet,
714  selected_field_ids::contains(field::SEQ_QUAL)> options;
715 
716 protected:
718 
// Buffer for the current record; cleared and refilled by read_next_record(), returned by front().
721  record_type record_buffer;
// Buffer for the column-wise view of the file; filled by read_columns().
724  file_as_tuple_type columns_buffer;
726 
// Deleter that does nothing; used when the stream is not owned by this object.
734  static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {}
// Deleter that calls delete on the stream; used when the stream was allocated by this object.
736  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; }
737 
// The stream given by the user or opened from the file name.
739  stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
// The stream that records are actually read from; created from the primary stream and
// possibly an intermediate compression stream.
741  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
742 
// Set to true by read_next_record() once nothing further can be read.
744  bool at_end{false};
745 
// Variant over the format handlers of all valid_formats; visited when reading a record.
747  using format_type = typename detail::variant_from_tags<valid_formats, detail::sequence_file_input_format>::type;
749  format_type format;
751 
753  void read_next_record()
754  {
755  // clear the record
756  record_buffer.clear();
757 
758  // at end if we could not read further
759  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
761  {
762  at_end = true;
763  return;
764  }
765 
766  assert(!format.valueless_by_exception());
767  std::visit([&] (auto & f)
768  {
769  // read new record
770  if constexpr (selected_field_ids::contains(field::SEQ_QUAL))
771  {
772  f.read(*secondary_stream,
773  options,
774  detail::get_or_ignore<field::SEQ_QUAL>(record_buffer),
775  detail::get_or_ignore<field::ID>(record_buffer),
776  detail::get_or_ignore<field::SEQ_QUAL>(record_buffer));
777  }
778  else
779  {
780  f.read(*secondary_stream,
781  options,
782  detail::get_or_ignore<field::SEQ>(record_buffer),
783  detail::get_or_ignore<field::ID>(record_buffer),
784  detail::get_or_ignore<field::QUAL>(record_buffer));
785  }
786  }, format);
787  }
788 
// Reads all remaining records of the file and appends their fields to the column buffers.
790  void read_columns()
791  {
792  //TODO don't do multiple visits
793  //TODO create specialised version for concatenated_sequences where we append on the concat
     // One reference per possible column; presumably get_or_ignore yields a placeholder for
     // fields that were not selected - confirm against detail::get_or_ignore.
794  auto & sequence_column_buffer = detail::get_or_ignore<field::SEQ>(columns_buffer);
795  auto & id_column_buffer = detail::get_or_ignore<field::ID>(columns_buffer);
796  auto & qual_column_buffer = detail::get_or_ignore<field::QUAL>(columns_buffer);
797  auto & seq_qual_column_buffer = detail::get_or_ignore<field::SEQ_QUAL>(columns_buffer);
798 
799  // read the remaining records and split into column buffers
800  for (auto & rec : *this)
801  {
     // Only selected fields are touched; each field is moved out of the record into its column.
802  if constexpr (selected_field_ids::contains(field::SEQ))
803  sequence_column_buffer.push_back(std::move(seqan3::get<field::SEQ>(rec)));
804  if constexpr (selected_field_ids::contains(field::ID))
805  id_column_buffer.push_back(std::move(seqan3::get<field::ID>(rec)));
806  if constexpr (selected_field_ids::contains(field::QUAL))
807  qual_column_buffer.push_back(std::move(seqan3::get<field::QUAL>(rec)));
808  if constexpr (selected_field_ids::contains(field::SEQ_QUAL))
809  seq_qual_column_buffer.push_back(std::move(seqan3::get<field::SEQ_QUAL>(rec)));
810  }
811  }
812 
// Befriend the iterator type so it can reach non-public members of the file.
// NOTE(review): presumably it calls read_next_record() and inspects at_end - confirm in in_file_iterator.
814  friend iterator;
815 };
816 
822 template <IStream2 stream_type,
824  SequenceFileInputFormat file_format,
825  detail::Fields selected_field_ids>
826 sequence_file_input(stream_type && stream,
827  file_format const &,
828  selected_field_ids const &)
830  selected_field_ids,
831  type_list<file_format>,
833 
835 template <IStream2 stream_type,
836  SequenceFileInputFormat file_format,
837  detail::Fields selected_field_ids>
838 sequence_file_input(stream_type & stream,
839  file_format const &,
840  selected_field_ids const &)
842  selected_field_ids,
843  type_list<file_format>,
846 
847 } // namespace seqan3
848 
849 // ------------------------------------------------------------------
850 // std-overloads for the tuple-like interface
851 // ------------------------------------------------------------------
852 
853 namespace std
854 {
// std::tuple_size specialisation for seqan3::sequence_file_input (part of the tuple-like interface).
860 template <seqan3::SequenceFileInputTraits traits_type,
861  seqan3::detail::Fields selected_field_ids,
862  seqan3::detail::TypeListOfSequenceFileInputFormats valid_formats,
863  seqan3::Char stream_char_t>
864 struct tuple_size<seqan3::sequence_file_input<traits_type, selected_field_ids, valid_formats, stream_char_t>>
865 {
     // The tuple size equals the number of selected fields.
867  static constexpr size_t value = selected_field_ids::as_array.size();
868 };
869 
875 template <size_t elem_no,
877  seqan3::detail::Fields selected_field_ids,
878  seqan3::detail::TypeListOfSequenceFileInputFormats valid_formats,
879  seqan3::Char stream_char_t>
880 struct tuple_element<elem_no, seqan3::sequence_file_input<traits_type, selected_field_ids, valid_formats, stream_char_t>>
881  : tuple_element<elem_no, typename seqan3::sequence_file_input<traits_type,
882  selected_field_ids,
883  valid_formats,
884  stream_char_t>::file_as_tuple_type>
885 {};
886 
887 } // namespace std
Provides quality alphabet composites.
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:510
Provides seqan3::SequenceFileInputFormat and auxiliary classes.
type_list< sequence_column_type, id_column_type, quality_column_type, sequence_quality_column_type > field_column_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:448
Provides the seqan3::format_embl tag and the seqan3::sequence_file_input_format and seqan3::sequence_...
T visit(T... args)
The "sequence", usually a range of nucleotides or amino acids.
The FastQ format. (tag)
Definition: format_fastq.hpp:83
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
Provides exceptions used in the I/O module.
The SAM format (tag).
Definition: format_sam.hpp:126
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:473
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Definition: input.hpp:568
The EMBL format (tag).
Definition: format_embl.hpp:75
Provides the seqan3::format_fastq tag and the seqan3::sequence_file_input_format and seqan3::sequence...
The requirements a traits_type for seqan3::sequence_file_input must meet.
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:648
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file...
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:152
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:376
This concept encompasses exactly the types char, signed char, unsigned char, wchar_t, char16_t and char32_t.
Provides seqan3::aa27, container aliases and string literals.
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:372
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:549
SeqAn specific customisations in the standard namespace.
sequence_file_input_options< typename traits_type::sequence_legal_alphabet, selected_field_ids::contains(field::SEQ_QUAL)> options
The options are public and its members can be set directly.
Definition: input.hpp:714
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:471
typename traits_type::template quality_container_container< quality_type > quality_column_type
Column type of field::QUAL (seqan3::concatenated_sequences<quality_type> by default).
Definition: input.hpp:441
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:48
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:189
The main SeqAn3 namespace.
typename traits_type::template sequence_container_container< sequence_quality_type > sequence_quality_column_type
Column type of field::SEQ_QUAL (seqan3::concatenated_sequences<sequence_quality_type> by default)...
Definition: input.hpp:443
The qualities, usually in phred-score notation.
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:620
Joins an arbitrary alphabet with a quality alphabet.
Definition: qualified.hpp:59
Provides seqan3::concatenated_sequences.
iterator begin() noexcept
Returns an iterator to current position in the file.
Definition: input.hpp:602
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:43
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::QUAL (std::vector <seqan3::phred42> by default).
Definition: input.hpp:417
A class template that holds a choice of seqan3::field.
Definition: record.hpp:127
Container that stores sequences concatenated internally.
Definition: concatenated_sequences.hpp:89
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:48
The options type defines various option members that influence the behaviour of all or some formats...
Definition: input_options.hpp:25
Sequence and qualities combined in one range.
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:374
Provides alphabet adaptations for standard char types.
A traits type that specifies input as amino acids.
Definition: input.hpp:199
record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:428
Provides the seqan3::record template and the seqan3::field enum.
Provides various utility functions required only for input.
The identifier, usually a string.
The FastA format (tag).
Definition: format_fasta.hpp:79
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:465
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::SEQ (std::vector <seqan3::dna5> by default).
Definition: input.hpp:411
typename traits_type::template sequence_container< qualified< typename traits_type::sequence_alphabet, typename traits_type::quality_alphabet > > sequence_quality_type
The type of field::SEQ_QUAL (std::vector <seqan3::dna5q> by default).
Definition: input.hpp:421
type_list< sequence_type, id_type, quality_type, sequence_quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:424
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:467
Stream concepts.
typename traits_type::template sequence_container_container< sequence_type > sequence_column_type
Column type of field::SEQ (seqan3::concatenated_sequences<sequence_type> by default).
Definition: input.hpp:437
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::ID (std::string by default).
Definition: input.hpp:414
Provides seqan3::phred42 quality scores.
Provides various type traits on generic types.
::ranges::default_sentinel_t default_sentinel_t
Alias for ranges::default_sentinel_t. Type of ranges::default_sentinel.
Definition: iterator:351
Meta-header for the nucleotide submodule; includes all headers from alphabet/nucleotide/.
meta::list< types... > type_list
Type that contains multiple types, an alias for meta::list.
Definition: type_list.hpp:27
stream_char_type_ stream_char_type
Character type of the stream(s), usually char.
Definition: input.hpp:378
fields< field::SEQ, field::ID, field::QUAL, field::SEQ_QUAL > field_ids
The subset of seqan3::field IDs that are valid for this file; order corresponds to the types in field...
Definition: input.hpp:384
typename traits_type::template id_container_container< id_type > id_column_type
Column type of field::ID (seqan3::concatenated_sequences<id_type> by default).
Definition: input.hpp:439
Provides the seqan3::sequence_file_format_genbank class.
field
An enumerator for the fields used in file formats. Some of the fields are shared between formats...
Definition: record.hpp:63
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:364
Provides the seqan3::format_sam tag and the seqan3::sequence_file_input_format and seqan3::sequence_f...
std::ranges::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:475
Quality type for traditional Sanger and modern Illumina Phred scores (typical range).
Definition: phred42.hpp:43
Provides the seqan3::detail::in_file_iterator class template.
Provides the seqan3::format_fasta tag and the seqan3::sequence_file_input_format and seqan3::sequence...
The GenBank format (tag).
Definition: format_genbank.hpp:72
~sequence_file_input()=default
Destructor is defaulted.
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:174
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...