SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <fstream>
17 #include <limits>
18 #include <optional>
19 #include <string>
20 #include <type_traits>
21 #include <utility>
22 #include <variant>
23 #include <vector>
24 
25 // remove the following after range-v3 is updated to 1.0
26 #pragma GCC diagnostic push
27 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
28 
36 #include <seqan3/io/exception.hpp>
37 #include <seqan3/std/filesystem>
38 #include <seqan3/io/record.hpp>
41 #include <seqan3/io/detail/record.hpp>
46 
47 namespace seqan3
48 {
49 // ----------------------------------------------------------------------------
50 // structure_file_input_traits
51 // ----------------------------------------------------------------------------
52 
142 template <typename t>
145 SEQAN3_CONCEPT structure_file_input_traits = requires(t v)
146 {
147  // TODO(joergi-w) The expensive concept checks are currently omitted. Check again when compiler has improved.
148  // sequence
153 
154  // id
157 
158  // bpp
159  requires std::is_floating_point_v<typename t::bpp_prob>;
161 
162 // requires container // TODO check Associative container Concept when implemented
163 // <typename t::template bpp_queue
164 // <typename t::template bpp_item
165 // <typename t::bpp_prob, typename t::bpp_partner>>>
166 // && requires(typename t::template bpp_queue // TODO maybe implement also a version that allows emplace_back
167 // <typename t::template bpp_item
168 // <typename t::bpp_prob, typename t::bpp_partner>> value) { value.emplace(1.0, 1); };
169 // requires sequence_container
170 // <typename t::template bpp_container
171 // <typename t::template bpp_queue
172 // <typename t::template bpp_item
173 // <typename t::bpp_prob, typename t::bpp_partner>>>>;
174 
175  // structure
176  requires std::is_same_v<typename t::structure_alphabet, dssp9> // TODO(joergi-w) add aa_structure_concept
179 
180  // structured sequence: tuple composites of seq and structure
181  requires std::is_base_of_v<alphabet_tuple_base
182  <typename t::template structured_seq_alphabet
183  <typename t::seq_alphabet, typename t::structure_alphabet>,
184  typename t::seq_alphabet, typename t::structure_alphabet>,
185  typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>>;
186 // requires sequence_container
187 // <typename t::template structured_seq_container
188 // <typename t::template structured_seq_alphabet
189 // <typename t::seq_alphabet, typename t::structure_alphabet>>>;
190 
191  // energy: std::optional of floating point number
192  requires std::is_floating_point_v<typename t::energy_type::value_type>;
193 
194  // reactivity [error]
195  requires std::is_floating_point_v<typename t::react_type>;
197 
198  // comment
201 
202  // offset
204 };
206 
207 // ----------------------------------------------------------------------------
208 // structure_file_input_default_traits
209 // ----------------------------------------------------------------------------
210 
225 {
231  // sequence
232 
235 
238 
240  template <typename _seq_alphabet>
242 
243  // id
244 
246  using id_alphabet = char;
247 
249  template <typename _id_alphabet>
251 
252  // base pair probability structure
253 
255  using bpp_prob = double;
256 
258  using bpp_partner = size_t;
259 
261  template <typename _bpp_prob, typename _bpp_partner>
263 
265  template <typename _bpp_item>
267 
269  template <typename _bpp_queue>
271 
272  // fixed structure
273 
275  using structure_alphabet = wuss51;
276 
278  template <typename _structure_alphabet>
280 
281  // combined sequence and structure
282 
284  template <typename _seq_alphabet, typename _structure_alphabet>
286 
288  template <typename _structured_seq_alphabet>
290 
291  // energy
292 
295 
296  // reactivity [error]
297 
299  using react_type = double;
300 
302  template <typename _react_type>
304 
305  // comment
306 
308  using comment_alphabet = char;
309 
311  template <typename _comment_alphabet>
313 
314  // offset
315 
317  using offset_type = size_t;
319 };
320 
324 {
330  using seq_alphabet = aa27;
337  template <typename _seq_alphabet, typename _structure_alphabet>
340 };
341 
342 // ----------------------------------------------------------------------------
343 // structure_file_input
344 // ----------------------------------------------------------------------------
345 
466  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::structure>,
467  detail::type_list_of_structure_file_input_formats valid_formats_ = type_list<format_vienna>>
469 {
470 public:
475  using traits_type = traits_type_;
478  using selected_field_ids = selected_field_ids_;
480  using valid_formats = valid_formats_;
482  using stream_char_type = char;
484 
488  using field_ids = fields<field::seq,
489  field::id,
490  field::bpp,
494  field::react,
498 
499  static_assert([]() constexpr
500  {
501  for (field f : selected_field_ids::as_array)
502  if (!field_ids::contains(f))
503  return false;
504  return true;
505  }(),
506  "You selected a field that is not valid for structure files, please refer to the documentation "
507  "of structure_file_input::field_ids for the accepted values.");
508 
509  static_assert([]() constexpr
510  {
514  }(), "You may not select field::structured_seq and either of field::seq and field::structure "
515  "at the same time.");
516 
522  using seq_type = typename traits_type::template seq_container<typename traits_type::seq_alphabet>;
525  using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>;
527  using bpp_type = typename traits_type::template bpp_container
528  <typename traits_type::template bpp_queue
529  <typename traits_type::template bpp_item
530  <typename traits_type::bpp_prob, typename traits_type::bpp_partner>>>;
532  using structure_type = typename traits_type::template structure_container
533  <typename traits_type::structure_alphabet>;
535  using structured_seq_type = typename traits_type::template structured_seq_container
536  <typename traits_type::template structured_seq_alphabet
537  <typename traits_type::seq_alphabet, typename traits_type::structure_alphabet>>;
539  using energy_type = typename traits_type::energy_type;
541  using react_type = typename traits_type::template react_container<typename traits_type::react_type>;
543  using comment_type = typename traits_type::template comment_container
544  <typename traits_type::comment_alphabet>;
546  using offset_type = typename traits_type::offset_type;
547 
551 
556 
561  using value_type = record_type;
566  using const_reference = void;
568  using size_type = size_t;
572  using iterator = detail::in_file_iterator<structure_file_input>;
574  using const_iterator = void;
576  using sentinel = std::ranges::default_sentinel_t;
578 
582  structure_file_input() = delete;
585  structure_file_input(structure_file_input const &) = delete;
593  ~structure_file_input() = default;
594 
612  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
613  primary_stream{new std::ifstream{filename, std::ios_base::in | std::ios::binary}, stream_deleter_default}
614  {
615  if (!primary_stream->good())
616  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
617 
618  // possibly add intermediate decompression stream
619  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
620 
621  // initialise format handler
622  detail::set_format(format, filename);
623  }
624 
640  template <input_stream stream_t, structure_file_input_format file_format>
644  structure_file_input(stream_t & stream,
645  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
646  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
647  primary_stream{&stream, stream_deleter_noop},
648  format{detail::structure_file_input_format_exposer<file_format>{}}
649  {
650  static_assert(list_traits::contains<file_format, valid_formats>,
651  "You selected a format that is not in the valid_formats of this file.");
652 
653  // possibly add intermediate decompression stream
654  secondary_stream = detail::make_secondary_istream(*primary_stream);
655  }
656 
658  template <input_stream stream_t, structure_file_input_format file_format>
662  structure_file_input(stream_t && stream,
663  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
664  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
665  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
666  format{detail::structure_file_input_format_exposer<file_format>{}}
667  {
668  static_assert(list_traits::contains<file_format, valid_formats>,
669  "You selected a format that is not in the valid_formats of this file.");
670 
671  // possibly add intermediate decompression stream
672  secondary_stream = detail::make_secondary_istream(*primary_stream);
673  }
675 
695  {
696  // buffer first record
697  if (!first_record_was_read)
698  {
699  read_next_record();
700  first_record_was_read = true;
701  }
702 
703  return {*this};
704  }
705 
719  sentinel end() noexcept // Returns the sentinel that iterators of this file are compared against.
720  {
721  return {}; // value-initialised sentinel (sentinel is std::ranges::default_sentinel_t)
722  }
723 
747  reference front() noexcept // Returns the result of dereferencing begin().
748  {
749  return *begin(); // begin() calls read_next_record() if no record was read yet. NOTE(review): that path invokes the format's read_structure_record(); confirm it cannot throw, since this function is noexcept.
750  }
752 
754  structure_file_input_options<typename traits_type::seq_legal_alphabet,
756 
757 protected:
759 
762  record_type record_buffer;
765 
773  static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {} // Deleter that does nothing; used when the stream pointer refers to a caller-provided lvalue stream.
775  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; } // Deleter that deletes the stream; used when the stream was allocated with new by a constructor.
776 
778  stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
780  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
781 
783  bool first_record_was_read{false};
785  bool at_end{false};
786 
788  using format_type = typename detail::variant_from_tags<valid_formats,
789  detail::structure_file_input_format_exposer>::type;
791  format_type format;
793 
795  void read_next_record()
796  {
797  // clear the record
798  record_buffer.clear();
799 
800  // at end if we could not read further
801  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
803  {
804  at_end = true;
805  return;
806  }
807 
808  assert(!format.valueless_by_exception());
809  std::visit([&] (auto & f)
810  {
811  // read new record
813  {
815  "You may not select field::structured_seq and field::structure at the same time.");
817  "You may not select field::structured_seq and field::seq at the same time.");
818  f.read_structure_record(*secondary_stream,
819  options,
820  detail::get_or_ignore<field::structured_seq>(record_buffer), // seq
821  detail::get_or_ignore<field::id>(record_buffer),
822  detail::get_or_ignore<field::bpp>(record_buffer),
823  detail::get_or_ignore<field::structured_seq>(record_buffer), // structure
824  detail::get_or_ignore<field::energy>(record_buffer),
825  detail::get_or_ignore<field::react>(record_buffer),
826  detail::get_or_ignore<field::react_err>(record_buffer),
827  detail::get_or_ignore<field::comment>(record_buffer),
828  detail::get_or_ignore<field::offset>(record_buffer));
829  }
830  else
831  {
832  f.read_structure_record(*secondary_stream,
833  options,
834  detail::get_or_ignore<field::seq>(record_buffer),
835  detail::get_or_ignore<field::id>(record_buffer),
836  detail::get_or_ignore<field::bpp>(record_buffer),
837  detail::get_or_ignore<field::structure>(record_buffer),
838  detail::get_or_ignore<field::energy>(record_buffer),
839  detail::get_or_ignore<field::react>(record_buffer),
840  detail::get_or_ignore<field::react_err>(record_buffer),
841  detail::get_or_ignore<field::comment>(record_buffer),
842  detail::get_or_ignore<field::offset>(record_buffer));
843  }
844  }, format);
845  }
846 
848  friend iterator;
849 };
850 
856 template <input_stream stream_type,
858  structure_file_input_format file_format,
859  detail::fields_specialisation selected_field_ids>
860 structure_file_input(stream_type && stream, file_format const &, selected_field_ids const &)
862  selected_field_ids,
863  type_list<file_format>>;
864 
866 template <input_stream stream_type,
867  structure_file_input_format file_format,
868  detail::fields_specialisation selected_field_ids>
869 structure_file_input(stream_type & stream, file_format const &, selected_field_ids const &)
871  selected_field_ids,
872  type_list<file_format>>;
874 
875 } // namespace seqan3
876 
877 #pragma GCC diagnostic pop
seqan3::structure_file_input_default_traits_rna::bpp_prob
double bpp_prob
The type for a base pair probability is double.
Definition: input.hpp:255
rna15.hpp
Provides seqan3::rna15, container aliases and string literals.
seqan3::structure_file_input::offset_type
typename traits_type::offset_type offset_type
The type of the offset field (default size_t).
Definition: input.hpp:546
seqan3::structure_file_input::const_iterator
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:574
seqan3::field::seq
The "sequence", usually a range of nucleotides or amino acids.
format_vienna.hpp
Provides the seqan3::format_vienna.
fstream
std::basic_string
sequence_container
A more refined container concept than seqan3::container.
utility
seqan3::structure_file_input::traits_type
traits_type_ traits_type
A traits type that defines aliases and templates for storage of the fields.
Definition: input.hpp:476
seqan3::structure_file_input_default_traits_aa
A traits type that specifies input as amino acids.
Definition: input.hpp:323
seqan3::structure_file_input::structure_file_input
structure_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:644
structure_file_input_traits
The requirements a traits_type for seqan3::structure_file_input must meet.
seqan3::type_list
meta::list< types... > type_list
Type that contains multiple types, an alias for meta::list.
Definition: type_list.hpp:31
seqan3::structure_file_input::id_type
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of the ID field (default std::string).
Definition: input.hpp:525
seqan3::dssp9
The protein structure alphabet of the characters "HGIEBTSCX".
Definition: dssp9.hpp:60
seqan3::field::offset
Sequence (SEQ) relative start position (0-based), unsigned value.
concept.hpp
Stream concepts.
seqan3::structure_file_input::field_types
type_list< seq_type, id_type, bpp_type, structure_type, structured_seq_type, energy_type, react_type, react_type, comment_type, offset_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:550
std::pair
seqan3::structure_file_input::record_type
record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:554
seqan3::field::bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
seqan3::structure_file_input::seq_type
typename traits_type::template seq_container< typename traits_type::seq_alphabet > seq_type
The type of the sequence field (default std::vector of seqan3::rna5).
Definition: input.hpp:523
seqan3::structure_file_input::iterator
detail::in_file_iterator< structure_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:572
seqan3::structure_file_input::comment_type
typename traits_type::template comment_container< typename traits_type::comment_alphabet > comment_type
The type of the comment field (default std::string).
Definition: input.hpp:544
vector
explicitly_convertible_to
Resolves to std::ranges::explicitly_convertible_to<type1, type2>().
seqan3::structure_file_input::size_type
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:568
seqan3::field::id
The identifier, usually a string.
seqan3::views::move
const auto move
A view that turns lvalue-references into rvalue-references.
Definition: move.hpp:68
input_format_concept.hpp
Provides seqan3::structure_file_input_format.
record.hpp
Provides the seqan3::record template and the seqan3::field enum.
std::function
seqan3::field::structure
Fixed interactions, usually a string of structure alphabet characters.
filesystem
This header includes C++17 filesystem support and imports it into namespace seqan3::filesystem (indep...
structure_file_input_format
The generic concept for structure file input formats.
seqan3::structure_file_input::begin
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:694
std::filesystem::path
seqan3::structure_file_input::sentinel
std::ranges::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:576
seqan3::pack_traits::contains
constexpr bool contains
Whether a type occurs in a pack or not.
Definition: traits.hpp:193
seqan3::structure_file_input::front
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:747
seqan3::fields
A class template that holds a choice of seqan3::field.
Definition: record.hpp:165
seqan3::field::energy
Energy of a folded sequence, represented by one float number.
seqan3::structure_file_input::structured_seq_type
typename traits_type::template structured_seq_container< typename traits_type::template structured_seq_alphabet< typename traits_type::seq_alphabet, typename traits_type::structure_alphabet > > structured_seq_type
The type of the sequence-structure field (default std::vector of structured_rna<rna5,...
Definition: input.hpp:537
seqan3::structure_file_input::stream_char_type
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:482
seqan3::structure_file_input_default_traits_rna::bpp_partner
size_t bpp_partner
The type for the partner position of a base pair probability is size_t.
Definition: input.hpp:258
same_as
The concept std::same_as<T, U> is satisfied if and only if T and U denote the same type.
rna_structure_alphabet
A concept that indicates whether an alphabet represents RNA structure.
input_options.hpp
Provides seqan3::structure_file_input_options.
all.hpp
Meta-header for the structure module. It includes all headers from alphabet/structure/.
seqan3::rna15
The 15 letter RNA alphabet, containing all IUPAC symbols minus the gap.
Definition: rna15.hpp:48
seqan3::structure_file_input_default_traits_rna::id_alphabet
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:246
seqan3::structure_file_input::bpp_type
typename traits_type::template bpp_container< typename traits_type::template bpp_queue< typename traits_type::template bpp_item< typename traits_type::bpp_prob, typename traits_type::bpp_partner > >> bpp_type
The type of the base pair probabilities (default std::vector of std::set<std::pair<double,...
Definition: input.hpp:530
seqan3::structure_file_input::~structure_file_input
~structure_file_input()=default
Destructor is defaulted.
seqan3::field::comment
Comment field of arbitrary content, usually a string.
exception.hpp
Provides exceptions used in the I/O module.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
seqan3::structure_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
seqan3::structure_file_input::energy_type
typename traits_type::energy_type energy_type
The type of the energy field (default std::optional<double>).
Definition: input.hpp:539
seqan3::structure_file_input_default_traits_rna::react_type
double react_type
The type of the reactivity and reactivity error is double.
Definition: input.hpp:299
std::istreambuf_iterator
seqan3::rna5
The five letter RNA alphabet of A,C,G,U and the unknown character N.
Definition: rna5.hpp:46
std::make_signed_t
seqan3::field::structured_seq
Sequence and fixed interactions combined in one range.
seqan3::structure_file_input
A class for reading structured sequence files, e.g. Stockholm, Connect, Vienna, ViennaRNA bpp matrix ...
Definition: input.hpp:468
misc_input.hpp
Provides various utility functions required only for input.
seqan3::structure_file_input::operator=
structure_file_input & operator=(structure_file_input const &)=delete
Copy assignment is explicitly deleted, because you cannot have multiple access to the same file.
char.hpp
Provides alphabet adaptations for standard char types.
concatenated_sequences.hpp
Provides seqan3::concatenated_sequences.
seqan3::structure_file_input_default_traits_rna::offset_type
size_t offset_type
The type of the offset is size_t.
Definition: input.hpp:317
seqan3::structure_file_input::valid_formats
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:480
seqan3::structure_file_input::selected_field_ids
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:478
seqan3::field::react
Reactivity values of the sequence characters given in a vector of float numbers.
limits
rna5.hpp
Provides seqan3::rna5, container aliases and string literals.
seqan3::structure_file_input::end
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:719
cassert
seqan3::structured_rna
A seqan3::alphabet_tuple_base that joins a nucleotide alphabet with an RNA structure alphabet.
Definition: structured_rna.hpp:52
seqan3::field
field
An enumerator for the fields used in file formats.
Definition: record.hpp:64
seqan3::structure_file_input::structure_file_input
structure_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
seqan3::record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids >
seqan3::structure_file_input_default_traits_rna::comment_alphabet
char comment_alphabet
The alphabet for a comment string is char.
Definition: input.hpp:308
std::visit
T visit(T... args)
all.hpp
Meta-header for the aminoacid submodule; includes all headers from alphabet/aminoacid/.
seqan3::structure_file_input::options
structure_file_input_options< typename traits_type::seq_legal_alphabet, selected_field_ids::contains(field::structured_seq)> options
The options are public and its members can be set directly.
Definition: input.hpp:755
seqan3::field::react_err
Reactivity error values given in a vector corresponding to REACT.
optional
seqan3::structure_file_input_default_traits_rna::structure_alphabet
wuss51 structure_alphabet
The alphabet for a structure annotation is seqan3::wuss51.
Definition: input.hpp:275
seqan3::structure_file_input::react_type
typename traits_type::template react_container< typename traits_type::react_type > react_type
The type of the reactivity and reactivity error fields (default std::vector of double).
Definition: input.hpp:541
in_file_iterator.hpp
Provides the seqan3::detail::in_file_iterator class template.
seqan3::structure_file_input::structure_file_input
structure_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:611
seqan3::structured_aa
A seqan3::alphabet_tuple_base that joins an aminoacid alphabet with a protein structure alphabet.
Definition: structured_aa.hpp:51
seqan3::structure_file_input::const_reference
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:566
seqan3::structure_file_input::structure_type
typename traits_type::template structure_container< typename traits_type::structure_alphabet > structure_type
The type of the structure field (default std::vector of seqan3::wuss51).
Definition: input.hpp:533
traits.hpp
Provides traits for seqan3::type_list.
std::basic_istream
seqan3::structure_file_input_default_traits_rna
The default traits for seqan3::structure_file_input.
Definition: input.hpp:224
writable_alphabet
Refines seqan3::alphabet and adds assignability.
std::unique_ptr< std::basic_istream< stream_char_type >, std::function< void(std::basic_istream< stream_char_type > *)> >
seqan3::aa27
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:43
std::numeric_limits
std::set
seqan3::structure_file_input::structure_file_input
structure_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:662
std::is_base_of_v
T is_base_of_v
variant
std::ifstream
string