SeqAn3  3.1.0-rc.1
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cassert>
16 #include <seqan3/std/filesystem>
17 #include <fstream>
18 #include <limits>
19 #include <optional>
20 #include <string>
21 #include <type_traits>
22 #include <utility>
23 #include <variant>
24 #include <vector>
25 
33 #include <seqan3/io/exception.hpp>
36 #include <seqan3/io/detail/record.hpp>
43 
44 namespace seqan3
45 {
46 // ----------------------------------------------------------------------------
47 // structure_file_input_traits
48 // ----------------------------------------------------------------------------
49 
141 template <typename t>
142 SEQAN3_CONCEPT structure_file_input_traits = requires(t v)
143 {
144  // TODO(joergi-w) The expensive concept checks are currently omitted. Check again when compiler has improved.
145  // sequence
150 
151  // id
154 
155  // bpp
156  requires std::is_floating_point_v<typename t::bpp_prob>;
158 
159 // requires container // TODO check Associative container Concept when implemented
160 // <typename t::template bpp_queue
161 // <typename t::template bpp_item
162 // <typename t::bpp_prob, typename t::bpp_partner>>>
163 // && requires(typename t::template bpp_queue // TODO maybe implement also a version that allows emplace_back
164 // <typename t::template bpp_item
165 // <typename t::bpp_prob, typename t::bpp_partner>> value) { value.emplace(1.0, 1); };
166 // requires sequence_container
167 // <typename t::template bpp_container
168 // <typename t::template bpp_queue
169 // <typename t::template bpp_item
170 // <typename t::bpp_prob, typename t::bpp_partner>>>>;
171 
172  // structure
173  requires std::is_same_v<typename t::structure_alphabet, dssp9> // TODO(joergi-w) add aa_structure_concept
176 
177  // structured sequence: tuple composites of seq and structure
178  requires std::is_base_of_v<alphabet_tuple_base
179  <typename t::template structured_seq_alphabet
180  <typename t::seq_alphabet, typename t::structure_alphabet>,
181  typename t::seq_alphabet, typename t::structure_alphabet>,
182  typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>>;
183 // requires sequence_container
184 // <typename t::template structured_seq_container
185 // <typename t::template structured_seq_alphabet
186 // <typename t::seq_alphabet, typename t::structure_alphabet>>>;
187 
188  // energy: std::optional of floating point number
189  requires std::is_floating_point_v<typename t::energy_type::value_type>;
190 
191  // reactivity [error]
192  requires std::is_floating_point_v<typename t::react_type>;
194 
195  // comment
198 
199  // offset
201 };
203 
204 // ----------------------------------------------------------------------------
205 // structure_file_input_default_traits
206 // ----------------------------------------------------------------------------
207 
222 {
228  // sequence
229 
232 
235 
237  template <typename _seq_alphabet>
239 
240  // id
241 
243  using id_alphabet = char;
244 
246  template <typename _id_alphabet>
248 
249  // base pair probability structure
250 
252  using bpp_prob = double;
253 
255  using bpp_partner = size_t;
256 
258  template <typename _bpp_prob, typename _bpp_partner>
260 
262  template <typename _bpp_item>
264 
266  template <typename _bpp_queue>
268 
269  // fixed structure
270 
272  using structure_alphabet = wuss51;
273 
275  template <typename _structure_alphabet>
277 
278  // combined sequence and structure
279 
281  template <typename _seq_alphabet, typename _structure_alphabet>
283 
285  template <typename _structured_seq_alphabet>
287 
288  // energy
289 
292 
293  // reactivity [error]
294 
296  using react_type = double;
297 
299  template <typename _react_type>
301 
302  // comment
303 
305  using comment_alphabet = char;
306 
308  template <typename _comment_alphabet>
310 
311  // offset
312 
314  using offset_type = size_t;
316 };
317 
321 {
334  template <typename _seq_alphabet, typename _structure_alphabet>
337 };
338 
339 // ----------------------------------------------------------------------------
340 // structure_file_input
341 // ----------------------------------------------------------------------------
342 
463  detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::structure>,
464  detail::type_list_of_structure_file_input_formats valid_formats_ = type_list<format_vienna>>
466 {
467 public:
473  using traits_type = traits_type_;
475  using selected_field_ids = selected_field_ids_;
477  using valid_formats = valid_formats_;
479  using stream_char_type = char;
481 
486  field::id,
487  field::bpp,
491  field::react,
494  field::offset>;
495 
// Compile-time check of the selected fields: the lambda is invoked immediately and returns false as
// soon as one entry of selected_field_ids::as_array is not contained in field_ids, which makes the
// static_assert fail with the message below.
496  static_assert([]() constexpr
497  {
498  for (field f : selected_field_ids::as_array)
499  if (!field_ids::contains(f))
500  return false;
501  return true;
502  }(),
503  "You selected a field that is not valid for structure files, please refer to the documentation "
504  "of structure_file_input::field_ids for the accepted values.");
505 
506  static_assert([]() constexpr
507  {
511  }(), "You may not select field::structured_seq and either of field::seq and field::structure "
512  "at the same time.");
513 
520  using seq_type = typename traits_type::template seq_container<typename traits_type::seq_alphabet>;
522  using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>;
524  using bpp_type = typename traits_type::template bpp_container
525  <typename traits_type::template bpp_queue
526  <typename traits_type::template bpp_item
527  <typename traits_type::bpp_prob, typename traits_type::bpp_partner>>>;
529  using structure_type = typename traits_type::template structure_container
530  <typename traits_type::structure_alphabet>;
532  using structured_seq_type = typename traits_type::template structured_seq_container
533  <typename traits_type::template structured_seq_alphabet
534  <typename traits_type::seq_alphabet, typename traits_type::structure_alphabet>>;
536  using energy_type = typename traits_type::energy_type;
538  using react_type = typename traits_type::template react_container<typename traits_type::react_type>;
540  using comment_type = typename traits_type::template comment_container
541  <typename traits_type::comment_alphabet>;
543  using offset_type = typename traits_type::offset_type;
544 
548 
553 
563  using const_reference = void;
565  using size_type = size_t;
569  using iterator = detail::in_file_iterator<structure_file_input>;
571  using const_iterator = void;
573  using sentinel = std::default_sentinel_t;
575 
591 
609  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
610  primary_stream{new std::ifstream{}, stream_deleter_default}
611  {
612  primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
613  static_cast<std::basic_ifstream<char> *>(primary_stream.get())->open(filename,
614  std::ios_base::in | std::ios::binary);
615 
616  if (!primary_stream->good())
617  throw file_open_error{"Could not open file " + filename.string() + " for reading."};
618 
619  // possibly add intermediate decompression stream
620  secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
621 
622  // initialise format handler
623  detail::set_format(format, filename);
624  }
625 
// Construct from an existing stream (taken by lvalue reference) and with specified format.
// The address of `stream` is stored in primary_stream together with stream_deleter_noop, so the
// caller's stream object is not deleted through that deleter.
// The format member is initialised from the type `file_format` only; the format and fields arguments
// are not read in the body (their names are wrapped in SEQAN3_DOXYGEN_ONLY -- presumably they are
// unnamed outside of documentation builds; TODO confirm against the macro definition).
// Only streams whose char_type is `char` are accepted.
641  template <input_stream stream_t, structure_file_input_format file_format>
643  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
645  structure_file_input(stream_t & stream,
646  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
647  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
648  primary_stream{&stream, stream_deleter_noop},
649  format{detail::structure_file_input_format_exposer<file_format>{}}
650  {
     // reject formats that are not part of this file's valid_formats at compile time
651  static_assert(list_traits::contains<file_format, valid_formats>,
652  "You selected a format that is not in the valid_formats of this file.");
653 
654  // possibly add intermediate decompression stream
655  secondary_stream = detail::make_secondary_istream(*primary_stream);
656  }
657 
// Construct from a temporary stream and with specified format.
// The stream is moved into a newly allocated stream_t that is stored in primary_stream together
// with stream_deleter_default, which deletes the pointer it is given.
// The format member is initialised from the type `file_format` only; the format and fields arguments
// are not read in the body.
// Only streams whose char_type is `char` are accepted.
659  template <input_stream stream_t, structure_file_input_format file_format>
661  requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
663  structure_file_input(stream_t && stream,
664  file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
665  selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
666  primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
667  format{detail::structure_file_input_format_exposer<file_format>{}}
668  {
     // reject formats that are not part of this file's valid_formats at compile time
669  static_assert(list_traits::contains<file_format, valid_formats>,
670  "You selected a format that is not in the valid_formats of this file.");
671 
672  // possibly add intermediate decompression stream
673  secondary_stream = detail::make_secondary_istream(*primary_stream);
674  }
676 
696  {
697  // buffer first record
698  if (!first_record_was_read)
699  {
700  read_next_record();
701  first_record_was_read = true;
702  }
703 
704  return {*this};
705  }
706 
// Returns a sentinel for comparison with iterator.
// The sentinel is value-initialised and carries no state; `sentinel` is std::default_sentinel_t.
720  sentinel end() noexcept
721  {
722  return {};
723  }
724 
// Return the record we are currently at in the file.
// Implemented by dereferencing the iterator returned by begin().
// NOTE(review): this function is declared noexcept but calls begin(), whose visible body calls
// read_next_record() when no record has been buffered yet. If reading or parsing can throw,
// the exception would hit the noexcept boundary and terminate the program -- confirm whether
// noexcept is intended here.
748  reference front() noexcept
749  {
750  return *begin();
751  }
753 
755  structure_file_input_options<typename traits_type::seq_legal_alphabet,
757 
758 protected:
760 
764  record_type record_buffer;
766  std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
768 
// Deleter that does nothing; it is paired with stream pointers that refer to a stream passed in by
// lvalue reference (see the constructor taking `stream_t &`) and as the initial deleter of the
// null primary_stream / secondary_stream members.
776  static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {}
// Deleter that calls `delete` on the given pointer; it is paired with streams that were allocated
// with `new` by the constructors of this class.
778  static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; }
779 
781  stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
783  stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
784 
786  bool first_record_was_read{false};
788  bool at_end{false};
789 
791  using format_type = typename detail::variant_from_tags<valid_formats,
792  detail::structure_file_input_format_exposer>::type;
794  format_type format;
796 
798  void read_next_record()
799  {
800  // clear the record
801  record_buffer.clear();
802 
803  // at end if we could not read further
804  if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
806  {
807  at_end = true;
808  return;
809  }
810 
811  assert(!format.valueless_by_exception());
812  std::visit([&] (auto & f)
813  {
814  // read new record
816  {
818  "You may not select field::structured_seq and field::structure at the same time.");
820  "You may not select field::structured_seq and field::seq at the same time.");
821  f.read_structure_record(*secondary_stream,
822  options,
823  detail::get_or_ignore<field::structured_seq>(record_buffer), // seq
824  detail::get_or_ignore<field::id>(record_buffer),
825  detail::get_or_ignore<field::bpp>(record_buffer),
826  detail::get_or_ignore<field::structured_seq>(record_buffer), // structure
827  detail::get_or_ignore<field::energy>(record_buffer),
828  detail::get_or_ignore<field::react>(record_buffer),
829  detail::get_or_ignore<field::react_err>(record_buffer),
830  detail::get_or_ignore<field::comment>(record_buffer),
831  detail::get_or_ignore<field::offset>(record_buffer));
832  }
833  else
834  {
835  f.read_structure_record(*secondary_stream,
836  options,
837  detail::get_or_ignore<field::seq>(record_buffer),
838  detail::get_or_ignore<field::id>(record_buffer),
839  detail::get_or_ignore<field::bpp>(record_buffer),
840  detail::get_or_ignore<field::structure>(record_buffer),
841  detail::get_or_ignore<field::energy>(record_buffer),
842  detail::get_or_ignore<field::react>(record_buffer),
843  detail::get_or_ignore<field::react_err>(record_buffer),
844  detail::get_or_ignore<field::comment>(record_buffer),
845  detail::get_or_ignore<field::offset>(record_buffer));
846  }
847  }, format);
848  }
849 
851  friend iterator;
852 };
853 
860 template <input_stream stream_type,
861  structure_file_input_format file_format,
862  detail::fields_specialisation selected_field_ids>
863 structure_file_input(stream_type && stream, file_format const &, selected_field_ids const &)
867 
869 template <input_stream stream_type,
870  structure_file_input_format file_format,
871  detail::fields_specialisation selected_field_ids>
872 structure_file_input(stream_type & stream, file_format const &, selected_field_ids const &)
877 
878 } // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
The protein structure alphabet of the characters "HGIEBTSCX".
Definition: dssp9.hpp:63
The 15 letter RNA alphabet, containing all IUPAC symbols minus the gap.
Definition: rna15.hpp:51
The five letter RNA alphabet of A,C,G,U and the unknown character N.
Definition: rna5.hpp:49
A class for reading structured sequence files, e.g. Stockholm, Connect, Vienna, ViennaRNA bpp matrix ...
Definition: input.hpp:466
structure_file_input_options< typename traits_type::seq_legal_alphabet, selected_field_ids::contains(field::structured_seq)> options
The options are public and its members can be set directly.
Definition: input.hpp:756
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:748
structure_file_input(structure_file_input const &)=delete
Copy construction is explicitly deleted, because you cannot have multiple access to the same file.
typename traits_type::template structure_container< typename traits_type::structure_alphabet > structure_type
The type of the structure field (default std::vector of seqan3::wuss51).
Definition: input.hpp:530
detail::in_file_iterator< structure_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:569
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of the ID field (default std::string).
Definition: input.hpp:522
structure_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
Deduction of the selected fields, the file format and the stream type.
structure_file_input & operator=(structure_file_input const &)=delete
Copy assignment is explicitly deleted, because you cannot have multiple access to the same file.
structure_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:663
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:565
typename traits_type::template seq_container< typename traits_type::seq_alphabet > seq_type
The type of the sequence field (default std::vector of seqan3::rna5).
Definition: input.hpp:520
std::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:573
structure_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:551
typename traits_type::template comment_container< typename traits_type::comment_alphabet > comment_type
The type of the comment field (default std::string).
Definition: input.hpp:541
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:479
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:563
structure_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
structure_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format >>
This is an overloaded member function, provided for convenience. It differs from the above function o...
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:473
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:571
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:475
structure_file_input & operator=(structure_file_input &&)=default
Move assignment is defaulted.
structure_file_input(structure_file_input &&)=default
Move construction is defaulted.
typename traits_type::template structured_seq_container< typename traits_type::template structured_seq_alphabet< typename traits_type::seq_alphabet, typename traits_type::structure_alphabet > > structured_seq_type
The type of the sequence-structure field (default std::vector of structured_rna<rna5,...
Definition: input.hpp:534
structure_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:645
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:720
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:695
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:477
typename traits_type::template bpp_container< typename traits_type::template bpp_queue< typename traits_type::template bpp_item< typename traits_type::bpp_prob, typename traits_type::bpp_partner > >> bpp_type
The type of the base pair probabilities (default std::vector of std::set<std::pair<double,...
Definition: input.hpp:527
typename traits_type::energy_type energy_type
The type of the energy field (default std::optional of double).
Definition: input.hpp:536
typename traits_type::offset_type offset_type
The type of the offset field (default size_t).
Definition: input.hpp:543
structure_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:608
~structure_file_input()=default
Destructor is defaulted.
typename traits_type::template react_container< typename traits_type::react_type > react_type
The type of the reactivity and reactivity error fields (default std::vector of double).
Definition: input.hpp:538
A seqan3::alphabet_tuple_base that joins an aminoacid alphabet with a protein structure alphabet.
Definition: structured_aa.hpp:56
A seqan3::alphabet_tuple_base that joins a nucleotide alphabet with an RNA structure alphabet.
Definition: structured_rna.hpp:57
T data(T... args)
Provides the dssp format for protein structure.
This header includes C++17 filesystem support and imports it into namespace std::filesystem (independ...
Provides the seqan3::format_vienna.
T format(T... args)
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
@ energy
Energy of a folded sequence, represented by one float number.
@ comment
Comment field of arbitrary content, usually a string.
@ structure
Fixed interactions, usually a string of structure alphabet characters.
@ bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
@ react
Reactivity values of the sequence characters given in a vector of float numbers.
@ react_err
Reactivity error values given in a vector corresponding to seqan3::field::react.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ structured_seq
Sequence and fixed interactions combined in one range.
@ id
The identifier, usually a string.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
constexpr bool contains
Whether a type occurs in a type list or not.
Definition: traits.hpp:231
Provides the seqan3::detail::in_file_iterator class template.
Resolves to std::ranges::explicitly_convertible_to<type1, type2>(). <dl class="no-api">This entity i...
A concept that indicates whether an alphabet represents RNA structure.
A more refined container concept than seqan3::container.
The generic concept for structure file input formats.
The requirements a traits_type for seqan3::structure_file_input must meet.
Refines seqan3::alphabet and adds assignability.
Provides exceptions used in the I/O module.
Stream concepts.
T is_base_of_v
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::rna15, container aliases and string literals.
Provides seqan3::rna5, container aliases and string literals.
T size(T... args)
A class template that holds a choice of seqan3::field.
Definition: record.hpp:128
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:235
A traits type that specifies input as amino acids.
Definition: input.hpp:321
The default traits for seqan3::structure_file_input.
Definition: input.hpp:222
wuss51 structure_alphabet
The alphabet for a structure annotation is seqan3::wuss51.
Definition: input.hpp:272
size_t offset_type
The type of the offset is size_t.
Definition: input.hpp:314
double bpp_prob
The type for a base pair probability is double.
Definition: input.hpp:252
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:243
char comment_alphabet
The alphabet for a comment string is char.
Definition: input.hpp:305
double react_type
The type of the reactivity and reactivity error is double.
Definition: input.hpp:296
size_t bpp_partner
The type for the partner position of a base pair probability is size_t.
Definition: input.hpp:255
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
Type that contains multiple types.
Definition: type_list.hpp:29
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_record.
Provides the composite of aminoacid with structure alphabets.
Provides traits for seqan3::type_list.
Adaptations of concepts from the standard library.
T visit(T... args)