SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
structure_file/input.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <cassert>
13#include <filesystem>
14#include <fstream>
15#include <limits>
16#include <optional>
17#include <string>
18#include <type_traits>
19#include <utility>
20#include <variant>
21#include <vector>
22
31#include <seqan3/io/detail/record.hpp>
40
41namespace seqan3
42{
43// ----------------------------------------------------------------------------
44// structure_file_input_traits
45// ----------------------------------------------------------------------------
46
138template <typename t>
139concept structure_file_input_traits = requires (t v) {
140 // TODO(joergi-w) The expensive concept checks are currently omitted. Check again when compiler has improved.
141 // sequence
146
147 // id
150
151 // bpp
152 requires std::is_floating_point_v<typename t::bpp_prob>;
154
155 // requires container // TODO check Associative container Concept when implemented
156 // <typename t::template bpp_queue
157 // <typename t::template bpp_item
158 // <typename t::bpp_prob, typename t::bpp_partner>>>
159 // && requires(typename t::template bpp_queue // TODO maybe implement also a version that allows emplace_back
160 // <typename t::template bpp_item
161 // <typename t::bpp_prob, typename t::bpp_partner>> value) { value.emplace(1.0, 1); };
162 // requires sequence_container
163 // <typename t::template bpp_container
164 // <typename t::template bpp_queue
165 // <typename t::template bpp_item
166 // <typename t::bpp_prob, typename t::bpp_partner>>>>;
167
168 // structure
169 requires std::is_same_v<typename t::structure_alphabet, dssp9> // TODO(joergi-w) add aa_structure_concept
172
173 // structured sequence: tuple composites of seq and structure
174 requires std::is_base_of_v<
175 alphabet_tuple_base<
176 typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>,
177 typename t::seq_alphabet,
178 typename t::structure_alphabet>,
179 typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>>;
180 // requires sequence_container
181 // <typename t::template structured_seq_container
182 // <typename t::template structured_seq_alphabet
183 // <typename t::seq_alphabet, typename t::structure_alphabet>>>;
184
185 // energy: std::optional of floating point number
186 requires std::is_floating_point_v<typename t::energy_type::value_type>;
187
188 // reactivity [error]
189 requires std::is_floating_point_v<typename t::react_type>;
191
192 // comment
195
196 // offset
198};
200
201// ----------------------------------------------------------------------------
202// structure_file_input_default_traits
203// ----------------------------------------------------------------------------
204
219{
225 // sequence
226
229
232
234 template <typename _seq_alphabet>
236
237 // id
238
240 using id_alphabet = char;
241
243 template <typename _id_alphabet>
245
246 // base pair probability structure
247
249 using bpp_prob = double;
250
252 using bpp_partner = size_t;
253
255 template <typename _bpp_prob, typename _bpp_partner>
257
259 template <typename _bpp_item>
261
263 template <typename _bpp_queue>
265
266 // fixed structure
267
269 using structure_alphabet = wuss51;
270
272 template <typename _structure_alphabet>
274
275 // combined sequence and structure
276
278 template <typename _seq_alphabet, typename _structure_alphabet>
280
282 template <typename _structured_seq_alphabet>
284
285 // energy
286
289
290 // reactivity [error]
291
293 using react_type = double;
294
296 template <typename _react_type>
298
299 // comment
300
302 using comment_alphabet = char;
303
305 template <typename _comment_alphabet>
307
308 // offset
309
311 using offset_type = size_t;
313};
314
335
336// ----------------------------------------------------------------------------
337// structure_file_input
338// ----------------------------------------------------------------------------
339
355 detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::structure>,
356 detail::type_list_of_structure_file_input_formats valid_formats_ = type_list<format_vienna>>
358{
359public:
365 using traits_type = traits_type_;
367 using selected_field_ids = selected_field_ids_;
369 using valid_formats = valid_formats_;
371 using stream_char_type = char;
373
378 field::id,
387
// Compile-time check: every field listed in selected_field_ids must be contained in field_ids.
// The immediately-invoked constexpr lambda returns false at the first field that is not contained.
388 static_assert(
389 []() constexpr
390 {
391 for (field f : selected_field_ids::as_array)
392 if (!field_ids::contains(f))
393 return false;
394 return true;
395 }(),
396 "You selected a field that is not valid for structure files, please refer to the documentation "
397 "of structure_file_input::field_ids for the accepted values.");
398
// Compile-time check: field::structured_seq is mutually exclusive with field::seq and field::structure.
// The assertion fails if structured_seq is selected together with at least one of the other two.
399 static_assert(
400 []() constexpr
401 {
402 return !(selected_field_ids::contains(field::structured_seq)
403 && (selected_field_ids::contains(field::seq) || (selected_field_ids::contains(field::structure))));
404 }(),
405 "You may not select field::structured_seq and either of field::seq and field::structure "
406 "at the same time.");
407
414 using seq_type = typename traits_type::template seq_container<typename traits_type::seq_alphabet>;
416 using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>;
418 using bpp_type = typename traits_type::template bpp_container<typename traits_type::template bpp_queue<
419 typename traits_type::template bpp_item<typename traits_type::bpp_prob, typename traits_type::bpp_partner>>>;
421 using structure_type = typename traits_type::template structure_container<typename traits_type::structure_alphabet>;
423 using structured_seq_type = typename traits_type::template structured_seq_container<
424 typename traits_type::template structured_seq_alphabet<typename traits_type::seq_alphabet,
425 typename traits_type::structure_alphabet>>;
427 using energy_type = typename traits_type::energy_type;
429 using react_type = typename traits_type::template react_container<typename traits_type::react_type>;
431 using comment_type = typename traits_type::template comment_container<typename traits_type::comment_alphabet>;
433 using offset_type = typename traits_type::offset_type;
434
437 id_type,
438 bpp_type,
446
451
461 using const_reference = void;
463 using size_type = size_t;
467 using iterator = detail::in_file_iterator<structure_file_input>;
469 using const_iterator = void;
471 using sentinel = std::default_sentinel_t;
473
489
507 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
508 primary_stream{new std::ifstream{}, stream_deleter_default}
509 {
510 primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
511 static_cast<std::basic_ifstream<char> *>(primary_stream.get())
512 ->open(filename, std::ios_base::in | std::ios::binary);
513
514 if (!primary_stream->good())
515 throw file_open_error{"Could not open file " + filename.string() + " for reading."};
516
517 // possibly add intermediate decompression stream
518 secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
519
520 // initialise format handler
521 detail::set_format(format, filename);
522 }
523
// Construct from an existing stream and with specified format.
// The stream is only referenced: primary_stream stores its address together with stream_deleter_noop,
// so this object never deletes the caller's stream.
// The format is fixed by the file_format tag type, which must be one of valid_formats (checked below).
539 template <input_stream stream_t, structure_file_input_format file_format>
540 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
541 structure_file_input(stream_t & stream,
542 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
543 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
544 primary_stream{&stream, stream_deleter_noop},
545 format{detail::structure_file_input_format_exposer<file_format>{}}
546 {
547 static_assert(list_traits::contains<file_format, valid_formats>,
548 "You selected a format that is not in the valid_formats of this file.");
549
550 // possibly add intermediate decompression stream
551 secondary_stream = detail::make_secondary_istream(*primary_stream);
552 }
553
// Construct from a stream taken as `stream_t &&` and with specified format.
// The stream is moved into a newly allocated stream_t that primary_stream owns; it is released
// through stream_deleter_default.
// The format is fixed by the file_format tag type, which must be one of valid_formats (checked below).
555 template <input_stream stream_t, structure_file_input_format file_format>
556 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
557 structure_file_input(stream_t && stream,
558 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
559 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
560 primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
561 format{detail::structure_file_input_format_exposer<file_format>{}}
562 {
563 static_assert(list_traits::contains<file_format, valid_formats>,
564 "You selected a format that is not in the valid_formats of this file.");
565
566 // possibly add intermediate decompression stream
567 secondary_stream = detail::make_secondary_istream(*primary_stream);
568 }
570
590 {
591 // buffer first record
592 if (!first_record_was_read)
593 {
594 read_next_record();
595 first_record_was_read = true;
596 }
597
598 return {*this};
599 }
600
// Returns a sentinel for comparison with iterator.
// The sentinel type is std::default_sentinel_t; it is value-initialised here.
614 sentinel end() noexcept
615 {
616 return {};
617 }
618
// Return the record we are currently at in the file.
// Implemented as *begin(); begin() reads and buffers the first record if that has not happened yet.
// NOTE(review): front() is declared noexcept while begin() is not; if reading the first record
// can throw, the exception would end in std::terminate here - confirm this is intended.
642 reference front() noexcept
643 {
644 return *begin();
645 }
647
649 structure_file_input_options<typename traits_type::seq_legal_alphabet,
650 selected_field_ids::contains(field::structured_seq)>
652
653protected:
655
659 record_type record_buffer;
661 std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
663 std::streampos position_buffer{};
665
// Stream deleter that does nothing.
// Used for streams this object does not own (a stream passed in by lvalue reference) and as the
// initial deleter of primary_stream and secondary_stream.
673 static void stream_deleter_noop(std::basic_istream<stream_char_type> *)
674 {}
// Stream deleter that deletes the pointed-to stream.
// Used for streams that this object allocated itself with new.
676 static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr)
677 {
678 delete ptr;
679 }
680
682 stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
684 stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
685
687 bool first_record_was_read{false};
689 bool at_end{false};
690
692 using format_type =
693 typename detail::variant_from_tags<valid_formats, detail::structure_file_input_format_exposer>::type;
695 format_type format;
697
699 void read_next_record()
700 {
701 // clear the record
702 record_buffer.clear();
703
704 // store the current position in the position buffer
705 position_buffer = secondary_stream->tellg();
706
707 // at end if we could not read further
708 if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream}
710 {
711 at_end = true;
712 return;
713 }
714
715 assert(!format.valueless_by_exception());
717 [&](auto & f)
718 {
719 // read new record
720 if constexpr (selected_field_ids::contains(field::structured_seq))
721 {
722 static_assert(!selected_field_ids::contains(field::structure),
723 "You may not select field::structured_seq and field::structure at the same time.");
724 static_assert(!selected_field_ids::contains(field::seq),
725 "You may not select field::structured_seq and field::seq at the same time.");
726 f.read_structure_record(*secondary_stream,
727 options,
728 detail::get_or_ignore<field::structured_seq>(record_buffer), // seq
729 detail::get_or_ignore<field::id>(record_buffer),
730 detail::get_or_ignore<field::bpp>(record_buffer),
731 detail::get_or_ignore<field::structured_seq>(record_buffer), // structure
732 detail::get_or_ignore<field::energy>(record_buffer),
733 detail::get_or_ignore<field::react>(record_buffer),
734 detail::get_or_ignore<field::react_err>(record_buffer),
735 detail::get_or_ignore<field::comment>(record_buffer),
736 detail::get_or_ignore<field::offset>(record_buffer));
737 }
738 else
739 {
740 f.read_structure_record(*secondary_stream,
741 options,
742 detail::get_or_ignore<field::seq>(record_buffer),
743 detail::get_or_ignore<field::id>(record_buffer),
744 detail::get_or_ignore<field::bpp>(record_buffer),
745 detail::get_or_ignore<field::structure>(record_buffer),
746 detail::get_or_ignore<field::energy>(record_buffer),
747 detail::get_or_ignore<field::react>(record_buffer),
748 detail::get_or_ignore<field::react_err>(record_buffer),
749 detail::get_or_ignore<field::comment>(record_buffer),
750 detail::get_or_ignore<field::offset>(record_buffer));
751 }
752 },
753 format);
754 }
755
757 friend iterator;
758};
759
766template <input_stream stream_type,
767 structure_file_input_format file_format,
768 detail::fields_specialisation selected_field_ids>
769structure_file_input(stream_type && stream, file_format const &, selected_field_ids const &)
773
775template <input_stream stream_type,
776 structure_file_input_format file_format,
777 detail::fields_specialisation selected_field_ids>
778structure_file_input(stream_type & stream, file_format const &, selected_field_ids const &)
783
784} // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition aa27.hpp:43
The protein structure alphabet of the characters "HGIEBTSCX".
Definition dssp9.hpp:59
The 15 letter RNA alphabet, containing all IUPAC symbols minus the gap.
Definition rna15.hpp:48
The five letter RNA alphabet of A,C,G,U and the unknown character N.
Definition rna5.hpp:46
The generic concept for structure file input formats.
Definition structure_file/input_format_concept.hpp:138
A class for reading structured sequence files, e.g. Stockholm, Connect, Vienna, ViennaRNA bpp matrix ...
Definition structure_file/input.hpp:358
structure_file_input_options< typename traits_type::seq_legal_alphabet, selected_field_ids::contains(field::structured_seq)> options
The options are public and its members can be set directly.
Definition structure_file/input.hpp:651
reference front() noexcept
Return the record we are currently at in the file.
Definition structure_file/input.hpp:642
structure_file_input(structure_file_input const &)=delete
Copy construction is explicitly deleted, because you cannot have multiple access to the same file.
detail::in_file_iterator< structure_file_input > iterator
The iterator type of this view (an input iterator).
Definition structure_file/input.hpp:467
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of the ID field (default std::string).
Definition structure_file/input.hpp:416
size_t size_type
An unsigned integer type, usually std::size_t.
Definition structure_file/input.hpp:463
typename traits_type::template seq_container< typename traits_type::seq_alphabet > seq_type
The type of the sequence field (default std::vector of seqan3::rna5).
Definition structure_file/input.hpp:414
typename traits_type::template comment_container< typename traits_type::comment_alphabet > comment_type
The type of the comment field (default double).
Definition structure_file/input.hpp:431
std::default_sentinel_t sentinel
The type returned by end().
Definition structure_file/input.hpp:471
structure_file_input & operator=(structure_file_input const &)=delete
Copy assignment is explicitly deleted, because you cannot have multiple access to the same file.
structure_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition structure_file/input.hpp:541
structure_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field types.
Definition structure_file/input.hpp:449
structure_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
Deduction of the selected fields, the file format and the stream type.
structure_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition structure_file/input.hpp:557
char stream_char_type
Character type of the stream(s).
Definition structure_file/input.hpp:371
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition structure_file/input.hpp:461
structure_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition structure_file/input.hpp:365
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition structure_file/input.hpp:469
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition structure_file/input.hpp:367
structure_file_input(structure_file_input &&)=default
Move construction is defaulted.
structure_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
typename traits_type::template structure_container< typename traits_type::structure_alphabet > structure_type
The type of the structure field (default std::vector of seqan3::wuss51).
Definition structure_file/input.hpp:421
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition structure_file/input.hpp:614
structure_file_input & operator=(structure_file_input &&)=default
Move assignment is defaulted.
iterator begin()
Returns an iterator to current position in the file.
Definition structure_file/input.hpp:589
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition structure_file/input.hpp:369
typename traits_type::energy_type energy_type
The type of the energy field (default std::optional<double>).
Definition structure_file/input.hpp:427
typename traits_type::offset_type offset_type
The type of the offset field (default size_t).
Definition structure_file/input.hpp:433
typename traits_type::template structured_seq_container< typename traits_type::template structured_seq_alphabet< typename traits_type::seq_alphabet, typename traits_type::structure_alphabet > > structured_seq_type
The type of the sequence-structure field (default std::vector of structured_rna<rna5, wuss51>).
Definition structure_file/input.hpp:425
typename traits_type::template bpp_container< typename traits_type::template bpp_queue< typename traits_type::template bpp_item< typename traits_type::bpp_prob, typename traits_type::bpp_partner > > > bpp_type
The type of the base pair probabilities (default std::vector of std::set<std::pair<double, size_t>>).
Definition structure_file/input.hpp:419
structure_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition structure_file/input.hpp:506
~structure_file_input()=default
Destructor is defaulted.
typename traits_type::template react_container< typename traits_type::react_type > react_type
The type of the reactivity and reactivity error fields (default std::vector of double).
Definition structure_file/input.hpp:429
A seqan3::alphabet_tuple_base that joins an aminoacid alphabet with a protein structure alphabet.
Definition structured_aa.hpp:52
A seqan3::alphabet_tuple_base that joins a nucleotide alphabet with an RNA structure alphabet.
Definition structured_rna.hpp:53
T data(T... args)
Provides the dssp format for protein structure.
Provides the seqan3::format_vienna.
T format(T... args)
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition record.hpp:60
@ energy
Energy of a folded sequence, represented by one float number.
@ comment
Comment field of arbitrary content, usually a string.
@ structure
Fixed interactions, usually a string of structure alphabet characters.
@ bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
@ react
Reactivity values of the sequence characters given in a vector of float numbers.
@ react_err
Reactivity error values given in a vector corresponding to seqan3::field::react.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ structured_seq
Sequence and fixed interactions combined in one range.
@ id
The identifier, usually a string.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
Provides the seqan3::detail::in_file_iterator class template.
Checks whether from can be explicitly converted to to.
A concept that indicates whether an alphabet represents RNA structure.
A more refined container concept than seqan3::container.
The requirements a traits_type for seqan3::structure_file_input must meet.
Refines seqan3::alphabet and adds assignability.
Provides exceptions used in the I/O module.
Stream concepts.
T is_base_of_v
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
SeqAn specific customisations in the standard namespace.
Provides seqan3::rna15, container aliases and string literals.
Provides seqan3::rna5, container aliases and string literals.
T size(T... args)
A class template that holds a choice of seqan3::field.
Definition record.hpp:125
Thrown if there is an unspecified filesystem or stream error while opening, e.g. permission problem.
Definition io/exception.hpp:36
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >().as_base())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition record.hpp:242
A traits type that specifies input as amino acids.
Definition structure_file/input.hpp:318
The default traits for seqan3::structure_file_input.
Definition structure_file/input.hpp:219
wuss51 structure_alphabet
The alphabet for a structure annotation is seqan3::phred42.
Definition structure_file/input.hpp:269
size_t offset_type
The type of the offset is size_t.
Definition structure_file/input.hpp:311
double bpp_prob
The type for a base pair probability is double.
Definition structure_file/input.hpp:249
char id_alphabet
The alphabet for an identifier string is char.
Definition structure_file/input.hpp:240
char comment_alphabet
The alphabet for a comment string is char.
Definition structure_file/input.hpp:302
double react_type
The type of the reactivity and reactivity error is double.
Definition structure_file/input.hpp:293
size_t bpp_partner
The type for the partner position of a base pair probability is size_t.
Definition structure_file/input.hpp:252
The options type defines various option members that influence the behaviour of all or some formats.
Definition structure_file/input_options.hpp:27
Type that contains multiple types.
Definition type_list.hpp:26
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_record.
Provides the composite of aminoacid with structure alphabets.
Provides traits for seqan3::type_list.
Adaptations of concepts from the standard library.
T visit(T... args)
Hide me