SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
sequence_file/input.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <cassert>
13#include <filesystem>
14#include <fstream>
15#include <string>
16#include <variant>
17#include <vector>
18
27#include <seqan3/io/detail/record.hpp>
38
39namespace seqan3
40{
41
42// ----------------------------------------------------------------------------
43// sequence_file_input_traits
44// ----------------------------------------------------------------------------
45
94template <typename t>
95concept sequence_file_input_traits = requires (t v) {
98 requires detail::is_char_adaptation_v<typename t::sequence_alphabet>
101
104
107};
109
110// ----------------------------------------------------------------------------
111// sequence_file_input_default_traits
112// ----------------------------------------------------------------------------
113
130{
138
141
143 template <typename _sequence_alphabet>
145
148
150 template <typename _id_alphabet>
152
155
157 template <typename _quality_alphabet>
159
161};
162
179
180// ----------------------------------------------------------------------------
181// sequence_file_input
182// ----------------------------------------------------------------------------
183
203 detail::type_list_of_sequence_file_input_formats valid_formats_ =
206{
207public:
221
226
// Compile-time validation of the selected fields: the immediately-invoked constexpr lambda
// walks selected_field_ids::as_array and yields false as soon as one entry is not contained
// in field_ids. A false result aborts compilation with the message below.
227 static_assert(
228 []() constexpr
229 {
230 for (field f : selected_field_ids::as_array)
231 if (!field_ids::contains(f))
232 return false;
233 return true;
234 }(),
235 "You selected a field that is not valid for sequence files, please refer to the documentation "
236 "of sequence_file_input::field_ids for the accepted values.");
237
251
256
268 using size_type = size_t;
272 using iterator = detail::in_file_iterator<sequence_file_input>;
276 using sentinel = std::default_sentinel_t;
278
294
313 primary_stream{new std::ifstream{}, stream_deleter_default}
314 {
315 primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
316 static_cast<std::basic_ifstream<char> *>(primary_stream.get())
317 ->open(filename, std::ios_base::in | std::ios::binary);
318
319 if (!primary_stream->good())
320 throw file_open_error{"Could not open file " + filename.string() + " for reading."};
321
322 // possibly add intermediate compression stream
323 secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
324
325 // initialise format handler or throw if format is not found
326 using format_variant_t =
327 typename detail::variant_from_tags<valid_formats, detail::sequence_file_input_format_exposer>::type;
328 format_variant_t format_variant{};
329 detail::set_format(format_variant, filename);
330
332 [&](auto && selected_format)
333 {
334 using format_t = std::remove_cvref_t<decltype(selected_format)>;
335 format = std::make_unique<selected_sequence_format<format_t>>();
336 },
337 format_variant);
338 }
339 /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
340 * A combination of default template parameters and auto-deduction guides works as expected,
341 * independent of whether the second/optional parameter is specified or not, i.e. it is possible
342 * to auto-deduce and overwrite a single template parameter out of the four if the optional parameter
343 * is specified and use the default otherwise.
344 */
345
360 template <input_stream stream_t, sequence_file_input_format file_format>
361 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
365 primary_stream{&stream, stream_deleter_noop},
366 format{std::make_unique<selected_sequence_format<file_format>>()}
367 {
368 static_assert(list_traits::contains<file_format, valid_formats>,
369 "You selected a format that is not in the valid_formats of this file.");
370
371 // possibly add intermediate compression stream
372 secondary_stream = detail::make_secondary_istream(*primary_stream);
373 }
374
376 template <input_stream stream_t, sequence_file_input_format file_format>
377 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
381 primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
382 format{std::make_unique<selected_sequence_format<file_format>>()}
383 {
384 static_assert(list_traits::contains<file_format, valid_formats>,
385 "You selected a format that is not in the valid_formats of this file.");
386
387 // possibly add intermediate compression stream
388 secondary_stream = detail::make_secondary_istream(*primary_stream);
389 }
391
411 {
412 // buffer first record
413 if (!first_record_was_read)
414 {
415 read_next_record();
416 first_record_was_read = true;
417 }
418
419 return {*this};
420 }
421
436 {
437 return {};
438 }
439
464 {
465 return *begin();
466 }
468
473
474protected:
476
480 record_type record_buffer;
482 std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
484 std::streampos position_buffer{};
486
// Deleter that deliberately does nothing: the pointer parameter is unnamed and never used.
// In this listing it is the initial deleter of primary_stream and secondary_stream, and it is
// the one passed when a stream pointer is built from the address of a caller-provided stream
// (`&stream`), so that stream is not destroyed through the pointer.
494 static void stream_deleter_noop(std::basic_istream<stream_char_type> *)
495 {}
// Deleter that destroys the pointed-to stream with `delete`.
// In this listing it is paired with streams allocated via `new` (`new std::ifstream{}` and
// `new stream_t{std::move(stream)}`).
497 static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr)
498 {
499 delete ptr;
500 }
501
503 stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
505 stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
506
508 bool first_record_was_read{false};
510 bool at_end{false};
512
513private:
515 void read_next_record()
516 {
517 // clear the record
518 record_buffer.clear();
519
520 // at end if we could not read further
521 if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream}
523 {
524 at_end = true;
525 return;
526 }
527
528 format->read_sequence_record(*secondary_stream, record_buffer, position_buffer, options);
529 }
530
541 struct sequence_format_base
542 {
546 sequence_format_base() = default;
547 sequence_format_base(sequence_format_base const &) = default;
548 sequence_format_base(sequence_format_base &&) = default;
549 sequence_format_base & operator=(sequence_format_base const &) = default;
550 sequence_format_base & operator=(sequence_format_base &&) = default;
551 virtual ~sequence_format_base() = default;
553
565 virtual void read_sequence_record(std::istream & instream,
566 record_type & record_buffer,
567 std::streampos & position_buffer,
569 };
570
582 template <typename format_t>
583 struct selected_sequence_format final : public sequence_format_base
584 {
588 selected_sequence_format() = default;
589 selected_sequence_format(selected_sequence_format const &) = default;
590 selected_sequence_format(selected_sequence_format &&) = default;
591 selected_sequence_format & operator=(selected_sequence_format const &) = default;
592 selected_sequence_format & operator=(selected_sequence_format &&) = default;
593 ~selected_sequence_format() = default;
595
597 void read_sequence_record(std::istream & instream,
598 record_type & record_buffer,
599 std::streampos & position_buffer,
601 {
602 // read new record
603 {
604 _format.read_sequence_record(instream,
605 options,
606 position_buffer,
607 detail::get_or_ignore<field::seq>(record_buffer),
608 detail::get_or_ignore<field::id>(record_buffer),
609 detail::get_or_ignore<field::qual>(record_buffer));
610 }
611 }
612
614 detail::sequence_file_input_format_exposer<format_t> _format{};
615 };
616
619
621 friend iterator;
622};
623
630template <input_stream stream_type, sequence_file_input_format file_format>
632 file_format const &)
634 typename sequence_file_input<>::selected_field_ids, // default field ids.
636
638template <input_stream stream_type, sequence_file_input_format file_format>
640 file_format const &)
642 typename sequence_file_input<>::selected_field_ids, // default field ids.
644
646template <input_stream stream_type,
648 detail::fields_specialisation selected_field_ids>
650 file_format const &,
651 selected_field_ids const &)
655
657template <input_stream stream_type,
659 detail::fields_specialisation selected_field_ids>
661 file_format const &,
662 selected_field_ids const &)
667
668} // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition aa27.hpp:43
A "pretty printer" for most SeqAn data structures and related types.
Definition debug_stream_type.hpp:79
debug_stream_type()=default
Defaulted.
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition dna15.hpp:48
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition dna5.hpp:48
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition phred42.hpp:44
The generic concept for sequence file input formats.
Definition sequence_file/input_format_concept.hpp:91
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition sequence_file/input.hpp:206
std::default_sentinel_t sentinel
The type returned by end().
Definition sequence_file/input.hpp:276
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition sequence_file/input.hpp:311
reference front() noexcept
Return the record we are currently at in the file.
Definition sequence_file/input.hpp:463
iterator begin()
Returns an iterator to current position in the file.
Definition sequence_file/input.hpp:410
sequence_file_input_options_type options
The options are public and its members can be set directly.
Definition sequence_file/input.hpp:472
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition sequence_file/input.hpp:244
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition sequence_file/input.hpp:435
char stream_char_type
Character type of the stream(s).
Definition sequence_file/input.hpp:219
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
size_t size_type
An unsigned integer type, usually std::size_t.
Definition sequence_file/input.hpp:268
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition sequence_file/input.hpp:272
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition sequence_file/input.hpp:246
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream and the format.
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition sequence_file/input.hpp:378
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
sequence_file_input_options< typename traits_type::sequence_legal_alphabet > sequence_file_input_options_type
The input file options type.
Definition sequence_file/input.hpp:470
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream, the format and the field ids.
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition sequence_file/input.hpp:248
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition sequence_file/input.hpp:362
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition sequence_file/input.hpp:254
T data(T... args)
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
Provides the seqan3::sequence_file_format_genbank class.
Provides the seqan3::format_sam.
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition record.hpp:60
Provides the seqan3::detail::in_file_iterator class template.
Checks whether from can be explicitly converted to to.
A more refined container concept than seqan3::container.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
SeqAn specific customisations in the standard namespace.
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
T size(T... args)
Thrown if there is an unspecified filesystem or stream error while opening, e.g. permission problem.
Definition io/exception.hpp:36
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >().as_base())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition record.hpp:242
A traits type that specifies input as amino acids.
Definition sequence_file/input.hpp:166
The default traits for seqan3::sequence_file_input.
Definition sequence_file/input.hpp:130
Provides traits for seqan3::type_list.
T visit(T... args)
Hide me