SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
input.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <cassert>
16#include <seqan3/std/filesystem>
17#include <fstream>
18#include <string>
19#include <variant>
20#include <vector>
21
30#include <seqan3/io/detail/record.hpp>
41
42namespace seqan3
43{
44
45// ----------------------------------------------------------------------------
46// sequence_file_input_traits
47// ----------------------------------------------------------------------------
48
97template <typename t>
98SEQAN3_CONCEPT sequence_file_input_traits = requires (t v)
99{
104
107
110};
112
113// ----------------------------------------------------------------------------
114// sequence_file_input_default_traits
115// ----------------------------------------------------------------------------
116
133{
141
144
146 template <typename _sequence_alphabet>
148
150 using id_alphabet = char;
151
153 template <typename _id_alphabet>
155
158
160 template <typename _quality_alphabet>
162
164};
165
169{
177
181};
182
183// ----------------------------------------------------------------------------
184// sequence_file_input
185// ----------------------------------------------------------------------------
186
204template <
206 detail::fields_specialisation selected_field_ids_ = fields<field::seq, field::id, field::qual>,
207 detail::type_list_of_sequence_file_input_formats valid_formats_ = type_list<format_embl,
211 format_sam>>
213{
214public:
220 using traits_type = traits_type_;
222 using selected_field_ids = selected_field_ids_;
224 using valid_formats = valid_formats_;
226 using stream_char_type = char;
228
233
// Compile-time validation of the selected fields: an immediately-invoked constexpr lambda
// walks selected_field_ids::as_array and yields false as soon as one entry is not contained
// in field_ids. A false result makes the static_assert fire with the message below.
234 static_assert([] () constexpr
235 {
236 for (field f : selected_field_ids::as_array)
237 if (!field_ids::contains(f))
238 return false;
239 return true;
240 }(),
241 "You selected a field that is not valid for sequence files, please refer to the documentation "
242 "of sequence_file_input::field_ids for the accepted values.");
243
250 using sequence_type = typename traits_type::template sequence_container<
251 typename traits_type::sequence_alphabet>;
253 using id_type = typename traits_type::template id_container<
254 typename traits_type::id_alphabet>;
256 using quality_type = typename traits_type::template quality_container<
257 typename traits_type::quality_alphabet>;
260
262 using record_type = sequence_record<detail::select_types_with_ids_t<field_types,
263 field_ids,
267
277 using const_reference = void;
279 using size_type = size_t;
283 using iterator = detail::in_file_iterator<sequence_file_input>;
285 using const_iterator = void;
287 using sentinel = std::default_sentinel_t;
289
305
323 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
324 primary_stream{new std::ifstream{}, stream_deleter_default}
325 {
326 primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
327 static_cast<std::basic_ifstream<char> *>(primary_stream.get())->open(filename,
328 std::ios_base::in | std::ios::binary);
329
330 if (!primary_stream->good())
331 throw file_open_error{"Could not open file " + filename.string() + " for reading."};
332
333 // possibly add intermediate compression stream
334 secondary_stream = detail::make_secondary_istream(*primary_stream, filename);
335
336 // initialise format handler or throw if format is not found
337 using format_variant_t = typename detail::variant_from_tags<valid_formats,
338 detail::sequence_file_input_format_exposer>::type;
339 format_variant_t format_variant{};
340 detail::set_format(format_variant, filename);
341
342 std::visit([&] (auto && selected_format)
343 {
344 using format_t = std::remove_cvref_t<decltype(selected_format)>;
345 format = std::make_unique<selected_sequence_format<format_t>>();
346 }, format_variant);
347 }
348 /* NOTE(h-2): Curiously we do not need a user-defined deduction guide for the above constructor.
349 * A combination of default template parameters and implicitly generated deduction guides works as expected,
350 * independent of whether the second/optional parameter is specified or not, i.e. it is possible
351 * to auto-deduce and override a single template parameter out of the four if the optional parameter
352 * is specified and use the default otherwise.
353 */
354
// Construct from an existing stream and with specified format.
//
// The stream is not owned by this object: primary_stream stores the address of `stream`
// together with stream_deleter_noop, so nothing is deleted when primary_stream is released.
// NOTE(review): the caller presumably has to keep `stream` alive for as long as this file
// object is used -- confirm and document at the call sites.
//
// The format handler is fixed at compile time to selected_sequence_format<file_format>.
// `format_tag` and `fields_tag` are wrapped in SEQAN3_DOXYGEN_ONLY and are not referenced in
// the body; they appear to serve only for type selection -- TODO confirm against the macro.
//
// Constraint: the stream's char_type must be the same as stream_char_type.
369 template <input_stream stream_t,
370 sequence_file_input_format file_format>
372 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
374 sequence_file_input(stream_t & stream,
375 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
376 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
377 primary_stream{&stream, stream_deleter_noop},
378 format{std::make_unique<selected_sequence_format<file_format>>()}
379 {
// Reject at compile time a format tag that is not listed in valid_formats.
380 static_assert(list_traits::contains<file_format, valid_formats>,
381 "You selected a format that is not in the valid_formats of this file.");
382
383 // possibly add intermediate compression stream
// Only the stream is passed here (no filename argument).
384 secondary_stream = detail::make_secondary_istream(*primary_stream);
385 }
386
// Overload of the stream constructor for a stream passed as `stream_t &&`.
//
// The stream is moved into a newly allocated stream_t; primary_stream takes that pointer
// together with stream_deleter_default, which destroys it with `delete`.
// NOTE(review): `stream_t &&` is a forwarding reference; presumably lvalue arguments are
// taken by the `stream_t &` overload so that stream_t is never a reference type here -- confirm.
//
// The format handler is fixed at compile time to selected_sequence_format<file_format>.
// Constraint: the stream's char_type must be the same as stream_char_type.
388 template <input_stream stream_t,
389 sequence_file_input_format file_format>
391 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, stream_char_type>
393 sequence_file_input(stream_t && stream,
394 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
395 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
396 primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
397 format{std::make_unique<selected_sequence_format<file_format>>()}
398 {
// Reject at compile time a format tag that is not listed in valid_formats.
399 static_assert(list_traits::contains<file_format, valid_formats>,
400 "You selected a format that is not in the valid_formats of this file.");
401
402 // possibly add intermediate compression stream
// Only the stream is passed here (no filename argument).
403 secondary_stream = detail::make_secondary_istream(*primary_stream);
404 }
406
426 {
427 // buffer first record
428 if (!first_record_was_read)
429 {
430 read_next_record();
431 first_record_was_read = true;
432 }
433
434 return {*this};
435 }
436
// Returns a sentinel for comparison with an iterator of this file.
// The result is a value-initialised `sentinel` (an alias for std::default_sentinel_t);
// no member of the file object is read or modified.
450 sentinel end() noexcept
451 {
452 return {};
453 }
454
// Returns the record the file is currently at, obtained by dereferencing begin().
// NOTE(review): this function is declared noexcept, but begin() calls read_next_record()
// the first time it is used, and read_next_record() is not declared noexcept. If reading or
// parsing can throw, the exception would end in std::terminate here -- confirm this is intended.
478 reference front() noexcept
479 {
480 return *begin();
481 }
483
488
489protected:
491
495 record_type record_buffer;
497 std::vector<char> stream_buffer{std::vector<char>(1'000'000)};
499
// Deleter that intentionally does nothing; used where the stream pointer must not be
// destroyed (a stream supplied by reference, or an empty pointer).
507 static void stream_deleter_noop(std::basic_istream<stream_char_type> *) {}
// Deleter for streams allocated with `new` by this class: destroys the stream via `delete ptr`.
509 static void stream_deleter_default(std::basic_istream<stream_char_type> * ptr) { delete ptr; }
510
512 stream_ptr_t primary_stream{nullptr, stream_deleter_noop};
514 stream_ptr_t secondary_stream{nullptr, stream_deleter_noop};
515
517 bool first_record_was_read{false};
519 bool at_end{false};
521
522private:
524 void read_next_record()
525 {
526 // clear the record
527 record_buffer.clear();
528
529 // at end if we could not read further
530 if ((std::istreambuf_iterator<stream_char_type>{*secondary_stream} ==
532 {
533 at_end = true;
534 return;
535 }
536
537 format->read_sequence_record(*secondary_stream, record_buffer, options);
538 }
539
550 struct sequence_format_base
551 {
555 sequence_format_base() = default;
556 sequence_format_base(sequence_format_base const &) = default;
557 sequence_format_base(sequence_format_base &&) = default;
558 sequence_format_base & operator=(sequence_format_base const &) = default;
559 sequence_format_base & operator=(sequence_format_base &&) = default;
560 virtual ~sequence_format_base() = default;
562
573 virtual void read_sequence_record(std::istream & instream,
574 record_type & record_buffer,
576 };
577
589 template <typename format_t>
590 struct selected_sequence_format final : public sequence_format_base
591 {
595 selected_sequence_format() = default;
596 selected_sequence_format(selected_sequence_format const &) = default;
597 selected_sequence_format(selected_sequence_format &&) = default;
598 selected_sequence_format & operator=(selected_sequence_format const &) = default;
599 selected_sequence_format & operator=(selected_sequence_format &&) = default;
600 ~selected_sequence_format() = default;
602
604 void read_sequence_record(std::istream & instream,
605 record_type & record_buffer,
607 {
608 // read new record
609 {
610 _format.read_sequence_record(instream,
611 options,
612 detail::get_or_ignore<field::seq>(record_buffer),
613 detail::get_or_ignore<field::id>(record_buffer),
614 detail::get_or_ignore<field::qual>(record_buffer));
615 }
616 };
617
619 detail::sequence_file_input_format_exposer<format_t> _format{};
620 };
621
624
626 friend iterator;
627};
628
635template <input_stream stream_type,
636 sequence_file_input_format file_format>
637sequence_file_input(stream_type & stream,
638 file_format const &)
640 typename sequence_file_input<>::selected_field_ids, // default field ids.
642
644template <input_stream stream_type,
645 sequence_file_input_format file_format>
646sequence_file_input(stream_type && stream,
647 file_format const &)
649 typename sequence_file_input<>::selected_field_ids, // default field ids.
651
653template <input_stream stream_type,
654 sequence_file_input_format file_format,
655 detail::fields_specialisation selected_field_ids>
656sequence_file_input(stream_type && stream,
657 file_format const &,
658 selected_field_ids const &)
662
664template <input_stream stream_type,
665 sequence_file_input_format file_format,
666 detail::fields_specialisation selected_field_ids>
667sequence_file_input(stream_type & stream,
668 file_format const &,
669 selected_field_ids const &)
674
675} // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
The 15 letter DNA alphabet, containing all IUPAC symbols minus the gap.
Definition: dna15.hpp:51
The five letter DNA alphabet of A,C,G,T and the unknown character N.
Definition: dna5.hpp:51
The EMBL format.
Definition: format_embl.hpp:74
The FastA format.
Definition: format_fasta.hpp:80
The FastQ format.
Definition: format_fastq.hpp:79
The GenBank format.
Definition: format_genbank.hpp:73
The SAM format (tag).
Definition: format_sam.hpp:117
Quality type for traditional Sanger and modern Illumina Phred scores.
Definition: phred42.hpp:47
A class for reading sequence files, e.g. FASTA, FASTQ ...
Definition: input.hpp:213
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of field::id (std::string by default).
Definition: input.hpp:254
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: input.hpp:277
std::default_sentinel_t sentinel
The type returned by end().
Definition: input.hpp:287
sequence_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: input.hpp:322
reference front() noexcept
Return the record we are currently at in the file.
Definition: input.hpp:478
type_list< sequence_type, id_type, quality_type > field_types
The previously defined types aggregated in a seqan3::type_list.
Definition: input.hpp:259
fields< field::seq, field::id, field::qual > field_ids
The subset of seqan3::field IDs that are valid for this file; order corresponds to the types in field...
Definition: input.hpp:232
iterator begin()
Returns an iterator to current position in the file.
Definition: input.hpp:425
sequence_file_input_options_type options
The options object is public and its members can be set directly.
Definition: input.hpp:487
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: input.hpp:450
char stream_char_type
Character type of the stream(s).
Definition: input.hpp:226
sequence_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
sequence_file_input(stream_type &&stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: input.hpp:279
sequence_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: input.hpp:374
sequence_file_input(sequence_file_input const &)=delete
Copy construction is explicitly deleted, because you can't have multiple access to the same file.
detail::in_file_iterator< sequence_file_input > iterator
The iterator type of this view (an input iterator).
Definition: input.hpp:283
sequence_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: input.hpp:393
~sequence_file_input()=default
Destructor is defaulted.
sequence_file_input & operator=(sequence_file_input &&)=default
Move assignment is defaulted.
sequence_file_input(sequence_file_input &&)=default
Move construction is defaulted.
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: input.hpp:285
sequence_file_input & operator=(sequence_file_input const &)=delete
Copy assignment is explicitly deleted, because you can't have multiple access to the same file.
sequence_file_input(stream_type &stream, file_format const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, typename sequence_file_input<>::selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream and the format.
sequence_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
typename traits_type::template quality_container< typename traits_type::quality_alphabet > quality_type
The type of field::qual (std::vector <seqan3::phred42> by default).
Definition: input.hpp:257
typename traits_type::template sequence_container< typename traits_type::sequence_alphabet > sequence_type
The type of field::seq (std::vector <seqan3::dna5> by default).
Definition: input.hpp:251
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: input.hpp:220
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: input.hpp:222
sequence_file_input_options< typename traits_type::sequence_legal_alphabet > sequence_file_input_options_type
The input file options type.
Definition: input.hpp:485
sequence_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> sequence_file_input< typename sequence_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
Deduces the sequence input file type from the stream, the format and the field ids.
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: input.hpp:224
sequence_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: input.hpp:265
Provides seqan3::dna15, container aliases and string literals.
Provides seqan3::dna5, container aliases and string literals.
The <filesystem> header from C++17's standard library.
Provides the seqan3::sequence_file_format_genbank class.
Provides the seqan3::format_sam.
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
Provides the seqan3::detail::in_file_iterator class template.
Resolves to std::ranges::explicitly_convertible_to<type1, type2>().
A more refined container concept than seqan3::container.
The generic concept for sequence file in formats.
The requirements a traits_type for seqan3::sequence_file_input must meet.
Refines seqan3::alphabet and adds assignability.
A concept that indicates whether a writable alphabet represents quality scores.
Provides exceptions used in the I/O module.
Stream concepts.
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
Provides seqan3::phred42 quality scores.
Provides quality alphabet composites.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_record.
A class template that holds a choice of seqan3::field.
Definition: record.hpp:128
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:235
A traits type that specifies input as amino acids.
Definition: input.hpp:169
The default traits for seqan3::sequence_file_input.
Definition: input.hpp:133
char id_alphabet
The alphabet for an identifier string is char.
Definition: input.hpp:150
Type that contains multiple types.
Definition: type_list.hpp:29
Provides traits for seqan3::type_list.
T visit(T... args)