SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_fastq.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
14 #pragma once
15 
16 #include <iterator>
17 #include <string>
18 #include <string_view>
19 #include <vector>
20 
21 #include <range/v3/algorithm/copy.hpp>
22 #include <range/v3/view/chunk.hpp>
23 #include <range/v3/view/join.hpp>
24 #include <range/v3/view/remove_if.hpp>
25 
46 #include <seqan3/std/algorithm>
47 #include <seqan3/std/ranges>
48 
49 namespace seqan3
50 {
51 
// NOTE(review): the declaration lines for this definition are not part of this listing (listing numbers
// jump from 51 to 84 and from 84 to 87). The page's cross-reference index attributes listing line 83 to
// "The FastQ format. (tag)" and line 87 to `static std::vector< std::string > file_extensions`, so this is
// presumably the body of the format_fastq tag type with its file_extensions member -- confirm against the
// original header.
84 {
// Initialiser list holding the two extension strings "fastq" and "fq".
87  {
88  { "fastq" },
89  { "fq" }
90  };
91 };
92 
93 } // namespace seqan3
94 
95 namespace seqan3::detail
96 {
97 
//!\brief Reader for the FASTQ format: the sequence_file_input_format specialisation for format_fastq.
//!\details Exposes a single read() member template that parses one record (ID line, sequence, 2nd ID line,
//!         qualities) from a stream.
//!         NOTE(review): this listing skips some source line numbers (e.g. 153 and 160), so two statements
//!         below appear without their first line.
100 template <>
101 class sequence_file_input_format<format_fastq>
102 {
103 public:
    //!\brief The format tag this class is specialised with.
105  using format_tag = format_fastq;
106 
    // Special member functions: default-constructible and movable; copy construction and
    // copy assignment are deleted.
110  sequence_file_input_format() noexcept = default;
111  sequence_file_input_format(sequence_file_input_format const &) = delete;
114  sequence_file_input_format & operator=(sequence_file_input_format const &) = delete;
115  sequence_file_input_format(sequence_file_input_format &&) noexcept = default;
116  sequence_file_input_format & operator=(sequence_file_input_format &&) noexcept = default;
117  ~sequence_file_input_format() noexcept = default;
118 
    /*!\brief Read one FASTQ record from the stream.
     * \param[in,out] stream    The input stream; it is accessed through view::istreambuf.
     * \param[in]     options   Only options.truncate_ids is read by this function.
     * \param[out]    sequence  Converted sequence characters are appended via std::back_inserter
     *                          (skipped when seq_type decays to ignore).
     * \param[out]    id        ID characters are appended via std::back_inserter
     *                          (skipped when id_type decays to ignore).
     * \param[out]    qualities Quality characters are appended via std::back_inserter, or -- when
     *                          seq_qual_combined is true -- copied to the positions starting at
     *                          begin(qualities) + the sequence's size before this call
     *                          (skipped when qual_type decays to ignore).
     * \throws parse_error if the record does not begin with '@', if the 2nd ID line does not begin
     *         with '+', or if a sequence character fails is_in_alphabet<seq_legal_alph_type>.
     *         The *_or_throw view adaptors used below presumably throw on premature end of input
     *         as well -- confirm in their documentation.
     */
121  template <typename stream_type, // constraints checked by file
122  typename seq_legal_alph_type, bool seq_qual_combined,
123  typename seq_type, // other constraints checked inside function
124  typename id_type,
125  typename qual_type>
126  void read(stream_type & stream,
127  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options,
128  seq_type & sequence,
129  id_type & id,
130  qual_type & qualities)
131  {
132  auto stream_view = view::istreambuf(stream);
133  auto stream_it = begin(stream_view);
134 
135  // cache the begin position so we write quals to the same position as seq in seq_qual case
    // Both counters stay 0-based when the sequence is ignored; their difference is the number of
    // quality characters expected further down.
136  size_t sequence_size_before = 0;
137  size_t sequence_size_after = 0;
138  if constexpr (!detail::decays_to_ignore_v<seq_type>)
139  sequence_size_before = size(sequence);
140 
141  /* ID */
142  if (*stream_it != '@') // [[unlikely]]
143  {
144  throw parse_error{std::string{"Expected '@' on beginning of ID line, got: "} +
145  detail::make_printable(*stream_it)};
146  }
147  ++stream_it; // skip '@'
148 
149  if constexpr (!detail::decays_to_ignore_v<id_type>)
150  {
151  if (options.truncate_ids)
152  {
    // NOTE(review): listing line 153 (the start of this statement) is missing from this listing.
    // What remains converts characters with view::char_to and appends them to id; the rest of
    // the ID line is then consumed.
154  | view::char_to<value_type_t<id_type>>,
155  std::back_inserter(id));
156  detail::consume(stream_view | view::take_line_or_throw);
157  }
158  else
159  {
    // NOTE(review): listing line 160 (the start of this statement) is missing from this listing.
161  | view::char_to<value_type_t<id_type>>,
162  std::back_inserter(id));
163  }
164  }
165  else
166  {
    // ID is ignored: discard the whole ID line.
167  detail::consume(stream_view | view::take_line_or_throw);
168  }
169 
170  /* Sequence */
    // seq_view is built lazily here and only consumed by the copy or the counting loop below.
171  auto seq_view = stream_view | view::take_until_or_throw(is_char<'+'>) // until 2nd ID line
172  | std::view::filter(!is_space); // ignore whitespace
173  if constexpr (!detail::decays_to_ignore_v<seq_type>)
174  {
175  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
176  std::ranges::copy(seq_view | std::view::transform([is_legal_alph] (char const c) // enforce legal alphabet
177  {
178  if (!is_legal_alph(c))
179  {
180  throw parse_error{std::string{"Encountered an unexpected letter: "} +
181  is_legal_alph.msg.str() +
182  " evaluated to false on " +
183  detail::make_printable(c)};
184  }
185  return c;
186  })
187  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
188  std::back_inserter(sequence));
189  sequence_size_after = size(sequence);
190  }
191  else // consume, but count
192  {
    // No legality check happens on this path; the characters are only counted so that the
    // quality section below knows how many characters to take.
193  auto it = begin(seq_view);
194  auto it_end = end(seq_view);
195  while (it != it_end)
196  {
197  ++it;
198  ++sequence_size_after;
199  }
200  }
201 
202  /* 2nd ID line */
203  if (*stream_it != '+') // [[unlikely]]
204  {
205  throw parse_error{std::string{"Expected '+' on beginning of 2nd ID line, got: "} +
206  detail::make_printable(*stream_it)};
207  }
    // The content of the 2nd ID line is discarded, not compared with the first ID.
208  detail::consume(stream_view | view::take_line_or_throw);
209 
210  /* Qualities */
    // Take exactly as many non-whitespace characters as sequence characters were read for this record.
211  auto qview = stream_view | std::view::filter(!is_space) // this consumes trailing newline
212  | view::take_exactly_or_throw(sequence_size_after - sequence_size_before);
213  if constexpr (seq_qual_combined)
214  {
215  // seq_qual field implies that they are the same variable
216  assert(std::addressof(sequence) == std::addressof(qualities));
    // Copies onto existing elements starting at the old end position (no back_inserter here).
217  std::ranges::copy(qview | view::char_to<typename value_type_t<qual_type>::quality_alphabet_type>,
218  begin(qualities) + sequence_size_before);
219  }
220  else if constexpr (!detail::decays_to_ignore_v<qual_type>)
221  {
222  std::ranges::copy(qview | view::char_to<value_type_t<qual_type>>,
223  std::back_inserter(qualities));
224  }
225  else
226  {
    // Qualities are ignored: still advance the stream past them.
227  detail::consume(qview);
228  }
229  }
230 };
231 
234 template <>
235 class sequence_file_output_format<format_fastq>
236 {
237 public:
239  using format_tag = format_fastq;
240 
244  sequence_file_output_format() noexcept = default;
245  sequence_file_output_format(sequence_file_output_format const &) = delete;
248  sequence_file_output_format & operator=(sequence_file_output_format const &) = delete;
249  sequence_file_output_format(sequence_file_output_format &&) noexcept = default;
250  sequence_file_output_format & operator=(sequence_file_output_format &&) noexcept = default;
251  ~sequence_file_output_format() noexcept = default;
252 
255  template <typename stream_type, // constraints checked by file
256  typename seq_type, // other constraints checked inside function
257  typename id_type,
258  typename qual_type>
259  void write(stream_type & stream,
260  sequence_file_output_options const & options,
261  seq_type && sequence,
262  id_type && id,
263  qual_type && qualities)
264  {
265  seqan3::ostreambuf_iterator stream_it{stream};
266 
267  // ID
268  if constexpr (detail::decays_to_ignore_v<id_type>)
269  {
270  throw std::logic_error{"The ID field may not be set to ignore when writing FASTQ files."};
271  }
272  else
273  {
274  if (empty(id)) //[[unlikely]]
275  throw std::runtime_error{"The ID field may not be empty when writing FASTQ files."};
276 
277  stream_it = '@';
278  std::ranges::copy(id, stream_it);
279 
280  detail::write_eol(stream_it, options.add_carriage_return);
281  }
282 
283  // Sequence
284  if constexpr (detail::decays_to_ignore_v<seq_type>)
285  {
286  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
287  }
288  else
289  {
290  if (empty(sequence)) //[[unlikely]]
291  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
292 
293  std::ranges::copy(sequence | view::to_char, stream_it);
294 
295  detail::write_eol(stream_it, options.add_carriage_return);
296  }
297 
298  // 2nd ID-line
299  if constexpr (!detail::decays_to_ignore_v<id_type>)
300  {
301  stream_it = '+';
302 
303  if (options.fastq_double_id)
304  std::ranges::copy(id, stream_it);
305 
306  detail::write_eol(stream_it, options.add_carriage_return);
307  }
308 
309  // Quality line
310  if constexpr (detail::decays_to_ignore_v<qual_type>)
311  {
312  throw std::logic_error{"The QUAL and SEQ_QUAL fields may not both be set to ignore when writing FASTQ files."};
313  }
314  else
315  {
316  if (empty(qualities)) //[[unlikely]]
317  throw std::runtime_error{"The SEQ field may not be empty when writing FASTQ files."};
318 
320  {
321  assert(size(sequence) == size(qualities));
322  }
323 
324  std::ranges::copy(qualities | view::to_char, stream_it);
325 
326  detail::write_eol(stream_it, options.add_carriage_return);
327  }
328  }
329 };
330 
331 } // namespace seqan3::detail
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fastq.hpp:87
Provides seqan3::SequenceFileInputFormat and auxiliary classes.
Provides seqan3::view::istreambuf.
The FastQ format. (tag)
Definition: format_fastq.hpp:83
Provides various shortcuts for common std::ranges functions.
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:56
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:245
Provides seqan3::view::take.
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:189
The main SeqAn3 namespace.
Provides seqan3::dna5, container aliases and string literals.
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
auto constexpr is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
Specifies the requirements of a Range type that knows its size in constant time with the size function.
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
Provides seqan3::view::char_to.
auto constexpr take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-line).
Definition: take_line.hpp:92
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
T addressof(T... args)
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:66
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
Provides seqan3::ostream and seqan3::ostreambuf iterator.
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:613
Provides seqan3::SequenceFileFormatOut and auxiliary classes.
T back_inserter(T... args)
Adaptations of algorithms from the Ranges TS.
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:194
Provides seqan3::view::to_char.
Provides seqan3::sequence_file_input_options.
Provides various transformation traits used by the range module.
auto constexpr is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
Provides aliases for qualified.
auto constexpr is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
::ranges::end end
Alias for ranges::end. Returns an iterator to the end of a range.
Definition: ranges:179
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.
constexpr auto filter
A range adaptor that takes a predicate and returns a view of the elements that satisfy the predicate...
Definition: ranges:565
auto constexpr take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly.hpp:94
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:70