SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_fasta.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
14 #pragma once
15 
16 #include <iterator>
17 #include <string>
18 #include <string_view>
19 #include <vector>
20 
41 #include <seqan3/std/algorithm>
42 #include <seqan3/std/ranges>
43 
44 namespace seqan3
45 {
46 
80 {
83  {
84  { "fasta" },
85  { "fa" },
86  { "fna" },
87  { "ffn" },
88  { "faa" },
89  { "frn" },
90  };
91 };
92 
93 } // namespace seqan3
94 
95 namespace seqan3::detail
96 {
97 
100 template <>
101 class sequence_file_input_format<format_fasta>
102 {
103 public:
105  using format_tag = format_fasta;
106 
110  sequence_file_input_format() noexcept = default;
111  sequence_file_input_format(sequence_file_input_format const &) = delete;
114  sequence_file_input_format & operator=(sequence_file_input_format const &) = delete;
115  sequence_file_input_format(sequence_file_input_format &&) noexcept = default;
116  sequence_file_input_format & operator=(sequence_file_input_format &&) noexcept = default;
117  ~sequence_file_input_format() noexcept = default;
118 
121  template <typename stream_type, // constraints checked by file
122  typename seq_legal_alph_type, bool seq_qual_combined,
123  typename seq_type, // other constraints checked inside function
124  typename id_type,
125  typename qual_type>
126  void read(stream_type & stream,
127  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options,
128  seq_type & sequence,
129  id_type & id,
130  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
131  {
132  auto stream_view = view::istreambuf(stream);
133 
134  // ID
135  read_id(stream_view, options, id);
136 
137  // Sequence
138  read_seq(stream_view, options, sequence);
139  }
140 
141 protected:
144  template <typename stream_view_t,
145  typename seq_legal_alph_type, bool seq_qual_combined,
146  typename id_type>
147  void read_id(stream_view_t & stream_view,
148  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options,
149  id_type & id)
150  {
151  auto const is_id = is_char<'>'> || is_char<';'>;
152 
153  if (!is_id(*begin(stream_view)))
154  throw parse_error{std::string{"Expected to be on beginning of ID, but "} + is_id.msg.str() +
155  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
156 
157  // read id
158  if constexpr (!detail::decays_to_ignore_v<id_type>)
159  {
160  if (options.truncate_ids)
161  {
162  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
163  auto it = stream_view.begin();
164  auto e = stream_view.end();
165  for (; (it != e) && (is_id || is_blank)(*it); ++it)
166  {}
167 
168  bool at_delimiter = false;
169  for (; it != e; ++it)
170  {
171  if ((is_cntrl || is_blank)(*it))
172  {
173  at_delimiter = true;
174  break;
175  }
176  id.push_back(assign_char_to(*it, value_type_t<id_type>{}));
177  }
178 
179  if (!at_delimiter)
180  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
181 
182  for (; (it != e) && ((!is_char<'\n'>)(*it)); ++it)
183  {}
184 
185  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
186 
187  std::ranges::copy(stream_view | std::view::drop_while(is_id || is_blank) // skip leading >
188  | view::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
189  | view::char_to<value_type_t<id_type>>,
190  std::back_inserter(id)); // … ^A is old delimiter
191 
192  // consume rest of line
193  detail::consume(stream_view | view::take_line_or_throw);
194  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
195 
196  }
197  else
198  {
199  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
200  auto it = stream_view.begin();
201  auto e = stream_view.end();
202  for (; (it != e) && (is_id || is_blank)(*it); ++it)
203  {}
204 
205  bool at_delimiter = false;
206  for (; it != e; ++it)
207  {
208  if ((is_char<'\n'>)(*it))
209  {
210  at_delimiter = true;
211  break;
212  }
213  id.push_back(assign_char_to(*it, value_type_t<id_type>{}));
214  }
215 
216  if (!at_delimiter)
217  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
218 
219  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
220 
221  std::ranges::copy(stream_view | view::take_line_or_throw // read line
222  | std::view::drop_while(is_id || is_blank) // skip leading >
223  | view::char_to<value_type_t<id_type>>,
224  std::back_inserter(id));
225  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
226  }
227  }
228  else
229  {
230  detail::consume(stream_view | view::take_line_or_throw);
231  }
232  }
233 
235  template <typename stream_view_t,
236  typename seq_legal_alph_type, bool seq_qual_combined,
237  typename seq_type>
238  void read_seq(stream_view_t & stream_view,
239  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const &,
240  seq_type & seq)
241  {
242  auto constexpr is_id = is_char<'>'> || is_char<';'>;
243 
244  if constexpr (!detail::decays_to_ignore_v<seq_type>)
245  {
246  auto constexpr not_in_alph = !is_in_alphabet<seq_legal_alph_type>;
247 
248  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
249  auto it = stream_view.begin();
250  auto e = stream_view.end();
251  for (; (it != e) && ((!is_id)(*it)); ++it)
252  {
253  if ((is_space || is_digit)(*it))
254  continue;
255  else if (not_in_alph(*it))
256  {
257  throw parse_error{std::string{"Encountered an unexpected letter: "} +
258  not_in_alph.msg.str() +
259  " evaluated to true on " +
260  detail::make_printable(*it)};
261  }
262 
263  seq.push_back(assign_char_to(*it, value_type_t<seq_type>{}));
264  }
265 
266  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
267 
268  std::ranges::copy(stream_view | view::take_until(is_id) // until next header (or end)
269  | std::view::filter(!(is_space || is_digit))// ignore whitespace and numbers
270  | std::view::transform([not_in_alph] (char const c)
271  {
272  if (not_in_alph(c))
273  {
274  throw parse_error{std::string{"Encountered an unexpected letter: "} +
275  not_in_alph.msg.str() +
276  " evaluated to false on " +
277  detail::make_printable(c)};
278  }
279  return c;
280  }) // enforce legal alphabet
281  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
282  std::back_inserter(seq));
283  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
284  }
285  else
286  {
287  detail::consume(stream_view | view::take_until(is_id));
288  }
289  }
290 };
291 
294 template <>
295 class sequence_file_output_format<format_fasta>
296 {
297 public:
299  using format_tag = format_fasta;
300 
304  sequence_file_output_format() noexcept = default; //!< Defaulted.
306  sequence_file_output_format(sequence_file_output_format const &) = delete;
308  sequence_file_output_format & operator=(sequence_file_output_format const &) = delete;
309  sequence_file_output_format(sequence_file_output_format &&) noexcept = default; //!< Defaulted.
310  sequence_file_output_format & operator=(sequence_file_output_format &&) noexcept = default; //!< Defaulted.
311  ~sequence_file_output_format() noexcept = default; //!< Defaulted.
313 
315  template <typename stream_type, // constraints checked by file
316  typename seq_type, // other constraints checked inside function
317  typename id_type,
318  typename qual_type>
319  void write(stream_type & stream,
320  sequence_file_output_options const & options,
321  seq_type && sequence,
322  id_type && id,
323  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
324  {
325 
326  seqan3::ostreambuf_iterator stream_it{stream};
327 
328  // ID
329  if constexpr (detail::decays_to_ignore_v<id_type>)
330  {
331  throw std::logic_error{"The ID field may not be set to ignore when writing FASTA files."};
332  }
333  else
334  {
335  if (empty(id)) //[[unlikely]]
336  throw std::runtime_error{"The ID field may not be empty when writing FASTA files."};
337 
338  write_id(stream_it, options, id);
339  }
340 
341  // Sequence
342  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
343  {
344  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTA files."};
345  }
346  else
347  {
348  if (empty(sequence)) //[[unlikely]]
349  throw std::runtime_error{"The SEQ field may not be empty when writing FASTA files."};
350 
351  write_seq(stream_it, options, sequence);
352  }
353  }
354 
356  template <typename stream_it_t,
357  typename id_type>
358  void write_id(stream_it_t & stream_it,
359  sequence_file_output_options const & options,
360  id_type && id)
361  {
362  if (options.fasta_legacy_id_marker)
363  stream_it = ';';
364  else
365  stream_it = '>';
366 
367  if (options.fasta_blank_before_id)
368  stream_it = ' ';
369 
370  std::ranges::copy(id, stream_it);
371 
372  detail::write_eol(stream_it, options.add_carriage_return);
373  }
374 
376  template <typename stream_it_t,
377  typename seq_type>
378  void write_seq(stream_it_t & stream_it,
379  sequence_file_output_options const & options,
380  seq_type && seq)
381  {
382  if (options.fasta_letters_per_line > 0)
383  {
384  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
385  size_t count = 0;
386  for (auto c : seq)
387  {
388  stream_it = to_char(c);
389  if (++count % options.fasta_letters_per_line == 0)
390  detail::write_eol(stream_it, options.add_carriage_return);
391  }
392  if (count % options.fasta_letters_per_line != 0)
393  detail::write_eol(stream_it, options.add_carriage_return);
394 
395  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
396 
397  //TODO: combining chunk and join is substantially faster than view::interleave (2.5x), why?
398  std::ranges::copy(seq | view::to_char
399  | ranges::view::chunk(options.fasta_letters_per_line)
400  | std::view::join(options.add_carriage_return
401  ? std::string_view{"\r\n"}
402  : std::string_view{"\n"}),
403  stream_it);
404  detail::write_eol(stream_it, options.add_carriage_return);
405  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
406  }
407  else
408  {
409  // No performance workaround here, because transform views alone are fast
410  std::ranges::copy(seq | view::to_char, stream_it);
411  detail::write_eol(stream_it, options.add_carriage_return);
412  }
413  }
414 };
415 
416 } // namespace seqan3::detail
Provides seqan3::SequenceFileInputFormat and auxiliary classes.
Provides seqan3::view::istreambuf.
Provides various shortcuts for common std::ranges functions.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fasta.hpp:83
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:245
Provides seqan3::view::take.
The main SeqAn3 namespace.
Provides seqan3::dna5, container aliases and string literals.
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
Provides seqan3::view::char_to.
Adaptations of concepts from the Ranges TS.
The FastA format (tag).
Definition: format_fasta.hpp:79
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
Provides seqan3::ostream and seqan3::ostreambuf iterator.
Provides seqan3::SequenceFileFormatOut and auxiliary classes.
Adaptations of algorithms from the Ranges TS.
Provides seqan3::view::to_char.
Provides seqan3::sequence_file_input_options.
Provides various transformation traits used by the range module.
Provides aliases for qualified.
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.