SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_fasta.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <iterator>
17 #include <seqan3/std/ranges>
18 #include <string>
19 #include <string_view>
20 #include <vector>
21 
44 
45 namespace seqan3
46 {
47 
81 {
82 public:
// Special member functions: all six are explicitly defaulted and declared noexcept.
86  format_fasta() noexcept = default; // default constructor
87  format_fasta(format_fasta const &) noexcept = default; // copy constructor
88  format_fasta & operator=(format_fasta const &) noexcept = default; // copy assignment
89  format_fasta(format_fasta &&) noexcept = default; // move constructor
90  format_fasta & operator=(format_fasta &&) noexcept = default; // move assignment
91  ~format_fasta() noexcept = default; // destructor
92 
94 
//!\brief The valid file extensions for this format; note that you can modify this value.
static inline std::vector<std::string> file_extensions{"fasta", "fa", "fna", "ffn", "faa", "frn", "fas"};
106 
107 protected:
109 #ifdef SEQAN3_DEPRECATED_310
110  template <typename stream_type, // constraints checked by file
111  typename legal_alph_type, bool seq_qual_combined,
112  typename seq_type, // other constraints checked inside function
113  typename id_type,
114  typename qual_type>
115  void read_sequence_record(stream_type & stream,
117  seq_type & sequence,
118  id_type & id,
119  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
120 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
121  template <typename stream_type, // constraints checked by file
122  typename legal_alph_type,
123  typename seq_type, // other constraints checked inside function
124  typename id_type,
125  typename qual_type>
126  void read_sequence_record(stream_type & stream,
128  seq_type & sequence,
129  id_type & id,
130  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
131 #endif // SEQAN3_DEPRECATED_310
132  {
133  auto stream_view = detail::istreambuf(stream);
134 
135  // ID
136  read_id(stream_view, options, id);
137 
138  // Sequence
139  read_seq(stream_view, options, sequence);
140  }
141 
143  template <typename stream_type, // constraints checked by file
144  typename seq_type, // other constraints checked inside function
145  typename id_type,
146  typename qual_type>
147  void write_sequence_record(stream_type & stream,
148  sequence_file_output_options const & options,
149  seq_type && sequence,
150  id_type && id,
151  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
152  {
153  seqan3::detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
154 
155  // ID
156  if constexpr (detail::decays_to_ignore_v<id_type>)
157  {
158  throw std::logic_error{"The ID field may not be set to ignore when writing FASTA files."};
159  }
160  else
161  {
162  if (std::ranges::empty(id)) //[[unlikely]]
163  throw std::runtime_error{"The ID field may not be empty when writing FASTA files."};
164 
165  write_id(stream_it, options, id);
166  }
167 
168  // Sequence
169  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
170  {
171  throw std::logic_error{"The SEQ and SEQ_QUAL fields may not both be set to ignore when writing FASTA files."};
172  }
173  else
174  {
175  if (std::ranges::empty(sequence)) //[[unlikely]]
176  throw std::runtime_error{"The SEQ field may not be empty when writing FASTA files."};
177 
178  write_seq(stream_it, options, sequence);
179  }
180  }
181 
182 private:
185 #ifdef SEQAN3_DEPRECATED_310
186  template <typename stream_view_t,
187  typename seq_legal_alph_type, bool seq_qual_combined,
188  typename id_type>
189  void read_id(stream_view_t & stream_view,
191  id_type & id)
192 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
193  template <typename stream_view_t,
194  typename seq_legal_alph_type,
195  typename id_type>
196  void read_id(stream_view_t & stream_view,
198  id_type & id)
199 #endif // SEQAN3_DEPRECATED_310
200  {
201  auto const is_id = is_char<'>'> || is_char<';'>;
202 
203  if (!is_id(*begin(stream_view)))
204  throw parse_error{std::string{"Expected to be on beginning of ID, but "} + is_id.msg +
205  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
206 
207  // read id
208  if constexpr (!detail::decays_to_ignore_v<id_type>)
209  {
210  if (options.truncate_ids)
211  {
212  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
213  auto it = stream_view.begin();
214  auto e = stream_view.end();
215  for (; (it != e) && (is_id || is_blank)(*it); ++it)
216  {}
217 
218  bool at_delimiter = false;
219  for (; it != e; ++it)
220  {
221  if ((is_cntrl || is_blank)(*it))
222  {
223  at_delimiter = true;
224  break;
225  }
226  id.push_back(assign_char_to(*it, std::ranges::range_value_t<id_type>{}));
227  }
228 
229  if (!at_delimiter)
230  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
231 
232  for (; (it != e) && ((!is_char<'\n'>)(*it)); ++it)
233  {}
234 
235  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
236 
237  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
238  | detail::take_until_or_throw(is_cntrl || is_blank) // read ID until delimiter…
239  | views::char_to<std::ranges::range_value_t<id_type>>,
240  std::cpp20::back_inserter(id)); // … ^A is old delimiter
241 
242  // consume rest of line
243  detail::consume(stream_view | detail::take_line_or_throw);
244  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
245 
246  }
247  else
248  {
249  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
250  auto it = stream_view.begin();
251  auto e = stream_view.end();
252  for (; (it != e) && (is_id || is_blank)(*it); ++it)
253  {}
254 
255  bool at_delimiter = false;
256  for (; it != e; ++it)
257  {
258  if ((is_char<'\n'>)(*it))
259  {
260  at_delimiter = true;
261  break;
262  }
263  id.push_back(assign_char_to(*it, std::ranges::range_value_t<id_type>{}));
264  }
265 
266  if (!at_delimiter)
267  throw unexpected_end_of_input{"FastA ID line did not end in newline."};
268 
269  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
270 
271  std::ranges::copy(stream_view | detail::take_line_or_throw // read line
272  | std::views::drop_while(is_id || is_blank) // skip leading >
273  | views::char_to<std::ranges::range_value_t<id_type>>,
274  std::cpp20::back_inserter(id));
275  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
276  }
277  }
278  else
279  {
280  detail::consume(stream_view | detail::take_line_or_throw);
281  }
282  }
283 
285 #ifdef SEQAN3_DEPRECATED_310
286  template <typename stream_view_t,
287  typename seq_legal_alph_type, bool seq_qual_combined,
288  typename seq_type>
289  void read_seq(stream_view_t & stream_view,
290  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const &,
291  seq_type & seq)
292 #else // ^^^ before seqan 3.1 / after seqan 3.1 vvv
293  template <typename stream_view_t,
294  typename seq_legal_alph_type,
295  typename seq_type>
296  void read_seq(stream_view_t & stream_view,
297  sequence_file_input_options<seq_legal_alph_type> const &,
298  seq_type & seq)
299 #endif // SEQAN3_DEPRECATED_310
300  {
301  auto constexpr is_id = is_char<'>'> || is_char<';'>;
302 
303  if constexpr (!detail::decays_to_ignore_v<seq_type>)
304  {
305  auto constexpr is_legal_alph = char_is_valid_for<seq_legal_alph_type>;
306 
307  #if SEQAN3_WORKAROUND_VIEW_PERFORMANCE
308  auto it = stream_view.begin();
309  auto e = stream_view.end();
310 
311  if (it == e)
312  throw unexpected_end_of_input{"No sequence information given!"};
313 
314  for (; (it != e) && ((!is_id)(*it)); ++it)
315  {
316  if ((is_space || is_digit)(*it))
317  continue;
318  else if (!is_legal_alph(*it))
319  {
320  throw parse_error{std::string{"Encountered an unexpected letter: "} +
321  "char_is_valid_for<" +
322  detail::type_name_as_string<seq_legal_alph_type> +
323  "> evaluated to false on " +
324  detail::make_printable(*it)};
325  }
326 
327  seq.push_back(assign_char_to(*it, std::ranges::range_value_t<seq_type>{}));
328  }
329 
330  #else // ↑↑↑ WORKAROUND | ORIGINAL ↓↓↓
331 
332  if (std::ranges::begin(stream_view) == std::ranges::end(stream_view))
333  throw unexpected_end_of_input{"No sequence information given!"};
334 
335  std::ranges::copy(stream_view | detail::take_until(is_id) // until next header (or end)
336  | std::views::filter(!(is_space || is_digit))// ignore whitespace and numbers
337  | std::views::transform([is_legal_alph] (char const c)
338  {
339  if (!is_legal_alph(c))
340  {
341  throw parse_error{std::string{"Encountered an unexpected letter: "} +
342  "char_is_valid_for<" +
343  detail::type_name_as_string<seq_legal_alph_type> +
344  "> evaluated to false on " +
345  detail::make_printable(c)};
346  }
347  return c;
348  }) // enforce legal alphabet
349  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
350  std::cpp20::back_inserter(seq));
351  #endif // SEQAN3_WORKAROUND_VIEW_PERFORMANCE
352  }
353  else
354  {
355  detail::consume(stream_view | detail::take_until(is_id));
356  }
357  }
358 
360  template <typename stream_it_t, typename id_type>
361  void write_id(stream_it_t & stream_it, sequence_file_output_options const & options, id_type && id)
362  {
363  if (options.fasta_legacy_id_marker)
364  stream_it = ';';
365  else
366  stream_it = '>';
367 
368  if (options.fasta_blank_before_id)
369  stream_it = ' ';
370 
371  stream_it.write_range(id);
372  stream_it.write_end_of_line(options.add_carriage_return);
373  }
374 
376  template <typename stream_it_t, typename seq_type>
377  void write_seq(stream_it_t & stream_it, sequence_file_output_options const & options, seq_type && seq)
378  {
379  auto char_sequence = seq | views::to_char;
380 
381  if (options.fasta_letters_per_line > 0)
382  {
383  /* Using `views::interleave` is probably the way to go but that needs performance-tuning.*/
384  auto it = std::ranges::begin(char_sequence);
385  auto end = std::ranges::end(char_sequence);
386 
387  while (it != end)
388  {
389  /* Note: This solution is slightly suboptimal for sized but non-random-access ranges.*/
390  auto current_end = it;
391  size_t steps = std::ranges::advance(current_end, options.fasta_letters_per_line, end);
392  using subrange_t = std::ranges::subrange<decltype(it), decltype(it), std::ranges::subrange_kind::sized>;
393  it = stream_it.write_range(subrange_t{it, current_end, (options.fasta_letters_per_line - steps)});
394  stream_it.write_end_of_line(options.add_carriage_return);
395  }
396  }
397  else
398  {
399  stream_it.write_range(char_sequence);
400  stream_it.write_end_of_line(options.add_carriage_return);
401  }
402  }
403 };
404 
405 } // namespace seqan3
Adaptations of algorithms from the Ranges TS.
Provides aliases for qualified.
Core alphabet concept and free function/type trait wrappers.
Provides seqan3::views::char_to.
Provides seqan3::views::to_char.
Provides alphabet adaptations for standard char types.
The FastA format.
Definition: format_fasta.hpp:81
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_fasta.hpp:97
format_fasta() noexcept=default
Defaulted.
void read_sequence_record(stream_type &stream, sequence_file_input_options< legal_alph_type, seq_qual_combined > const &options, seq_type &sequence, id_type &id, qual_type &qualities)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_fasta.hpp:115
void write_sequence_record(stream_type &stream, sequence_file_output_options const &options, seq_type &&sequence, id_type &&id, qual_type &&qualities)
Write the given fields to the specified stream.
Definition: format_fasta.hpp:147
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides seqan3::dna5, container aliases and string literals.
Provides seqan3::detail::fast_ostreambuf_iterator.
Provides seqan3::detail::ignore_output_iterator for writing to null stream.
The generic concept for a (biological) sequence.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Adaptations of concepts from the Ranges TS.
Provides seqan3::sequence_file_input_format and auxiliary classes.
Provides seqan3::sequence_file_input_options.
Provides seqan3::sequence_file_output_format and auxiliary classes.
Provides seqan3::sequence_file_output_options.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:30
The options type defines various option members that influence the behaviour of all or some formats.
Definition: output_options.hpp:22
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
[DEPRECATED] Provides seqan3::views::take.
Provides traits to inspect some information of a type, for example its name.
Provides character predicates for tokenisation.