SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
14 #pragma once
15 
16 #include <cstdio>
17 #include <iterator>
18 #include <stack>
19 #include <string>
20 #include <string_view>
21 #include <type_traits>
22 #include <vector>
23 
24 #include <range/v3/algorithm/copy.hpp>
25 #include <range/v3/view/chunk.hpp>
26 #include <range/v3/view/drop_while.hpp>
27 #include <range/v3/view/join.hpp>
28 #include <range/v3/view/remove_if.hpp>
29 #include <range/v3/view/transform.hpp>
30 
49 #include <seqan3/std/algorithm>
50 #include <seqan3/std/ranges>
51 
52 namespace seqan3
53 {
91 {
94  {
95  { "dbn" },
96  { "fasta" },
97  { "fa" }
98  };
99 };
100 
101 } // namespace seqan3
102 
103 namespace seqan3::detail
104 {
105 
108 template <>
109 class structure_file_input_format<format_vienna>
110 {
111 public:
113  using format_tag = format_vienna;
114 
118  structure_file_input_format() noexcept = default;
119  structure_file_input_format(structure_file_input_format const &) = delete;
122  structure_file_input_format & operator=(structure_file_input_format const &) = delete;
123  structure_file_input_format(structure_file_input_format &&) noexcept = default;
124  structure_file_input_format & operator=(structure_file_input_format &&) noexcept = default;
125  ~structure_file_input_format() noexcept = default;
126 
129  template <typename stream_type, // constraints checked by file
130  typename seq_legal_alph_type,
131  bool structured_seq_combined,
132  typename seq_type, // other constraints checked inside function
133  typename id_type,
134  typename bpp_type,
135  typename structure_type,
136  typename energy_type,
137  typename react_type,
138  typename comment_type,
139  typename offset_type>
140  void read(stream_type & stream,
141  structure_file_input_options<seq_legal_alph_type, structured_seq_combined> const & options,
142  seq_type & seq,
143  id_type & id,
144  bpp_type & bpp,
145  structure_type & structure,
146  energy_type & energy,
147  react_type & SEQAN3_DOXYGEN_ONLY(react),
148  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
149  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
150  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
151  {
152  auto stream_view = view::istreambuf(stream);
153 
154  // READ ID (if present)
155  auto constexpr is_id = is_char<'>'>;
156  if (is_id(*begin(stream_view)))
157  {
158  if constexpr (!detail::decays_to_ignore_v<id_type>)
159  {
160  if (options.truncate_ids)
161  {
162  std::ranges::copy(stream_view | std::view::drop_while(is_id || is_blank) // skip leading >
163  | view::take_until_or_throw(is_cntrl || is_blank)
164  | view::char_to<value_type_t<id_type>>,
165  std::back_inserter(id));
166  detail::consume(stream_view | view::take_line_or_throw);
167  }
168  else
169  {
170  std::ranges::copy(stream_view | std::view::drop_while(is_id || is_blank) // skip leading >
171  | view::take_line_or_throw
172  | view::char_to<value_type_t<id_type>>,
173  std::back_inserter(id));
174  }
175  }
176  else
177  {
178  detail::consume(stream_view | view::take_line_or_throw);
179  }
180  }
181  else if constexpr (!detail::decays_to_ignore_v<id_type>)
182  {
183  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
184  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
185  {
186  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
187  is_id.msg.str() + " and " + is_legal_seq.msg.str() +
188  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
189  }
190  }
191 
192  // READ SEQUENCE
193  if constexpr (!detail::decays_to_ignore_v<seq_type>)
194  {
195  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
196  std::ranges::copy(stream_view | view::take_line_or_throw // until end of line
197  | std::view::filter(!(is_space || is_digit)) // ignore whitespace and numbers
198  | std::view::transform([is_legal_seq](char const c)
199  {
200  if (!is_legal_seq(c)) // enforce legal alphabet
201  {
202  throw parse_error{std::string{"Encountered an unexpected letter: "} +
203  is_legal_seq.msg.str() +
204  " evaluated to false on " +
205  detail::make_printable(c)};
206  }
207  return c;
208  })
209  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
210  std::back_inserter(seq));
211  }
212  else
213  {
214  detail::consume(stream_view | view::take_line_or_throw);
215  }
216 
217  // READ STRUCTURE (if present)
218  if constexpr (!detail::decays_to_ignore_v<structure_type>)
219  {
220  if constexpr (structured_seq_combined)
221  {
222  assert(std::addressof(seq) == std::addressof(structure));
223  using alph_type = typename value_type_t<structure_type>::structure_alphabet_type;
224  std::ranges::copy(read_structure<alph_type>(stream_view), begin(structure));
225 
226  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
227  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
228  }
229  else
230  {
231  using alph_type = value_type_t<structure_type>;
232  std::ranges::copy(read_structure<alph_type>(stream_view), std::back_inserter(structure));
233 
234  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
235  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
236  }
237  if constexpr (!detail::decays_to_ignore_v<seq_type>)
238  if (size(seq) != size(structure))
239  throw parse_error{"Found sequence and associated structure of different length."};
240  }
241  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
242  {
243  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
244 
245  if constexpr (!detail::decays_to_ignore_v<seq_type>)
246  if (size(seq) != size(bpp))
247  throw parse_error{"Found sequence and associated structure of different length."};
248  }
249  else
250  {
251  detail::consume(stream_view | view::take_until(is_space)); // until whitespace
252  }
253 
254  // READ ENERGY (if present)
255  if constexpr (!detail::decays_to_ignore_v<energy_type>)
256  {
257  std::string e_str = stream_view | view::take_line
258  | std::view::filter(!(is_space || is_char<'('> || is_char<')'>));
259  if (!e_str.empty())
260  {
261  size_t num_processed;
262  energy = std::stod(e_str, &num_processed);
263  if (num_processed != e_str.size()) // [[unlikely]]
264  {
265  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
266  }
267  }
268  }
269  else
270  {
271  detail::consume(stream_view | view::take_line);
272  }
273  detail::consume(stream_view | view::take_until(!is_space));
274  }
275 
276 private:
284  template <typename alph_type, typename stream_view_type>
285  auto read_structure(stream_view_type & stream_view)
286  {
287  auto constexpr is_legal_structure = is_in_alphabet<alph_type>;
288  return stream_view | view::take_until(is_space) // until whitespace
289  | std::view::transform([is_legal_structure](char const c)
290  {
291  if (!is_legal_structure(c))
292  {
293  throw parse_error{
294  std::string{"Encountered an unexpected letter: "} +
295  is_legal_structure.msg.str() +
296  " evaluated to false on " + detail::make_printable(c)};
297  }
298  return c;
299  }) // enforce legal alphabet
300  | view::char_to<alph_type>; // convert to actual target alphabet
301  }
302 };
303 
306 template <>
307 class structure_file_output_format<format_vienna>
308 {
309 public:
311  using format_tag = format_vienna;
312 
316  structure_file_output_format() noexcept = default; //!< Defaulted.
318  structure_file_output_format(structure_file_output_format const &) = delete;
320  structure_file_output_format & operator=(structure_file_output_format const &) = delete;
321  structure_file_output_format(structure_file_output_format &&) noexcept = default; //!< Defaulted.
322  structure_file_output_format & operator=(structure_file_output_format &&) noexcept = default; //!< Defaulted.
323  ~structure_file_output_format() noexcept = default; //!< Defaulted.
325 
327  template <typename stream_type, // constraints checked by file
328  typename seq_type, // other constraints checked inside function
329  typename id_type,
330  typename bpp_type,
331  typename structure_type,
332  typename energy_type,
333  typename react_type,
334  typename comment_type,
335  typename offset_type>
336  void write(stream_type & stream,
337  structure_file_output_options const & options,
338  seq_type && seq,
339  id_type && id,
340  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
341  structure_type && structure,
342  energy_type && energy,
343  react_type && SEQAN3_DOXYGEN_ONLY(react),
344  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
345  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
346  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
347  {
348  seqan3::ostreambuf_iterator stream_it{stream};
349 
350  // WRITE ID (optional)
351  if constexpr (!detail::decays_to_ignore_v<id_type>)
352  {
353  if (!empty(id))
354  {
355  stream_it = '>';
356  stream_it = ' ';
357  std::ranges::copy(id, stream_it);
358  detail::write_eol(stream_it, options.add_carriage_return);
359  }
360  }
361 
362  // WRITE SEQUENCE
363  if constexpr (!detail::decays_to_ignore_v<seq_type>)
364  {
365  if (empty(seq)) //[[unlikely]]
366  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
367 
368  std::ranges::copy(seq | view::to_char, stream_it);
369  detail::write_eol(stream_it, options.add_carriage_return);
370  }
371  else
372  {
373  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
374  "when writing Vienna files."};
375  }
376 
377  // WRITE STRUCTURE (optional)
378  if constexpr (!detail::decays_to_ignore_v<structure_type>)
379  {
380  if (!empty(structure))
381  std::ranges::copy(structure | view::to_char, stream_it);
382 
383  // WRITE ENERGY (optional)
384  if constexpr (!detail::decays_to_ignore_v<energy_type>)
385  {
386  if (energy)
387  {
388 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
389 // auto [endptr, ec] = std::to_chars(str.data(),
390 // str.data() + str.size(),
391 // energy,
392 // std::chars_format::fixed,
393 // options.precision);
394 // if (ec == std::errc())
395 // std::ranges::copy(str.data(), endptr, stream_it);
396 // else
397 // throw std::runtime_error{"The energy could not be transformed into a string."};
398 
399  stream_it = ' ';
400  stream_it = '(';
401 
402  std::array<char, 100> str;
403  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
404  if (len < 0 || len >= 100)
405  throw std::runtime_error{"The energy could not be transformed into a string."};
406  std::ranges::copy(str.data(), str.data() + len, stream_it);
407 
408  stream_it = ')';
409  }
410  }
411  detail::write_eol(stream_it, options.add_carriage_return);
412  }
413  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
414  {
415  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
416  }
417  }
418 };
419 
420 } // namespace seqan3::detail
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:90
Provides seqan3::StructureFileInputFormat.
Provides seqan3::view::istreambuf.
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
Provides various shortcuts for common std::ranges functions.
Provides seqan3::structure_file_input_options.
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:245
Provides seqan3::view::take.
The main SeqAn3 namespace.
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
Helper functions (e.g. conversions) for the structure IO submodule.
Provides seqan3::structure_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
Provides seqan3::view::char_to.
Adaptations of concepts from the Ranges TS.
Provides seqan3::StructureFileOutputFormat and auxiliary classes.
Provides the WUSS format for RNA structure.
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
Provides seqan3::ostream and seqan3::ostreambuf iterator.
Adaptations of algorithms from the Ranges TS.
Provides seqan3::view::to_char.
Provides various transformation traits used by the range module.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:94