SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cstdio>
16 #include <iterator>
17 #include <stack>
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include <vector>
22 
43 #include <seqan3/std/algorithm>
44 #include <seqan3/std/ranges>
45 
46 namespace seqan3
47 {
85 {
86 public:
90  format_vienna() noexcept = default;
91  format_vienna(format_vienna const &) noexcept = default;
92  format_vienna & operator=(format_vienna const &) noexcept = default;
93  format_vienna(format_vienna &&) noexcept = default;
94  format_vienna & operator=(format_vienna &&) noexcept = default;
95  ~format_vienna() noexcept = default;
96 
100  {
101  { "dbn" },
102  { "fasta" },
103  { "fa" }
104  };
105 
106 protected:
108  template <typename stream_type, // constraints checked by file
109  typename seq_legal_alph_type,
110  bool structured_seq_combined,
111  typename seq_type, // other constraints checked inside function
112  typename id_type,
113  typename bpp_type,
114  typename structure_type,
115  typename energy_type,
116  typename react_type,
117  typename comment_type,
118  typename offset_type>
119  void read_structure_record(stream_type & stream,
121  seq_type & seq,
122  id_type & id,
123  bpp_type & bpp,
124  structure_type & structure,
125  energy_type & energy,
126  react_type & SEQAN3_DOXYGEN_ONLY(react),
127  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
128  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
129  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
130  {
131  auto stream_view = views::istreambuf(stream);
132 
133  // READ ID (if present)
134  auto constexpr is_id = is_char<'>'>;
135  if (is_id(*begin(stream_view)))
136  {
137  if constexpr (!detail::decays_to_ignore_v<id_type>)
138  {
139  if (options.truncate_ids)
140  {
141  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
142  | views::take_until_or_throw(is_cntrl || is_blank)
143  | views::char_to<value_type_t<id_type>>,
144  std::ranges::back_inserter(id));
145  detail::consume(stream_view | views::take_line_or_throw);
146  }
147  else
148  {
149  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
150  | views::take_line_or_throw
151  | views::char_to<value_type_t<id_type>>,
152  std::ranges::back_inserter(id));
153  }
154  }
155  else
156  {
157  detail::consume(stream_view | views::take_line_or_throw);
158  }
159  }
160  else if constexpr (!detail::decays_to_ignore_v<id_type>)
161  {
162  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
163  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
164  {
165  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
166  is_id.msg + " and " + is_legal_seq.msg +
167  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
168  }
169  }
170 
171  // READ SEQUENCE
172  if constexpr (!detail::decays_to_ignore_v<seq_type>)
173  {
174  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
175  std::ranges::copy(stream_view | views::take_line_or_throw // until end of line
176  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
177  | std::views::transform([is_legal_seq](char const c)
178  {
179  if (!is_legal_seq(c)) // enforce legal alphabet
180  {
181  throw parse_error{std::string{"Encountered an unexpected letter: "} +
182  is_legal_seq.msg +
183  " evaluated to false on " +
184  detail::make_printable(c)};
185  }
186  return c;
187  })
188  | views::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
189  std::ranges::back_inserter(seq));
190  }
191  else
192  {
193  detail::consume(stream_view | views::take_line_or_throw);
194  }
195 
196  // READ STRUCTURE (if present)
197  [[maybe_unused]] int64_t structure_length{};
198  if constexpr (!detail::decays_to_ignore_v<structure_type>)
199  {
200  if constexpr (structured_seq_combined)
201  {
202  assert(std::addressof(seq) == std::addressof(structure));
203  using alph_type = typename value_type_t<structure_type>::structure_alphabet_type;
204  // We need the structure_length parameter to count the length of the structure while reading
205  // because we cannot infer it from the (already resized) structure_seq object.
206  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
207  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
208 
209  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
210  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
211  }
212  else
213  {
214  using alph_type = value_type_t<structure_type>;
215  std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::back_inserter(structure));
216  structure_length = std::ranges::distance(structure);
217 
218  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
219  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
220  }
221  }
222  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
223  {
224  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
225  structure_length = std::ranges::distance(bpp);
226  }
227  else
228  {
229  detail::consume(stream_view | views::take_until(is_space)); // until whitespace
230  }
231 
232  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
233  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
234  {
235  if (std::ranges::distance(seq) != structure_length)
236  throw parse_error{"Found sequence and associated structure of different length."};
237  }
238 
239  // READ ENERGY (if present)
240  if constexpr (!detail::decays_to_ignore_v<energy_type>)
241  {
242  std::string e_str = stream_view | views::take_line
243  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
244  | views::to<std::string>;
245  if (!e_str.empty())
246  {
247  size_t num_processed;
248  energy = std::stod(e_str, &num_processed);
249  if (num_processed != e_str.size()) // [[unlikely]]
250  {
251  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
252  }
253  }
254  }
255  else
256  {
257  detail::consume(stream_view | views::take_line);
258  }
259  detail::consume(stream_view | views::take_until(!is_space));
260  }
261 
263  template <typename stream_type, // constraints checked by file
264  typename seq_type, // other constraints checked inside function
265  typename id_type,
266  typename bpp_type,
267  typename structure_type,
268  typename energy_type,
269  typename react_type,
270  typename comment_type,
271  typename offset_type>
272  void write_structure_record(stream_type & stream,
273  structure_file_output_options const & options,
274  seq_type && seq,
275  id_type && id,
276  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
277  structure_type && structure,
278  energy_type && energy,
279  react_type && SEQAN3_DOXYGEN_ONLY(react),
280  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
281  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
282  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
283  {
284  seqan3::ostreambuf_iterator stream_it{stream};
285 
286  // WRITE ID (optional)
287  if constexpr (!detail::decays_to_ignore_v<id_type>)
288  {
289  if (!empty(id))
290  {
291  stream_it = '>';
292  stream_it = ' ';
293  std::ranges::copy(id, stream_it);
294  detail::write_eol(stream_it, options.add_carriage_return);
295  }
296  }
297 
298  // WRITE SEQUENCE
299  if constexpr (!detail::decays_to_ignore_v<seq_type>)
300  {
301  if (empty(seq)) //[[unlikely]]
302  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
303 
304  std::ranges::copy(seq | views::to_char, stream_it);
305  detail::write_eol(stream_it, options.add_carriage_return);
306  }
307  else
308  {
309  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
310  "when writing Vienna files."};
311  }
312 
313  // WRITE STRUCTURE (optional)
314  if constexpr (!detail::decays_to_ignore_v<structure_type>)
315  {
316  if (!empty(structure))
317  std::ranges::copy(structure | views::to_char, stream_it);
318 
319  // WRITE ENERGY (optional)
320  if constexpr (!detail::decays_to_ignore_v<energy_type>)
321  {
322  if (energy)
323  {
324 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
325 // auto [endptr, ec] = std::to_chars(str.data(),
326 // str.data() + str.size(),
327 // energy,
328 // std::chars_format::fixed,
329 // options.precision);
330 // if (ec == std::errc())
331 // std::ranges::copy(str.data(), endptr, stream_it);
332 // else
333 // throw std::runtime_error{"The energy could not be transformed into a string."};
334 
335  stream_it = ' ';
336  stream_it = '(';
337 
338  std::array<char, 100> str;
339  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
340  if (len < 0 || len >= 100)
341  throw std::runtime_error{"The energy could not be transformed into a string."};
342  std::ranges::copy(str.data(), str.data() + len, stream_it);
343 
344  stream_it = ')';
345  }
346  }
347  detail::write_eol(stream_it, options.add_carriage_return);
348  }
349  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
350  {
351  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
352  }
353  }
354 
355 private:
362  template <typename alph_type, typename stream_view_type>
363  auto read_structure(stream_view_type & stream_view)
364  {
365  auto constexpr is_legal_structure = is_in_alphabet<alph_type>;
366  return stream_view | views::take_until(is_space) // until whitespace
367  | std::views::transform([is_legal_structure](char const c)
368  {
369  if (!is_legal_structure(c))
370  {
371  throw parse_error{
372  std::string{"Encountered an unexpected letter: "} +
373  is_legal_structure.msg +
374  " evaluated to false on " + detail::make_printable(c)};
375  }
376  return c;
377  }) // enforce legal alphabet
378  | views::char_to<alph_type>; // convert to actual target alphabet
379  }
380 };
381 
382 } // namespace seqan3::detail
wuss.hpp
Provides the WUSS format for RNA structure.
shortcuts.hpp
Provides various shortcuts for common std::ranges functions.
seqan3::field::seq
The "sequence", usually a range of nucleotides or amino acids.
take_until.hpp
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
seqan3::format_vienna::~format_vienna
~format_vienna() noexcept=default
detail.hpp
Helper functions (e.g. conversions) for the structure IO submodule.
seqan3::field::offset
Sequence (SEQ) relative start position (0-based), unsigned value.
seqan3::field::bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
vector
stack
input_format_concept.hpp
Provides seqan3::structure_file_input_format.
seqan3::field::structure
Fixed interactions, usually a string of structure alphabet characters.
algorithm
Adaptations of algorithms from the Ranges TS.
seqan3::field::energy
Energy of a folded sequence, represented by one float number.
input_options.hpp
Provides seqan3::structure_file_input_options.
seqan3::format_vienna::read_structure_record
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:119
output_options.hpp
Provides seqan3::structure_file_output_options.
to.hpp
Provides seqan3::views::to.
range.hpp
Provides various transformation traits used by the range module.
seqan3::field::comment
Comment field of arbitrary content, usually a string.
seqan3::format_vienna::format_vienna
format_vienna() noexcept=default
Defaulted.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:36
seqan3::structure_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:27
seqan3::format_vienna
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:84
seqan3::format_vienna::file_extensions
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:100
output_format_concept.hpp
Provides seqan3::structure_file_output_format and auxiliary classes.
seqan3::format_vienna::operator=
format_vienna & operator=(format_vienna const &) noexcept=default
Defaulted.
char.hpp
Provides alphabet adaptations for standard char types.
to_char.hpp
Provides seqan3::views::to_char.
take.hpp
Provides seqan3::views::take.
ranges
Adaptations of concepts from the Ranges TS.
seqan3::field::react
Reactivity values of the sequence characters given in a vector of float numbers.
predicate.hpp
Provides character predicates for tokenisation.
seqan3::field::react_err
Reactivity error values given in a vector corresponding to REACT.
istreambuf.hpp
Provides seqan3::views::istreambuf.
take_line.hpp
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
iterator.hpp
Provides seqan3::ostream and seqan3::ostreambuf iterator.
misc.hpp
Provides various utility functions.
misc.hpp
Provides various utility functions.
string_view
cstdio
char_to.hpp
Provides seqan3::views::char_to.
seqan3::views::istreambuf
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
string