SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <seqan3/std/algorithm>
16 #include <cstdio>
17 #include <iterator>
18 #include <seqan3/std/ranges>
19 #include <stack>
20 #include <string>
21 #include <string_view>
22 #include <type_traits>
23 #include <vector>
24 
45 
46 namespace seqan3
47 {
85 {
86 public:
90  format_vienna() noexcept = default;
91  format_vienna(format_vienna const &) noexcept = default;
92  format_vienna & operator=(format_vienna const &) noexcept = default;
93  format_vienna(format_vienna &&) noexcept = default;
94  format_vienna & operator=(format_vienna &&) noexcept = default;
95  ~format_vienna() noexcept = default;
96 
98 
100  static inline std::vector<std::string> file_extensions
101  {
102  { "dbn" },
103  { "fasta" },
104  { "fa" }
105  };
106 
107 protected:
109  template <typename stream_type, // constraints checked by file
110  typename seq_legal_alph_type,
111  bool structured_seq_combined,
112  typename seq_type, // other constraints checked inside function
113  typename id_type,
114  typename bpp_type,
115  typename structure_type,
116  typename energy_type,
117  typename react_type,
118  typename comment_type,
119  typename offset_type>
120  void read_structure_record(stream_type & stream,
122  seq_type & seq,
123  id_type & id,
124  bpp_type & bpp,
125  structure_type & structure,
126  energy_type & energy,
127  react_type & SEQAN3_DOXYGEN_ONLY(react),
128  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
129  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
130  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
131  {
132  auto stream_view = detail::istreambuf(stream);
133 
134  // READ ID (if present)
135  auto constexpr is_id = is_char<'>'>;
136  if (is_id(*begin(stream_view)))
137  {
138  if constexpr (!detail::decays_to_ignore_v<id_type>)
139  {
140  if (options.truncate_ids)
141  {
142  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
143  | detail::take_until_or_throw(is_cntrl || is_blank)
144  | views::char_to<std::ranges::range_value_t<id_type>>,
145  std::cpp20::back_inserter(id));
146  detail::consume(stream_view | detail::take_line_or_throw);
147  }
148  else
149  {
150  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
151  | detail::take_line_or_throw
152  | views::char_to<std::ranges::range_value_t<id_type>>,
153  std::cpp20::back_inserter(id));
154  }
155  }
156  else
157  {
158  detail::consume(stream_view | detail::take_line_or_throw);
159  }
160  }
161  else if constexpr (!detail::decays_to_ignore_v<id_type>)
162  {
163  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
164  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
165  {
166  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
167  is_id.msg + " and char_is_valid_for<" +
168  detail::type_name_as_string<seq_legal_alph_type> + ">" +
169  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
170  }
171  }
172 
173  // READ SEQUENCE
174  if constexpr (!detail::decays_to_ignore_v<seq_type>)
175  {
176  auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
177  std::ranges::copy(stream_view | detail::take_line_or_throw // until end of line
178  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
179  | std::views::transform([is_legal_seq](char const c)
180  {
181  if (!is_legal_seq(c)) // enforce legal alphabet
182  {
183  throw parse_error{std::string{"Encountered an unexpected letter: "} +
184  "char_is_valid_for<" +
185  detail::type_name_as_string<seq_legal_alph_type> +
186  "> evaluated to false on " +
187  detail::make_printable(c)};
188  }
189  return c;
190  })
191  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
192  std::cpp20::back_inserter(seq));
193  }
194  else
195  {
196  detail::consume(stream_view | detail::take_line_or_throw);
197  }
198 
199  // READ STRUCTURE (if present)
200  [[maybe_unused]] int64_t structure_length{};
201  if constexpr (!detail::decays_to_ignore_v<structure_type>)
202  {
203  if constexpr (structured_seq_combined)
204  {
205  assert(std::addressof(seq) == std::addressof(structure));
206  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
207  // We need the structure_length parameter to count the length of the structure while reading
208  // because we cannot infer it from the (already resized) structure_seq object.
209  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
210  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
211 
212  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
213  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
214  }
215  else
216  {
217  using alph_type = std::ranges::range_value_t<structure_type>;
218  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
219  structure_length = std::ranges::distance(structure);
220 
221  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
222  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
223  }
224  }
225  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
226  {
227  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
228  structure_length = std::ranges::distance(bpp);
229  }
230  else
231  {
232  detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
233  }
234 
235  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
236  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
237  {
238  if (std::ranges::distance(seq) != structure_length)
239  throw parse_error{"Found sequence and associated structure of different length."};
240  }
241 
242  // READ ENERGY (if present)
243  if constexpr (!detail::decays_to_ignore_v<energy_type>)
244  {
245  std::string e_str = stream_view | detail::take_line
246  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
247  | views::to<std::string>;
248  if (!e_str.empty())
249  {
250  size_t num_processed;
251  energy = std::stod(e_str, &num_processed);
252  if (num_processed != e_str.size()) // [[unlikely]]
253  {
254  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
255  }
256  }
257  }
258  else
259  {
260  detail::consume(stream_view | detail::take_line);
261  }
262  detail::consume(stream_view | detail::take_until(!is_space));
263  }
264 
266  template <typename stream_type, // constraints checked by file
267  typename seq_type, // other constraints checked inside function
268  typename id_type,
269  typename bpp_type,
270  typename structure_type,
271  typename energy_type,
272  typename react_type,
273  typename comment_type,
274  typename offset_type>
275  void write_structure_record(stream_type & stream,
276  structure_file_output_options const & options,
277  seq_type && seq,
278  id_type && id,
279  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
280  structure_type && structure,
281  energy_type && energy,
282  react_type && SEQAN3_DOXYGEN_ONLY(react),
283  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
284  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
285  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
286  {
287  std::cpp20::ostreambuf_iterator stream_it{stream};
288 
289  // WRITE ID (optional)
290  if constexpr (!detail::decays_to_ignore_v<id_type>)
291  {
292  if (!std::ranges::empty(id))
293  {
294  stream_it = '>';
295  stream_it = ' ';
296  std::ranges::copy(id, stream_it);
297  detail::write_eol(stream_it, options.add_carriage_return);
298  }
299  }
300 
301  // WRITE SEQUENCE
302  if constexpr (!detail::decays_to_ignore_v<seq_type>)
303  {
304  if (std::ranges::empty(seq)) //[[unlikely]]
305  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
306 
307  std::ranges::copy(seq | views::to_char, stream_it);
308  detail::write_eol(stream_it, options.add_carriage_return);
309  }
310  else
311  {
312  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
313  "when writing Vienna files."};
314  }
315 
316  // WRITE STRUCTURE (optional)
317  if constexpr (!detail::decays_to_ignore_v<structure_type>)
318  {
319  if (!std::ranges::empty(structure))
320  std::ranges::copy(structure | views::to_char, stream_it);
321 
322  // WRITE ENERGY (optional)
323  if constexpr (!detail::decays_to_ignore_v<energy_type>)
324  {
325  if (energy)
326  {
327 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
328 // auto [endptr, ec] = std::to_chars(str.data(),
329 // str.data() + str.size(),
330 // energy,
331 // std::chars_format::fixed,
332 // options.precision);
333 // if (ec == std::errc())
334 // std::ranges::copy(str.data(), endptr, stream_it);
335 // else
336 // throw std::runtime_error{"The energy could not be transformed into a string."};
337 
338  stream_it = ' ';
339  stream_it = '(';
340 
341  std::array<char, 100> str;
342  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
343  if (len < 0 || len >= 100)
344  throw std::runtime_error{"The energy could not be transformed into a string."};
345  std::ranges::copy(str.data(), str.data() + len, stream_it);
346 
347  stream_it = ')';
348  }
349  }
350  detail::write_eol(stream_it, options.add_carriage_return);
351  }
352  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
353  {
354  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
355  }
356  }
357 
358 private:
365  template <typename alph_type, typename stream_view_type>
366  auto read_structure(stream_view_type & stream_view)
367  {
368  auto constexpr is_legal_structure = char_is_valid_for<alph_type>;
369  return stream_view | detail::take_until(is_space) // until whitespace
370  | std::views::transform([is_legal_structure](char const c)
371  {
372  if (!is_legal_structure(c))
373  {
374  throw parse_error{
375  std::string{"Encountered an unexpected letter: char_is_valid_for<"} +
376  detail::type_name_as_string<alph_type> +
377  "> evaluated to false on " + detail::make_printable(c)};
378  }
379  return c;
380  }) // enforce legal alphabet
381  | views::char_to<alph_type>; // convert to actual target alphabet
382  }
383 };
384 
385 } // namespace seqan3::detail
Adaptations of algorithms from the Ranges TS.
Core alphabet concept and free function/type trait wrappers.
Provides seqan3::views::char_to.
Provides seqan3::views::to_char.
Provides alphabet adaptations for standard char types.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:85
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:101
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:120
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Adaptations of concepts from the Ranges TS.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
[DEPRECATED] Provides seqan3::views::take.
Provides traits to inspect some information of a type, for example its name.
Provides character predicates for tokenisation.
Provides seqan3::views::to.
Provides the WUSS format for RNA structure.