SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <cstdio>
16 #include <iterator>
17 #include <stack>
18 #include <string>
19 #include <string_view>
20 #include <type_traits>
21 #include <vector>
22 
42 #include <seqan3/std/algorithm>
43 #include <seqan3/std/ranges>
44 
45 namespace seqan3
46 {
84 {
85 public:
89  format_vienna() noexcept = default;
90  format_vienna(format_vienna const &) noexcept = default;
91  format_vienna & operator=(format_vienna const &) noexcept = default;
92  format_vienna(format_vienna &&) noexcept = default;
93  format_vienna & operator=(format_vienna &&) noexcept = default;
94  ~format_vienna() noexcept = default;
95 
98  static inline std::vector<std::string> file_extensions
99  {
100  { "dbn" },
101  { "fasta" },
102  { "fa" }
103  };
104 
105 protected:
107  template <typename stream_type, // constraints checked by file
108  typename seq_legal_alph_type,
109  bool structured_seq_combined,
110  typename seq_type, // other constraints checked inside function
111  typename id_type,
112  typename bpp_type,
113  typename structure_type,
114  typename energy_type,
115  typename react_type,
116  typename comment_type,
117  typename offset_type>
118  void read_structure_record(stream_type & stream,
120  seq_type & seq,
121  id_type & id,
122  bpp_type & bpp,
123  structure_type & structure,
124  energy_type & energy,
125  react_type & SEQAN3_DOXYGEN_ONLY(react),
126  react_type & SEQAN3_DOXYGEN_ONLY(react_err),
127  comment_type & SEQAN3_DOXYGEN_ONLY(comment),
128  offset_type & SEQAN3_DOXYGEN_ONLY(offset))
129  {
130  auto stream_view = views::istreambuf(stream);
131 
132  // READ ID (if present)
133  auto constexpr is_id = is_char<'>'>;
134  if (is_id(*begin(stream_view)))
135  {
136  if constexpr (!detail::decays_to_ignore_v<id_type>)
137  {
138  if (options.truncate_ids)
139  {
140  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
141  | views::take_until_or_throw(is_cntrl || is_blank)
142  | views::char_to<std::ranges::range_value_t<id_type>>,
143  std::cpp20::back_inserter(id));
144  detail::consume(stream_view | views::take_line_or_throw);
145  }
146  else
147  {
148  std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
149  | views::take_line_or_throw
150  | views::char_to<std::ranges::range_value_t<id_type>>,
151  std::cpp20::back_inserter(id));
152  }
153  }
154  else
155  {
156  detail::consume(stream_view | views::take_line_or_throw);
157  }
158  }
159  else if constexpr (!detail::decays_to_ignore_v<id_type>)
160  {
161  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
162  if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
163  {
164  throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
165  is_id.msg + " and " + is_legal_seq.msg +
166  " evaluated to false on " + detail::make_printable(*begin(stream_view))};
167  }
168  }
169 
170  // READ SEQUENCE
171  if constexpr (!detail::decays_to_ignore_v<seq_type>)
172  {
173  auto constexpr is_legal_seq = is_in_alphabet<seq_legal_alph_type>;
174  std::ranges::copy(stream_view | views::take_line_or_throw // until end of line
175  | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
176  | std::views::transform([is_legal_seq](char const c)
177  {
178  if (!is_legal_seq(c)) // enforce legal alphabet
179  {
180  throw parse_error{std::string{"Encountered an unexpected letter: "} +
181  is_legal_seq.msg +
182  " evaluated to false on " +
183  detail::make_printable(c)};
184  }
185  return c;
186  })
187  | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
188  std::cpp20::back_inserter(seq));
189  }
190  else
191  {
192  detail::consume(stream_view | views::take_line_or_throw);
193  }
194 
195  // READ STRUCTURE (if present)
196  [[maybe_unused]] int64_t structure_length{};
197  if constexpr (!detail::decays_to_ignore_v<structure_type>)
198  {
199  if constexpr (structured_seq_combined)
200  {
201  assert(std::addressof(seq) == std::addressof(structure));
202  using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
203  // We need the structure_length parameter to count the length of the structure while reading
204  // because we cannot infer it from the (already resized) structure_seq object.
205  auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
206  structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
207 
208  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
209  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
210  }
211  else
212  {
213  using alph_type = std::ranges::range_value_t<structure_type>;
214  std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
215  structure_length = std::ranges::distance(structure);
216 
217  if constexpr (!detail::decays_to_ignore_v<bpp_type>)
218  detail::bpp_from_rna_structure<alph_type>(bpp, structure);
219  }
220  }
221  else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
222  {
223  detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
224  structure_length = std::ranges::distance(bpp);
225  }
226  else
227  {
228  detail::consume(stream_view | views::take_until(is_space)); // until whitespace
229  }
230 
231  if constexpr (!detail::decays_to_ignore_v<seq_type> &&
232  !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
233  {
234  if (std::ranges::distance(seq) != structure_length)
235  throw parse_error{"Found sequence and associated structure of different length."};
236  }
237 
238  // READ ENERGY (if present)
239  if constexpr (!detail::decays_to_ignore_v<energy_type>)
240  {
241  std::string e_str = stream_view | views::take_line
242  | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
243  | views::to<std::string>;
244  if (!e_str.empty())
245  {
246  size_t num_processed;
247  energy = std::stod(e_str, &num_processed);
248  if (num_processed != e_str.size()) // [[unlikely]]
249  {
250  throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
251  }
252  }
253  }
254  else
255  {
256  detail::consume(stream_view | views::take_line);
257  }
258  detail::consume(stream_view | views::take_until(!is_space));
259  }
260 
262  template <typename stream_type, // constraints checked by file
263  typename seq_type, // other constraints checked inside function
264  typename id_type,
265  typename bpp_type,
266  typename structure_type,
267  typename energy_type,
268  typename react_type,
269  typename comment_type,
270  typename offset_type>
271  void write_structure_record(stream_type & stream,
272  structure_file_output_options const & options,
273  seq_type && seq,
274  id_type && id,
275  bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
276  structure_type && structure,
277  energy_type && energy,
278  react_type && SEQAN3_DOXYGEN_ONLY(react),
279  react_type && SEQAN3_DOXYGEN_ONLY(react_err),
280  comment_type && SEQAN3_DOXYGEN_ONLY(comment),
281  offset_type && SEQAN3_DOXYGEN_ONLY(offset))
282  {
283  std::cpp20::ostreambuf_iterator stream_it{stream};
284 
285  // WRITE ID (optional)
286  if constexpr (!detail::decays_to_ignore_v<id_type>)
287  {
288  if (!std::ranges::empty(id))
289  {
290  stream_it = '>';
291  stream_it = ' ';
292  std::ranges::copy(id, stream_it);
293  detail::write_eol(stream_it, options.add_carriage_return);
294  }
295  }
296 
297  // WRITE SEQUENCE
298  if constexpr (!detail::decays_to_ignore_v<seq_type>)
299  {
300  if (std::ranges::empty(seq)) //[[unlikely]]
301  throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
302 
303  std::ranges::copy(seq | views::to_char, stream_it);
304  detail::write_eol(stream_it, options.add_carriage_return);
305  }
306  else
307  {
308  throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
309  "when writing Vienna files."};
310  }
311 
312  // WRITE STRUCTURE (optional)
313  if constexpr (!detail::decays_to_ignore_v<structure_type>)
314  {
315  if (!std::ranges::empty(structure))
316  std::ranges::copy(structure | views::to_char, stream_it);
317 
318  // WRITE ENERGY (optional)
319  if constexpr (!detail::decays_to_ignore_v<energy_type>)
320  {
321  if (energy)
322  {
323 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
324 // auto [endptr, ec] = std::to_chars(str.data(),
325 // str.data() + str.size(),
326 // energy,
327 // std::chars_format::fixed,
328 // options.precision);
329 // if (ec == std::errc())
330 // std::ranges::copy(str.data(), endptr, stream_it);
331 // else
332 // throw std::runtime_error{"The energy could not be transformed into a string."};
333 
334  stream_it = ' ';
335  stream_it = '(';
336 
337  std::array<char, 100> str;
338  int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
339  if (len < 0 || len >= 100)
340  throw std::runtime_error{"The energy could not be transformed into a string."};
341  std::ranges::copy(str.data(), str.data() + len, stream_it);
342 
343  stream_it = ')';
344  }
345  }
346  detail::write_eol(stream_it, options.add_carriage_return);
347  }
348  else if constexpr (!detail::decays_to_ignore_v<energy_type>)
349  {
350  throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
351  }
352  }
353 
354 private:
361  template <typename alph_type, typename stream_view_type>
362  auto read_structure(stream_view_type & stream_view)
363  {
364  auto constexpr is_legal_structure = is_in_alphabet<alph_type>;
365  return stream_view | views::take_until(is_space) // until whitespace
366  | std::views::transform([is_legal_structure](char const c)
367  {
368  if (!is_legal_structure(c))
369  {
370  throw parse_error{
371  std::string{"Encountered an unexpected letter: "} +
372  is_legal_structure.msg +
373  " evaluated to false on " + detail::make_printable(c)};
374  }
375  return c;
376  }) // enforce legal alphabet
377  | views::char_to<alph_type>; // convert to actual target alphabet
378  }
379 };
380 
381 } // namespace seqan3::detail
wuss.hpp
Provides the WUSS format for RNA structure.
take_until.hpp
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
detail.hpp
Helper functions (e.g. conversions) for the structure IO submodule.
vector
stack
input_format_concept.hpp
Provides seqan3::structure_file_input_format.
algorithm
Adaptations of algorithms from the Ranges TS.
seqan3::seq
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
input_options.hpp
Provides seqan3::structure_file_input_options.
seqan3::format_vienna::read_structure_record
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:118
output_options.hpp
Provides seqan3::structure_file_output_options.
to.hpp
Provides seqan3::views::to.
range.hpp
Provides various transformation traits used by the range module.
seqan3::format_vienna::format_vienna
format_vienna() noexcept=default
Defaulted.
seqan3
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
seqan3::structure_file_input_options
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:28
seqan3::format_vienna
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:84
seqan3::format_vienna::file_extensions
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:99
output_format_concept.hpp
Provides seqan3::structure_file_output_format and auxiliary classes.
char.hpp
Provides alphabet adaptations for standard char types.
to_char.hpp
Provides seqan3::views::to_char.
take.hpp
Provides seqan3::views::take.
ranges
Adaptations of concepts from the Ranges TS.
predicate.hpp
Provides character predicates for tokenisation.
std
SeqAn specific customisations in the standard namespace.
istreambuf.hpp
Provides seqan3::views::istreambuf.
take_line.hpp
Provides seqan3::views::take_line and seqan3::views::take_line_or_throw.
iterator.hpp
Provides seqan3::fast_istreambuf_iterator and seqan3::fast_ostreambuf_iterator, as well as,...
misc.hpp
Provides various utility functions.
misc.hpp
Provides various utility functions.
string_view
cstdio
char_to.hpp
Provides seqan3::views::char_to.
seqan3::views::istreambuf
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:113
string