SeqAn3 3.4.0-rc.4
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages Concepts
format_vienna.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2025 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2025 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
#include <algorithm>
#include <array>
#include <cstdio>
#include <iterator>
#include <ranges>
#include <stack>
#include <string>
#include <string_view>
#include <type_traits>
#include <vector>
21
41
42namespace seqan3
43{
83{
84public:
88 format_vienna() noexcept = default;
89 format_vienna(format_vienna const &) noexcept = default;
90 format_vienna & operator=(format_vienna const &) noexcept = default;
91 format_vienna(format_vienna &&) noexcept = default;
92 format_vienna & operator=(format_vienna &&) noexcept = default;
93 ~format_vienna() noexcept = default;
94
96
98 static inline std::vector<std::string> file_extensions{{"dbn"}, {"fasta"}, {"fa"}};
99
100protected:
102 template <typename stream_type, // constraints checked by file
103 typename seq_legal_alph_type,
104 bool structured_seq_combined,
105 typename seq_type, // other constraints checked inside function
106 typename id_type,
107 typename bpp_type,
108 typename structure_type,
109 typename energy_type,
110 typename react_type,
111 typename comment_type,
112 typename offset_type>
113 void
114 read_structure_record(stream_type & stream,
116 seq_type & seq,
117 id_type & id,
118 bpp_type & bpp,
119 structure_type & structure,
120 energy_type & energy,
121 react_type & SEQAN3_DOXYGEN_ONLY(react),
122 react_type & SEQAN3_DOXYGEN_ONLY(react_err),
123 comment_type & SEQAN3_DOXYGEN_ONLY(comment),
124 offset_type & SEQAN3_DOXYGEN_ONLY(offset))
125 {
126 auto stream_view = detail::istreambuf(stream);
127
128 // READ ID (if present)
129 constexpr auto is_id = is_char<'>'>;
130 if (is_id(*begin(stream_view)))
131 {
132 if constexpr (!detail::decays_to_ignore_v<id_type>)
133 {
134 if (options.truncate_ids)
135 {
136 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
137 | detail::take_until_or_throw(is_cntrl || is_blank)
138 | views::char_to<std::ranges::range_value_t<id_type>>,
140 detail::consume(stream_view | detail::take_line_or_throw);
141 }
142 else
143 {
144 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
145 | detail::take_line_or_throw
146 | views::char_to<std::ranges::range_value_t<id_type>>,
148 }
149 }
150 else
151 {
152 detail::consume(stream_view | detail::take_line_or_throw);
153 }
154 }
155 else if constexpr (!detail::decays_to_ignore_v<id_type>)
156 {
157 constexpr auto is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
158 if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
159 {
160 throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} + is_id.msg
161 + " and char_is_valid_for<" + detail::type_name_as_string<seq_legal_alph_type>
162 + ">" + " evaluated to false on " + detail::make_printable(*begin(stream_view))};
163 }
164 }
165
166 // READ SEQUENCE
167 if constexpr (!detail::decays_to_ignore_v<seq_type>)
168 {
169 constexpr auto is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
171 stream_view | detail::take_line_or_throw // until end of line
172 | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
173 | std::views::transform(
174 [is_legal_seq](char const c)
175 {
176 if (!is_legal_seq(c)) // enforce legal alphabet
177 {
178 throw parse_error{std::string{"Encountered an unexpected letter: "}
179 + "char_is_valid_for<"
180 + detail::type_name_as_string<seq_legal_alph_type>
181 + "> evaluated to false on " + detail::make_printable(c)};
182 }
183 return c;
184 })
185 | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
187 }
188 else
189 {
190 detail::consume(stream_view | detail::take_line_or_throw);
191 }
192
193 // READ STRUCTURE (if present)
194 [[maybe_unused]] int64_t structure_length{};
195 if constexpr (!detail::decays_to_ignore_v<structure_type>)
196 {
197 if constexpr (structured_seq_combined)
198 {
200 using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
201 // We need the structure_length parameter to count the length of the structure while reading
202 // because we cannot infer it from the (already resized) structure_seq object.
203 auto range = read_structure<alph_type>(stream_view);
204 // Use std::views::take to avoid going out of bounds if the structure is longer than the sequence.
205 auto res = std::ranges::copy(range | std::views::take(std::ranges::distance(seq)),
207 structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
208 // If the structure is longer than the sequence, there are characters left.
209 // std::ranges::distance will also consume the characters in the stream.
210 structure_length += std::ranges::distance(range);
211
212 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
213 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
214 }
215 else
216 {
217 using alph_type = std::ranges::range_value_t<structure_type>;
218 std::ranges::copy(read_structure<alph_type>(stream_view), std::back_inserter(structure));
219 structure_length = std::ranges::distance(structure);
220
221 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
222 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
223 }
224 }
225 else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
226 {
227 detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
228 structure_length = std::ranges::distance(bpp);
229 }
230 else
231 {
232 detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
233 }
234
235 if constexpr (!detail::decays_to_ignore_v<seq_type>
236 && !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
237 {
238 if (std::ranges::distance(seq) != structure_length)
239 throw parse_error{"Found sequence and associated structure of different length."};
240 }
241
242 // READ ENERGY (if present)
243 if constexpr (!detail::decays_to_ignore_v<energy_type>)
244 {
245 std::string e_str = stream_view | detail::take_line
246 | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
248 if (!e_str.empty())
249 {
250 size_t num_processed;
251 energy = std::stod(e_str, &num_processed);
252 if (num_processed != e_str.size()) // [[unlikely]]
253 {
254 throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
255 }
256 }
257 }
258 else
259 {
260 detail::consume(stream_view | detail::take_line);
261 }
262 detail::consume(stream_view | detail::take_until(!is_space));
263 }
264
266 template <typename stream_type, // constraints checked by file
267 typename seq_type, // other constraints checked inside function
268 typename id_type,
269 typename bpp_type,
270 typename structure_type,
271 typename energy_type,
272 typename react_type,
273 typename comment_type,
274 typename offset_type>
275 void write_structure_record(stream_type & stream,
276 structure_file_output_options const & options,
277 seq_type && seq,
278 id_type && id,
279 bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
280 structure_type && structure,
281 energy_type && energy,
282 react_type && SEQAN3_DOXYGEN_ONLY(react),
283 react_type && SEQAN3_DOXYGEN_ONLY(react_err),
284 comment_type && SEQAN3_DOXYGEN_ONLY(comment),
285 offset_type && SEQAN3_DOXYGEN_ONLY(offset))
286 {
287 std::ostreambuf_iterator stream_it{stream};
288
289 // WRITE ID (optional)
290 if constexpr (!detail::decays_to_ignore_v<id_type>)
291 {
292 if (!std::ranges::empty(id))
293 {
294 stream_it = '>';
295 stream_it = ' ';
296 std::ranges::copy(id, stream_it);
297 detail::write_eol(stream_it, options.add_carriage_return);
298 }
299 }
300
301 // WRITE SEQUENCE
302 if constexpr (!detail::decays_to_ignore_v<seq_type>)
303 {
304 if (std::ranges::empty(seq)) //[[unlikely]]
305 throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
306
308 detail::write_eol(stream_it, options.add_carriage_return);
309 }
310 else
311 {
312 throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
313 "when writing Vienna files."};
314 }
315
316 // WRITE STRUCTURE (optional)
317 if constexpr (!detail::decays_to_ignore_v<structure_type>)
318 {
319 if (!std::ranges::empty(structure))
321
322 // WRITE ENERGY (optional)
323 if constexpr (!detail::decays_to_ignore_v<energy_type>)
324 {
325 if (energy)
326 {
327 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
328 // auto [endptr, ec] = std::to_chars(str.data(),
329 // str.data() + str.size(),
330 // energy,
331 // std::chars_format::fixed,
332 // options.precision);
333 // if (ec == std::errc())
334 // std::ranges::copy(str.data(), endptr, stream_it);
335 // else
336 // throw std::runtime_error{"The energy could not be transformed into a string."};
337
338 stream_it = ' ';
339 stream_it = '(';
340
342 int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
343 if (len < 0 || len >= 100)
344 throw std::runtime_error{"The energy could not be transformed into a string."};
345 std::ranges::copy(str.data(), str.data() + len, stream_it);
346
347 stream_it = ')';
348 }
349 }
350 detail::write_eol(stream_it, options.add_carriage_return);
351 }
352 else if constexpr (!detail::decays_to_ignore_v<energy_type>)
353 {
354 throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
355 }
356 }
357
358private:
365 template <typename alph_type, typename stream_view_type>
366 auto read_structure(stream_view_type & stream_view)
367 {
368 constexpr auto is_legal_structure = char_is_valid_for<alph_type>;
369 return stream_view | detail::take_until(is_space) // until whitespace
370 | std::views::transform(
371 [is_legal_structure](char const c)
372 {
373 if (!is_legal_structure(c))
374 {
375 throw parse_error{std::string{"Encountered an unexpected letter: char_is_valid_for<"}
376 + detail::type_name_as_string<alph_type>
377 + "> evaluated to false on " + detail::make_printable(c)};
378 }
379 return c;
380 }) // enforce legal alphabet
381 | views::char_to<alph_type>; // convert to actual target alphabet
382 }
383};
384
385} // namespace seqan3
T addressof(T... args)
Core alphabet concept and free function/type trait wrappers.
T back_inserter(T... args)
T begin(T... args)
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition format_vienna.hpp:83
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition format_vienna.hpp:98
void write_structure_record(stream_type &stream, structure_file_output_options const &options, seq_type &&seq, id_type &&id, bpp_type &&bpp, structure_type &&structure, energy_type &&energy, react_type &&react, react_type &&react_err, comment_type &&comment, offset_type &&offset)
Write the given fields to the specified stream.
Definition format_vienna.hpp:275
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition format_vienna.hpp:114
T copy(T... args)
Provides various utility functions.
Provides various transformation traits used by the range module.
T data(T... args)
T empty(T... args)
T snprintf(T... args)
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition to_char.hpp:60
auto const char_to
A view over an alphabet, given a range of characters.
Definition char_to.hpp:64
@ energy
Energy of a folded sequence, represented by one float number.
@ comment
Comment field of arbitrary content, usually a string.
@ structure
Fixed interactions, usually a string of structure alphabet characters.
@ bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
@ react
Reactivity values of the sequence characters given in a vector of float numbers.
@ react_err
Reactivity error values given in a vector corresponding to seqan3::field::react.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
constexpr auto is_blank
Checks whether c is a blank character.
Definition predicate.hpp:139
constexpr auto is_digit
Checks whether c is a digital character.
Definition predicate.hpp:259
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition predicate.hpp:60
constexpr auto is_space
Checks whether c is a space character.
Definition predicate.hpp:122
constexpr auto is_cntrl
Checks whether c is a control character.
Definition predicate.hpp:87
seqan::stl::ranges::to to
Converts a range to a container. Note: this entity is not part of the SeqAn API.
Definition to.hpp:23
Provides various utility functions.
Helper functions (e.g. conversions) for the structure IO submodule.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
T size(T... args)
T stod(T... args)
Thrown if there is a parse error, such as reading an unexpected character from an input stream.
Definition io/exception.hpp:45
The options type defines various option members that influence the behaviour of all or some formats.
Definition structure_file/input_options.hpp:27
bool truncate_ids
Read the ID string only up until the first whitespace character.
Definition structure_file/input_options.hpp:29
The options type defines various option members that influence the behaviour of all or some formats.
Definition structure_file/output_options.hpp:23
bool add_carriage_return
The default plain text line-ending is "\n", but on Windows an additional carriage return is recommended.
Definition structure_file/output_options.hpp:27
int precision
The precision for writing floating point types.
Definition structure_file/output_options.hpp:30
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::ranges::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides the WUSS format for RNA structure.
Hide me