SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/algorithm>
16#include <cstdio>
17#include <iterator>
18#include <seqan3/std/ranges>
19#include <stack>
20#include <string>
21#include <string_view>
22#include <type_traits>
23#include <vector>
24
44
45namespace seqan3
46{
86{
87public:
91 format_vienna() noexcept = default;
92 format_vienna(format_vienna const &) noexcept = default;
93 format_vienna & operator=(format_vienna const &) noexcept = default;
94 format_vienna(format_vienna &&) noexcept = default;
95 format_vienna & operator=(format_vienna &&) noexcept = default;
96 ~format_vienna() noexcept = default;
97
99
101 static inline std::vector<std::string> file_extensions
102 {
103 { "dbn" },
104 { "fasta" },
105 { "fa" }
106 };
107
108protected:
110 template <typename stream_type, // constraints checked by file
111 typename seq_legal_alph_type,
112 bool structured_seq_combined,
113 typename seq_type, // other constraints checked inside function
114 typename id_type,
115 typename bpp_type,
116 typename structure_type,
117 typename energy_type,
118 typename react_type,
119 typename comment_type,
120 typename offset_type>
121 void read_structure_record(stream_type & stream,
123 seq_type & seq,
124 id_type & id,
125 bpp_type & bpp,
126 structure_type & structure,
127 energy_type & energy,
128 react_type & SEQAN3_DOXYGEN_ONLY(react),
129 react_type & SEQAN3_DOXYGEN_ONLY(react_err),
130 comment_type & SEQAN3_DOXYGEN_ONLY(comment),
131 offset_type & SEQAN3_DOXYGEN_ONLY(offset))
132 {
133 auto stream_view = detail::istreambuf(stream);
134
135 // READ ID (if present)
136 auto constexpr is_id = is_char<'>'>;
137 if (is_id(*begin(stream_view)))
138 {
139 if constexpr (!detail::decays_to_ignore_v<id_type>)
140 {
141 if (options.truncate_ids)
142 {
143 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
144 | detail::take_until_or_throw(is_cntrl || is_blank)
145 | views::char_to<std::ranges::range_value_t<id_type>>,
146 std::cpp20::back_inserter(id));
147 detail::consume(stream_view | detail::take_line_or_throw);
148 }
149 else
150 {
151 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
152 | detail::take_line_or_throw
153 | views::char_to<std::ranges::range_value_t<id_type>>,
154 std::cpp20::back_inserter(id));
155 }
156 }
157 else
158 {
159 detail::consume(stream_view | detail::take_line_or_throw);
160 }
161 }
162 else if constexpr (!detail::decays_to_ignore_v<id_type>)
163 {
164 auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
165 if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
166 {
167 throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} +
168 is_id.msg + " and char_is_valid_for<" +
169 detail::type_name_as_string<seq_legal_alph_type> + ">" +
170 " evaluated to false on " + detail::make_printable(*begin(stream_view))};
171 }
172 }
173
174 // READ SEQUENCE
175 if constexpr (!detail::decays_to_ignore_v<seq_type>)
176 {
177 auto constexpr is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
178 std::ranges::copy(stream_view | detail::take_line_or_throw // until end of line
179 | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
180 | std::views::transform([is_legal_seq](char const c)
181 {
182 if (!is_legal_seq(c)) // enforce legal alphabet
183 {
184 throw parse_error{std::string{"Encountered an unexpected letter: "} +
185 "char_is_valid_for<" +
186 detail::type_name_as_string<seq_legal_alph_type> +
187 "> evaluated to false on " +
188 detail::make_printable(c)};
189 }
190 return c;
191 })
192 | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
193 std::cpp20::back_inserter(seq));
194 }
195 else
196 {
197 detail::consume(stream_view | detail::take_line_or_throw);
198 }
199
200 // READ STRUCTURE (if present)
201 [[maybe_unused]] int64_t structure_length{};
202 if constexpr (!detail::decays_to_ignore_v<structure_type>)
203 {
204 if constexpr (structured_seq_combined)
205 {
206 assert(std::addressof(seq) == std::addressof(structure));
207 using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
208 // We need the structure_length parameter to count the length of the structure while reading
209 // because we cannot infer it from the (already resized) structure_seq object.
210 auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
211 structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
212
213 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
214 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
215 }
216 else
217 {
218 using alph_type = std::ranges::range_value_t<structure_type>;
219 std::ranges::copy(read_structure<alph_type>(stream_view), std::cpp20::back_inserter(structure));
220 structure_length = std::ranges::distance(structure);
221
222 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
223 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
224 }
225 }
226 else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
227 {
228 detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
229 structure_length = std::ranges::distance(bpp);
230 }
231 else
232 {
233 detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
234 }
235
236 if constexpr (!detail::decays_to_ignore_v<seq_type> &&
237 !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
238 {
239 if (std::ranges::distance(seq) != structure_length)
240 throw parse_error{"Found sequence and associated structure of different length."};
241 }
242
243 // READ ENERGY (if present)
244 if constexpr (!detail::decays_to_ignore_v<energy_type>)
245 {
246 std::string e_str = stream_view | detail::take_line
247 | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
248 | views::to<std::string>;
249 if (!e_str.empty())
250 {
251 size_t num_processed;
252 energy = std::stod(e_str, &num_processed);
253 if (num_processed != e_str.size()) // [[unlikely]]
254 {
255 throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
256 }
257 }
258 }
259 else
260 {
261 detail::consume(stream_view | detail::take_line);
262 }
263 detail::consume(stream_view | detail::take_until(!is_space));
264 }
265
267 template <typename stream_type, // constraints checked by file
268 typename seq_type, // other constraints checked inside function
269 typename id_type,
270 typename bpp_type,
271 typename structure_type,
272 typename energy_type,
273 typename react_type,
274 typename comment_type,
275 typename offset_type>
276 void write_structure_record(stream_type & stream,
277 structure_file_output_options const & options,
278 seq_type && seq,
279 id_type && id,
280 bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
281 structure_type && structure,
282 energy_type && energy,
283 react_type && SEQAN3_DOXYGEN_ONLY(react),
284 react_type && SEQAN3_DOXYGEN_ONLY(react_err),
285 comment_type && SEQAN3_DOXYGEN_ONLY(comment),
286 offset_type && SEQAN3_DOXYGEN_ONLY(offset))
287 {
288 std::cpp20::ostreambuf_iterator stream_it{stream};
289
290 // WRITE ID (optional)
291 if constexpr (!detail::decays_to_ignore_v<id_type>)
292 {
293 if (!std::ranges::empty(id))
294 {
295 stream_it = '>';
296 stream_it = ' ';
297 std::ranges::copy(id, stream_it);
298 detail::write_eol(stream_it, options.add_carriage_return);
299 }
300 }
301
302 // WRITE SEQUENCE
303 if constexpr (!detail::decays_to_ignore_v<seq_type>)
304 {
305 if (std::ranges::empty(seq)) //[[unlikely]]
306 throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
307
308 std::ranges::copy(seq | views::to_char, stream_it);
309 detail::write_eol(stream_it, options.add_carriage_return);
310 }
311 else
312 {
313 throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
314 "when writing Vienna files."};
315 }
316
317 // WRITE STRUCTURE (optional)
318 if constexpr (!detail::decays_to_ignore_v<structure_type>)
319 {
320 if (!std::ranges::empty(structure))
321 std::ranges::copy(structure | views::to_char, stream_it);
322
323 // WRITE ENERGY (optional)
324 if constexpr (!detail::decays_to_ignore_v<energy_type>)
325 {
326 if (energy)
327 {
328// TODO(joergi-w) enable the following when std::to_chars is implemented for float types
329// auto [endptr, ec] = std::to_chars(str.data(),
330// str.data() + str.size(),
331// energy,
332// std::chars_format::fixed,
333// options.precision);
334// if (ec == std::errc())
335// std::ranges::copy(str.data(), endptr, stream_it);
336// else
337// throw std::runtime_error{"The energy could not be transformed into a string."};
338
339 stream_it = ' ';
340 stream_it = '(';
341
342 std::array<char, 100> str;
343 int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
344 if (len < 0 || len >= 100)
345 throw std::runtime_error{"The energy could not be transformed into a string."};
346 std::ranges::copy(str.data(), str.data() + len, stream_it);
347
348 stream_it = ')';
349 }
350 }
351 detail::write_eol(stream_it, options.add_carriage_return);
352 }
353 else if constexpr (!detail::decays_to_ignore_v<energy_type>)
354 {
355 throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
356 }
357 }
358
359private:
366 template <typename alph_type, typename stream_view_type>
367 auto read_structure(stream_view_type & stream_view)
368 {
369 auto constexpr is_legal_structure = char_is_valid_for<alph_type>;
370 return stream_view | detail::take_until(is_space) // until whitespace
371 | std::views::transform([is_legal_structure](char const c)
372 {
373 if (!is_legal_structure(c))
374 {
375 throw parse_error{
376 std::string{"Encountered an unexpected letter: char_is_valid_for<"} +
377 detail::type_name_as_string<alph_type> +
378 "> evaluated to false on " + detail::make_printable(c)};
379 }
380 return c;
381 }) // enforce legal alphabet
382 | views::char_to<alph_type>; // convert to actual target alphabet
383 }
384};
385
386} // namespace seqan3::detail
The <algorithm> header from C++20's standard library.
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:86
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:102
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:121
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: cigar_operation_table.hpp:2
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
The <ranges> header from C++20's standard library.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:30
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides the WUSS format for RNA structure.