SeqAn3 3.2.0
The Modern C++ library for sequence analysis.
format_vienna.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <cstdio>
17#include <iterator>
18#include <ranges>
19#include <stack>
20#include <string>
21#include <string_view>
22#include <type_traits>
23#include <vector>
24
44
45namespace seqan3
46{
86{
87public:
91 format_vienna() noexcept = default;
92 format_vienna(format_vienna const &) noexcept = default;
93 format_vienna & operator=(format_vienna const &) noexcept = default;
94 format_vienna(format_vienna &&) noexcept = default;
95 format_vienna & operator=(format_vienna &&) noexcept = default;
96 ~format_vienna() noexcept = default;
97
99
101 static inline std::vector<std::string> file_extensions{{"dbn"}, {"fasta"}, {"fa"}};
102
103protected:
105 template <typename stream_type, // constraints checked by file
106 typename seq_legal_alph_type,
107 bool structured_seq_combined,
108 typename seq_type, // other constraints checked inside function
109 typename id_type,
110 typename bpp_type,
111 typename structure_type,
112 typename energy_type,
113 typename react_type,
114 typename comment_type,
115 typename offset_type>
116 void
117 read_structure_record(stream_type & stream,
119 seq_type & seq,
120 id_type & id,
121 bpp_type & bpp,
122 structure_type & structure,
123 energy_type & energy,
124 react_type & SEQAN3_DOXYGEN_ONLY(react),
125 react_type & SEQAN3_DOXYGEN_ONLY(react_err),
126 comment_type & SEQAN3_DOXYGEN_ONLY(comment),
127 offset_type & SEQAN3_DOXYGEN_ONLY(offset))
128 {
129 auto stream_view = detail::istreambuf(stream);
130
131 // READ ID (if present)
132 constexpr auto is_id = is_char<'>'>;
133 if (is_id(*begin(stream_view)))
134 {
135 if constexpr (!detail::decays_to_ignore_v<id_type>)
136 {
137 if (options.truncate_ids)
138 {
139 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
140 | detail::take_until_or_throw(is_cntrl || is_blank)
141 | views::char_to<std::ranges::range_value_t<id_type>>,
142 std::back_inserter(id));
143 detail::consume(stream_view | detail::take_line_or_throw);
144 }
145 else
146 {
147 std::ranges::copy(stream_view | std::views::drop_while(is_id || is_blank) // skip leading >
148 | detail::take_line_or_throw
149 | views::char_to<std::ranges::range_value_t<id_type>>,
150 std::back_inserter(id));
151 }
152 }
153 else
154 {
155 detail::consume(stream_view | detail::take_line_or_throw);
156 }
157 }
158 else if constexpr (!detail::decays_to_ignore_v<id_type>)
159 {
160 constexpr auto is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
161 if (!is_legal_seq(*begin(stream_view))) // if neither id nor seq found: throw
162 {
163 throw parse_error{std::string{"Expected to be on beginning of ID or sequence, but "} + is_id.msg
164 + " and char_is_valid_for<" + detail::type_name_as_string<seq_legal_alph_type>
165 + ">" + " evaluated to false on " + detail::make_printable(*begin(stream_view))};
166 }
167 }
168
169 // READ SEQUENCE
170 if constexpr (!detail::decays_to_ignore_v<seq_type>)
171 {
172 constexpr auto is_legal_seq = char_is_valid_for<seq_legal_alph_type>;
173 std::ranges::copy(
174 stream_view | detail::take_line_or_throw // until end of line
175 | std::views::filter(!(is_space || is_digit)) // ignore whitespace and numbers
176 | std::views::transform(
177 [is_legal_seq](char const c)
178 {
179 if (!is_legal_seq(c)) // enforce legal alphabet
180 {
181 throw parse_error{std::string{"Encountered an unexpected letter: "}
182 + "char_is_valid_for<"
183 + detail::type_name_as_string<seq_legal_alph_type>
184 + "> evaluated to false on " + detail::make_printable(c)};
185 }
186 return c;
187 })
188 | views::char_to<std::ranges::range_value_t<seq_type>>, // convert to actual target alphabet
189 std::back_inserter(seq));
190 }
191 else
192 {
193 detail::consume(stream_view | detail::take_line_or_throw);
194 }
195
196 // READ STRUCTURE (if present)
197 [[maybe_unused]] int64_t structure_length{};
198 if constexpr (!detail::decays_to_ignore_v<structure_type>)
199 {
200 if constexpr (structured_seq_combined)
201 {
202 assert(std::addressof(seq) == std::addressof(structure));
203 using alph_type = typename std::ranges::range_value_t<structure_type>::structure_alphabet_type;
204 // We need the structure_length parameter to count the length of the structure while reading
205 // because we cannot infer it from the (already resized) structure_seq object.
206 auto res = std::ranges::copy(read_structure<alph_type>(stream_view), std::ranges::begin(structure));
207 structure_length = std::ranges::distance(std::ranges::begin(structure), res.out);
208
209 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
210 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
211 }
212 else
213 {
214 using alph_type = std::ranges::range_value_t<structure_type>;
215 std::ranges::copy(read_structure<alph_type>(stream_view), std::back_inserter(structure));
216 structure_length = std::ranges::distance(structure);
217
218 if constexpr (!detail::decays_to_ignore_v<bpp_type>)
219 detail::bpp_from_rna_structure<alph_type>(bpp, structure);
220 }
221 }
222 else if constexpr (!detail::decays_to_ignore_v<bpp_type>)
223 {
224 detail::bpp_from_rna_structure<wuss51>(bpp, read_structure<wuss51>(stream_view));
225 structure_length = std::ranges::distance(bpp);
226 }
227 else
228 {
229 detail::consume(stream_view | detail::take_until(is_space)); // until whitespace
230 }
231
232 if constexpr (!detail::decays_to_ignore_v<seq_type>
233 && !(detail::decays_to_ignore_v<structure_type> && detail::decays_to_ignore_v<bpp_type>))
234 {
235 if (std::ranges::distance(seq) != structure_length)
236 throw parse_error{"Found sequence and associated structure of different length."};
237 }
238
239 // READ ENERGY (if present)
240 if constexpr (!detail::decays_to_ignore_v<energy_type>)
241 {
242 std::string e_str = stream_view | detail::take_line
243 | std::views::filter(!(is_space || is_char<'('> || is_char<')'>))
244 | ranges::to<std::string>();
245 if (!e_str.empty())
246 {
247 size_t num_processed;
248 energy = std::stod(e_str, &num_processed);
249 if (num_processed != e_str.size()) // [[unlikely]]
250 {
251 throw parse_error{std::string{"Failed to parse energy value '"} + e_str + "'."};
252 }
253 }
254 }
255 else
256 {
257 detail::consume(stream_view | detail::take_line);
258 }
259 detail::consume(stream_view | detail::take_until(!is_space));
260 }
261
263 template <typename stream_type, // constraints checked by file
264 typename seq_type, // other constraints checked inside function
265 typename id_type,
266 typename bpp_type,
267 typename structure_type,
268 typename energy_type,
269 typename react_type,
270 typename comment_type,
271 typename offset_type>
272 void write_structure_record(stream_type & stream,
273 structure_file_output_options const & options,
274 seq_type && seq,
275 id_type && id,
276 bpp_type && SEQAN3_DOXYGEN_ONLY(bpp),
277 structure_type && structure,
278 energy_type && energy,
279 react_type && SEQAN3_DOXYGEN_ONLY(react),
280 react_type && SEQAN3_DOXYGEN_ONLY(react_err),
281 comment_type && SEQAN3_DOXYGEN_ONLY(comment),
282 offset_type && SEQAN3_DOXYGEN_ONLY(offset))
283 {
284 std::ostreambuf_iterator stream_it{stream};
285
286 // WRITE ID (optional)
287 if constexpr (!detail::decays_to_ignore_v<id_type>)
288 {
289 if (!std::ranges::empty(id))
290 {
291 stream_it = '>';
292 stream_it = ' ';
293 std::ranges::copy(id, stream_it);
294 detail::write_eol(stream_it, options.add_carriage_return);
295 }
296 }
297
298 // WRITE SEQUENCE
299 if constexpr (!detail::decays_to_ignore_v<seq_type>)
300 {
301 if (std::ranges::empty(seq)) //[[unlikely]]
302 throw std::runtime_error{"The SEQ field may not be empty when writing Vienna files."};
303
304 std::ranges::copy(seq | views::to_char, stream_it);
305 detail::write_eol(stream_it, options.add_carriage_return);
306 }
307 else
308 {
309 throw std::logic_error{"The SEQ and STRUCTURED_SEQ fields may not both be set to ignore "
310 "when writing Vienna files."};
311 }
312
313 // WRITE STRUCTURE (optional)
314 if constexpr (!detail::decays_to_ignore_v<structure_type>)
315 {
316 if (!std::ranges::empty(structure))
317 std::ranges::copy(structure | views::to_char, stream_it);
318
319 // WRITE ENERGY (optional)
320 if constexpr (!detail::decays_to_ignore_v<energy_type>)
321 {
322 if (energy)
323 {
324 // TODO(joergi-w) enable the following when std::to_chars is implemented for float types
325 // auto [endptr, ec] = std::to_chars(str.data(),
326 // str.data() + str.size(),
327 // energy,
328 // std::chars_format::fixed,
329 // options.precision);
330 // if (ec == std::errc())
331 // std::ranges::copy(str.data(), endptr, stream_it);
332 // else
333 // throw std::runtime_error{"The energy could not be transformed into a string."};
334
335 stream_it = ' ';
336 stream_it = '(';
337
338 std::array<char, 100> str;
339 int len = std::snprintf(str.data(), 100, "%.*f", options.precision, energy);
340 if (len < 0 || len >= 100)
341 throw std::runtime_error{"The energy could not be transformed into a string."};
342 std::ranges::copy(str.data(), str.data() + len, stream_it);
343
344 stream_it = ')';
345 }
346 }
347 detail::write_eol(stream_it, options.add_carriage_return);
348 }
349 else if constexpr (!detail::decays_to_ignore_v<energy_type>)
350 {
351 throw std::logic_error{"The ENERGY field cannot be written to a Vienna file without providing STRUCTURE."};
352 }
353 }
354
355private:
362 template <typename alph_type, typename stream_view_type>
363 auto read_structure(stream_view_type & stream_view)
364 {
365 constexpr auto is_legal_structure = char_is_valid_for<alph_type>;
366 return stream_view | detail::take_until(is_space) // until whitespace
367 | std::views::transform(
368 [is_legal_structure](char const c)
369 {
370 if (!is_legal_structure(c))
371 {
372 throw parse_error{std::string{"Encountered an unexpected letter: char_is_valid_for<"}
373 + detail::type_name_as_string<alph_type>
374 + "> evaluated to false on " + detail::make_printable(c)};
375 }
376 return c;
377 }) // enforce legal alphabet
378 | views::char_to<alph_type>; // convert to actual target alphabet
379 }
380};
381
382} // namespace seqan3
Core alphabet concept and free function/type trait wrappers.
Provides alphabet adaptations for standard char types.
Provides seqan3::views::char_to.
The Vienna format (dot bracket notation) for RNA sequences with secondary structure.
Definition: format_vienna.hpp:86
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_vienna.hpp:101
format_vienna() noexcept=default
Defaulted.
void read_structure_record(stream_type &stream, structure_file_input_options< seq_legal_alph_type, structured_seq_combined > const &options, seq_type &seq, id_type &id, bpp_type &bpp, structure_type &structure, energy_type &energy, react_type &react, react_type &react_err, comment_type &comment, offset_type &offset)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_vienna.hpp:117
Provides various utility functions.
Provides various transformation traits used by the range module.
Provides various utility functions.
Provides seqan3::detail::istreambuf.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
SeqAn specific customisations in the standard namespace.
Provides character predicates for tokenisation.
Provides seqan3::ranges::to.
The options type defines various option members that influence the behaviour of all or some formats.
Definition: input_options.hpp:30
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_file_output_format and auxiliary classes.
Provides seqan3::structure_file_output_options.
Provides seqan3::detail::take_line and seqan3::detail::take_line_or_throw.
Provides seqan3::views::take_until and seqan3::views::take_until_or_throw.
Provides seqan3::views::to_char.
Provides traits to inspect some information of a type, for example its name.
Provides the WUSS format for RNA structure.