SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_embl.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
14 #pragma once
15 
16 #include <iterator>
17 #include <string>
18 #include <string_view>
19 #include <vector>
20 
21 #include <range/v3/view/chunk.hpp>
22 #include <range/v3/view/repeat_n.hpp>
23 
40 #include <seqan3/std/algorithm>
41 #include <seqan3/std/charconv>
42 #include <seqan3/std/ranges>
43 
namespace seqan3
{

/*!\brief The EMBL format (tag).
 *
 * This empty tag type selects the EMBL specialisations of `detail::sequence_file_input_format` and
 * `detail::sequence_file_output_format`.
 */
struct format_embl
{
    //!\brief The valid file extensions for this format; note that you can modify this value.
    static inline std::vector<std::string> file_extensions
    {
        { "embl" },
    };
};

} // namespace seqan3
85 
86 namespace seqan3::detail
87 {
88 
91 template <>
92 class sequence_file_input_format<format_embl>
93 {
94 public:
96  using format_tag = format_embl;
97 
101  sequence_file_input_format() noexcept = default;
102  sequence_file_input_format(sequence_file_input_format const &) = delete;
105  sequence_file_input_format & operator=(sequence_file_input_format const &) = delete;
106  sequence_file_input_format(sequence_file_input_format &&) noexcept = default;
107  sequence_file_input_format & operator=(sequence_file_input_format &&) noexcept = default;
108  ~sequence_file_input_format() noexcept = default;
109 
112  template <typename stream_type, // constraints checked by file
113  typename seq_legal_alph_type, bool seq_qual_combined,
114  typename seq_type, // other constraints checked inside function
115  typename id_type,
116  typename qual_type>
117  void read(stream_type & stream,
118  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options,
119  seq_type & sequence,
120  id_type & id,
121  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
122  {
123  auto stream_view = view::istreambuf(stream);
124  auto stream_it = std::ranges::begin(stream_view);
125 
126  std::string idbuffer;
128  std::back_inserter(idbuffer));
129  if (idbuffer != "ID")
130  throw parse_error{"An entry has to start with the code word ID."};
131 
132  if constexpr (!detail::decays_to_ignore_v<id_type>)
133  {
134  if (options.embl_genbank_complete_header)
135  {
136  std::ranges::copy(idbuffer | view::char_to<value_type_t<id_type>>, std::back_inserter(id));
137  do
138  {
139  std::ranges::copy(stream_view | view::take_until_or_throw(is_char<'S'>)
140  | view::char_to<value_type_t<id_type>>,
141  std::back_inserter(id));
142  id.push_back(*stream_it);
143  ++stream_it;
144  } while (*stream_it != 'Q');
145  id.pop_back(); // remove 'S' from id
146  idbuffer = "SQ";
147  }
148  else
149  {
150  // ID
151  detail::consume(stream_view | view::take_until(!is_blank));
152 
153  // read id
154  if (options.truncate_ids)
155  {
156  std::ranges::copy(stream_view | view::take_until_or_throw(is_blank || is_char<';'> || is_cntrl)
157  | view::char_to<value_type_t<id_type>>,
158  std::back_inserter(id));
159  }
160  else
161  {
162  std::ranges::copy(stream_view | view::take_until_or_throw(is_char<';'>)
163  | view::char_to<value_type_t<id_type>>,
164  std::back_inserter(id));
165  }
166  }
167  }
168 
169  // Jump to sequence
170  if (idbuffer !="SQ")
171  {
172  do
173  {
174  detail::consume(stream_view | view::take_until_or_throw(is_char<'S'>));
175  ++stream_it;
176  } while (*stream_it != 'Q');
177  }
178  detail::consume(stream_view | view::take_line_or_throw); //Consume line with infos to sequence
179 
180  // Sequence
181  auto constexpr is_end = is_char<'/'> ;
182  if constexpr (!detail::decays_to_ignore_v<seq_type>)
183  {
184  auto seq_view = stream_view | std::view::filter(!(is_space || is_digit)) // ignore whitespace and numbers
185  | view::take_until_or_throw(is_end); // until //
186 
187  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
188  std::ranges::copy(seq_view | std::view::transform([is_legal_alph] (char const c) // enforce legal alphabet
189  {
190  if (!is_legal_alph(c))
191  {
192  throw parse_error{std::string{"Encountered an unexpected letter: "} +
193  is_legal_alph.msg.str() +
194  " evaluated to false on " +
195  detail::make_printable(c)};
196  }
197  return c;
198  })
199  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
200  std::back_inserter(sequence));
201  }
202  else
203  {
204  detail::consume(stream_view | view::take_until(is_end));
205  }
206  //Jump over // and cntrl
207  ++stream_it;
208  ++stream_it;
209  ++stream_it;
210  }
211 };
212 
215 template <>
216 class sequence_file_output_format<format_embl>
217 {
218 public:
220  using format_tag = format_embl;
221 
225  sequence_file_output_format() noexcept = default;
226  sequence_file_output_format(sequence_file_output_format const &) = delete;
229  sequence_file_output_format & operator=(sequence_file_output_format const &) = delete;
230  sequence_file_output_format(sequence_file_output_format &&) noexcept = default;
231  sequence_file_output_format & operator=(sequence_file_output_format &&) noexcept = default;
232  ~sequence_file_output_format() noexcept = default;
233 
236  template <typename stream_type, // constraints checked by file
237  typename seq_type, // other constraints checked inside function
238  typename id_type,
239  typename qual_type>
240  void write(stream_type & stream,
241  sequence_file_output_options const & options,
242  seq_type && sequence,
243  id_type && id,
244  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
245  {
246  seqan3::ostreambuf_iterator stream_it{stream};
247  [[maybe_unused]] size_t sequence_size = 0;
248  [[maybe_unused]] char buffer[50];
249  if constexpr (!detail::decays_to_ignore_v<seq_type>)
250  sequence_size = ranges::size(sequence);
251 
252  // ID
253  if constexpr (detail::decays_to_ignore_v<id_type>)
254  {
255  throw std::logic_error{"The ID field may not be set to ignore when writing embl files."};
256  }
257  else
258  {
259  if (ranges::empty(id)) //[[unlikely]]
260  throw std::runtime_error{"The ID field may not be empty when writing embl files."};
261 
262  if (options.embl_genbank_complete_header)
263  {
264  std::ranges::copy(id, stream_it);
265  }
266  else
267  {
268  std::ranges::copy(std::string_view{"ID "}, stream_it);
269  std::ranges::copy(id, stream_it);
270  std::ranges::copy(std::string_view{"; "}, stream_it);
271  auto res = std::to_chars(&buffer[0], &buffer[0] + sizeof(buffer), sequence_size);
272  std::copy(&buffer[0], res.ptr, stream_it);
273  std::ranges::copy(std::string_view{" BP.\n"}, stream_it);
274  }
275 
276  }
277 
278  // Sequence
279  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
280  {
281  throw std::logic_error{"The SEQ field may not be set to ignore when writing embl files."};
282  }
283  else
284  {
285  if (ranges::empty(sequence)) //[[unlikely]]
286  throw std::runtime_error{"The SEQ field may not be empty when writing embl files."};
287 
288  std::ranges::copy(std::string_view{"SQ Sequence "}, stream_it);
289  auto res = std::to_chars(&buffer[0], &buffer[0] + sizeof(buffer), sequence_size);
290  std::copy(&buffer[0], res.ptr, stream_it);
291  std::ranges::copy(std::string_view{" BP;\n"}, stream_it);
292  auto seqChunk = sequence | ranges::view::chunk(60);
293  unsigned int i = 0;
294  size_t bp = 0;
295  for (auto chunk : seqChunk)
296  {
298  | ranges::view::chunk(10)
299  | std::view::join(' '), stream_it);
300  ++i;
301  stream_it = ' ';
302  bp = std::min(sequence_size, bp + 60);
303  uint8_t num_blanks = 60 * i - bp; // for sequence characters
304  num_blanks += num_blanks / 10; // additional chunk separators
305  std::ranges::copy(ranges::view::repeat_n(' ', num_blanks), stream_it);
306  std::ranges::copy(std::to_string(bp), stream_it);
307  stream_it = '\n';
308  }
309  std::ranges::copy(std::string_view{"//"}, stream_it);
310  stream_it = '\n';
311  }
312  }
313 };
314 
315 } // namespace seqan3::detail
Provides seqan3::SequenceFileInputFormat and auxiliary classes.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:83
T copy(T... args)
Provides seqan3::view::istreambuf.
auto constexpr take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (or the end of the underlying range is reached).
Definition: take_until.hpp:599
The EMBL format (tag).
Definition: format_embl.hpp:75
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:56
T to_string(T... args)
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:245
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:189
The main SeqAn3 namespace.
Provides seqan3::dna5, container aliases and string literals.
Provides std::from_chars and std::to_chars if not defined in the stl <charconv> header.
constexpr auto join
Flattens a View of ranges into a View.
Definition: ranges:683
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_embl.hpp:79
auto constexpr is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
T min(T... args)
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
auto constexpr is_digit
Checks whether c is a digital character.
Definition: predicate.hpp:287
Provides seqan3::view::char_to.
auto constexpr take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-line).
Definition: take_line.hpp:92
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:66
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
T to_chars(T... args)
Provides seqan3::ostream and seqan3::ostreambuf iterator.
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (throws if the end of the underlying range is reached).
Definition: take_until.hpp:613
Provides seqan3::SequenceFileFormatOut and auxiliary classes.
T back_inserter(T... args)
Adaptations of algorithms from the Ranges TS.
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:194
Provides seqan3::view::to_char.
Provides seqan3::sequence_file_input_options.
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:97
Provides various transformation traits used by the range module.
auto constexpr is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
auto constexpr is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied.
Definition: ranges:911
Provides seqan3::view::take_exactly and seqan3::view::take_exactly_or_throw.
constexpr auto filter
A range adaptor that takes a predicate and returns a view of the elements that satisfy the predicate.
Definition: ranges:565
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:70