SeqAn3  3.0.0
The Modern C++ library for sequence analysis.
format_genbank.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2019, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2019, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <iterator>
16 #include <string>
17 #include <string_view>
18 #include <vector>
19 
20 #include <range/v3/view/chunk.hpp>
21 
39 #include <seqan3/std/algorithm>
40 #include <seqan3/std/charconv>
41 #include <seqan3/std/ranges>
42 
43 namespace seqan3
44 {
45 
// Body of the GenBank format tag. NOTE(review): the declaration line (listing line 72) is not part of this
// extract, so only the brace-enclosed body is visible here.
73 {
// Initialiser of the static `file_extensions` list: the valid file extensions for this format. The value may
// be modified by users. NOTE(review): its declaration (listing lines 74-75) is not part of this extract.
76  {
77  { "genbank" },
78  { "gb" },
79  { "gbk" },
80  };
81 };
82 
83 } // namespace seqan3
84 
85 namespace seqan3::detail
86 {
87 
90 template<>
91 class sequence_file_input_format<format_genbank>
92 {
93 public:
95  using format_tag = format_genbank;
96 
100  sequence_file_input_format() noexcept = default;
101  sequence_file_input_format(sequence_file_input_format const &) = delete;
104  sequence_file_input_format & operator=(sequence_file_input_format const &) = delete;
105  sequence_file_input_format(sequence_file_input_format &&) noexcept = default;
106  sequence_file_input_format & operator=(sequence_file_input_format &&) noexcept = default;
107  ~sequence_file_input_format() noexcept = default;
108 
111  template <typename stream_type, // constraints checked by file
112  typename seq_legal_alph_type, bool seq_qual_combined,
113  typename seq_type, // other constraints checked inside function
114  typename id_type,
115  typename qual_type>
116  void read(stream_type & stream,
117  sequence_file_input_options<seq_legal_alph_type, seq_qual_combined> const & options,
118  seq_type & sequence,
119  id_type & id,
120  qual_type & SEQAN3_DOXYGEN_ONLY(qualities))
121  {
122  auto stream_view = view::istreambuf(stream);
123  auto stream_it = std::ranges::begin(stream_view);
124 
125  if (!(std::ranges::equal(stream_view | view::take_until_or_throw(is_cntrl || is_blank), std::string{"LOCUS"})))
126  throw parse_error{"An entry has to start with the code word LOCUS."};
127 
128  //ID
129  if constexpr (!detail::decays_to_ignore_v<id_type>)
130  {
131  if (options.embl_genbank_complete_header)
132  {
134 
135  while (!is_char<'O'>(*std::ranges::begin(stream_view)))
136  {
138  | view::char_to<value_type_t<id_type>>,
139  std::back_inserter(id));
140  id.push_back('\n');
141  }
142  }
143  else
144  {
145  detail::consume(stream_view | view::take_until(!is_blank));
146  // read id
148  | view::char_to<value_type_t<id_type>>,
149  std::back_inserter(id));
150  detail::consume(stream_view | view::take_line_or_throw);
151  }
152  }
153 
154  // Jump to sequence
155  while (!(is_char<'O'>(*std::ranges::begin(stream_view)) || options.embl_genbank_complete_header))
156  detail::consume(stream_view | view::take_line_or_throw);
157 
158  // Sequence
159  detail::consume(stream_view | view::take_line_or_throw); // consume "ORIGIN"
160  auto constexpr is_end = is_char<'/'> ;
161  if constexpr (!detail::decays_to_ignore_v<seq_type>)
162  {
163  auto constexpr is_legal_alph = is_in_alphabet<seq_legal_alph_type>;
165  | view::take_until_or_throw_and_consume(is_end) // consume "//"
166  | std::view::transform([is_legal_alph] (char const c) // enforce legal alphabet
167  {
168  if (!is_legal_alph(c))
169  {
170  throw parse_error{std::string{"Encountered an unexpected letter: "} +
171  is_legal_alph.msg.str() +
172  " evaluated to false on " +
173  detail::make_printable(c)};
174  }
175  return c;
176  })
177  | view::char_to<value_type_t<seq_type>>, // convert to actual target alphabet
178  std::back_inserter(sequence));
179  }
180  else
181  {
182  detail::consume(stream_view | view::take_until_or_throw_and_consume(is_end)); // consume until "//"
183  ++stream_it; // consume "/n"
184  }
185  }
186 };
187 
190 template <>
191 class sequence_file_output_format<format_genbank>
192 {
193 public:
195  using format_tag = format_genbank;
196 
200  sequence_file_output_format() noexcept = default;
201  sequence_file_output_format(sequence_file_output_format const &) = delete;
204  sequence_file_output_format & operator=(sequence_file_output_format const &) = delete;
205  sequence_file_output_format(sequence_file_output_format &&) noexcept = default;
206  sequence_file_output_format & operator=(sequence_file_output_format &&) noexcept = default;
207  ~sequence_file_output_format() noexcept = default;
208 
211  template <typename stream_type, // constraints checked by file
212  typename seq_type, // other constraints checked inside function
213  typename id_type,
214  typename qual_type>
215  void write(stream_type & stream,
216  sequence_file_output_options const & options,
217  seq_type && sequence,
218  id_type && id,
219  qual_type && SEQAN3_DOXYGEN_ONLY(qualities))
220  {
221  seqan3::ostreambuf_iterator stream_it{stream};
222  size_t sequence_size{0};
223  [[maybe_unused]] char buffer[50];
224  if constexpr (!detail::decays_to_ignore_v<seq_type>)
225  sequence_size = ranges::size(sequence);
226 
227  // ID
228  if constexpr (detail::decays_to_ignore_v<id_type>)
229  {
230  throw std::logic_error{"The ID field may not be set to ignore when writing genbank files."};
231  }
232  else if (ranges::empty(id)) //[[unlikely]]
233  {
234  throw std::runtime_error{"The ID field may not be empty when writing genbank files."};
235  }
236  else if (options.embl_genbank_complete_header)
237  {
238  std::ranges::copy(id, stream_it);
239  }
240  else
241  {
242  std::ranges::copy(std::string_view{"LOCUS "}, stream_it);
243  std::ranges::copy(id, stream_it);
244  std::ranges::copy(std::string_view{" "}, stream_it);
245  auto res = std::to_chars(&buffer[0], &buffer[0] + sizeof(buffer), sequence_size);
246  std::copy(&buffer[0], res.ptr, stream_it);
247  std::ranges::copy(std::string_view{" bp\n"}, stream_it);
248  }
249 
250  // Sequence
251  if constexpr (detail::decays_to_ignore_v<seq_type>) // sequence
252  {
253  throw std::logic_error{"The SEQ field may not be set to ignore when writing genbank files."};
254  }
255  else if (std::ranges::empty(sequence)) //[[unlikely]]
256  {
257  throw std::runtime_error{"The SEQ field may not be empty when writing genbank files."};
258  }
259  else
260  {
261  std::ranges::copy(std::string_view{"ORIGIN\n"}, stream_it);
262  auto seq = sequence | ranges::view::chunk(60);
263  size_t i = 0;
264  size_t bp = 1;
265 
266  while (bp < sequence_size)
267  {
268  // Sequence length with more than 9 digits are not possible in one genbank entry, maximal 350 kb are
269  // allowed. See: https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html#SequenceLengthA
270  for (size_t j = std::to_string(bp).size(); j < 9; j++)
271  stream_it = ' ';
272  std::ranges::copy(std::to_string(bp), stream_it);
273  stream_it = ' ';
275  | view::interleave(10, std::string_view{" "}), stream_it);
276  bp += 60;
277  ++i;
278  detail::write_eol(stream_it,false);
279  }
280  std::ranges::copy(std::string_view{"//"}, stream_it);
281  detail::write_eol(stream_it,false);
282  }
283  }
284 };
285 
286 } // namespace seqan3::detail
::ranges::equal equal
Alias for ranges::equal. Determines if two sets of elements are the same.
Definition: algorithm:54
Provides seqan3::SequenceFileInputFormat and auxiliary classes.
constexpr auto is_char
Checks whether a given letter is the same as the template non-type argument.
Definition: predicate.hpp:83
T copy(T... args)
Provides seqan3::view::istreambuf.
auto constexpr take_until
A view adaptor that returns elements from the underlying range until the functor evaluates to true (o...
Definition: take_until.hpp:599
constexpr sequenced_policy seq
Global execution policy object for sequenced execution policy.
Definition: execution.hpp:54
::ranges::ostreambuf_iterator ostreambuf_iterator
Alias for ranges::ostreambuf_iterator. Writes successive characters onto the output stream from which...
Definition: iterator.hpp:56
T to_string(T... args)
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf.hpp:245
Provides seqan3::view::take.
::ranges::size size
Alias for ranges::size. Obtains the size of a range whose size can be calculated in constant time...
Definition: ranges:189
The main SeqAn3 namespace.
auto constexpr take_until_or_throw_and_consume
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:641
Provides seqan3::dna5, container aliases and string literals.
Provides std::from_chars and std::to_chars if not defined in the stl <charconv> header.
Provides seqan3::view::take_line and seqan3::view::take_line_or_throw.
auto constexpr is_blank
Checks whether c is a blank character.
Definition: predicate.hpp:163
Provides seqan3::view::interleave.
Provides seqan3::sequence_file_output_options.
Provides seqan3::view::take_until and seqan3::view::take_until_or_throw.
Provides various utility functions.
Provides various utility functions.
auto constexpr is_digit
Checks whether c is a digit character.
Definition: predicate.hpp:287
Provides seqan3::view::char_to.
auto constexpr take_line_or_throw
A view adaptor that returns a single line from the underlying range (throws if there is no end-of-lin...
Definition: take_line.hpp:92
Adaptations of concepts from the Ranges TS.
::ranges::begin begin
Alias for ranges::begin. Returns an iterator to the beginning of a range.
Definition: ranges:174
::ranges::copy copy
Alias for ranges::copy. Copies a range of elements to a new location.
Definition: algorithm:44
auto const to_char
A view that calls seqan3::to_char() on each element in the input range.
Definition: to_char.hpp:66
Definition: aligned_sequence_concept.hpp:35
Provides character predicates for tokenisation.
T to_chars(T... args)
constexpr auto interleave
A view that interleaves a given range into another range at regular intervals.
Definition: interleave.hpp:387
Provides seqan3::ostream and seqan3::ostreambuf iterator.
auto constexpr take_until_or_throw
A view adaptor that returns elements from the underlying range until the functor evaluates to true (t...
Definition: take_until.hpp:613
Provides seqan3::SequenceFileFormatOut and auxiliary classes.
T back_inserter(T... args)
Adaptations of algorithms from the Ranges TS.
::ranges::empty empty
Alias for ranges::empty. Checks whether a range is empty.
Definition: ranges:194
Provides seqan3::view::to_char.
Provides seqan3::sequence_file_input_options.
Provides various transformation traits used by the range module.
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_genbank.hpp:76
auto constexpr is_space
Checks whether c is a space character.
Definition: predicate.hpp:146
auto constexpr is_cntrl
Checks whether c is a control character.
Definition: predicate.hpp:110
constexpr auto transform
A range adaptor that takes an invocable and returns a view of the elements with the invocable applied...
Definition: ranges:911
The GenBank format (tag).
Definition: format_genbank.hpp:72
constexpr auto filter
A range adaptor that takes a predicate and returns a view of the elements that satisfy the predicate...
Definition: ranges:565
auto const char_to
A view over an alphabet, given a range of characters.
Definition: char_to.hpp:70