SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
bgzf_stream_util.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 #include <cstring>
17 #include <memory>
18 #include <thread>
19 
20 #if SEQAN3_HAS_ZLIB
21 // Zlib headers
22 #include <zlib.h>
23 #else
24 #error "This file cannot be used when building without GZip-support."
25 #endif // SEQAN3_HAS_ZLIB
26 
30 #include <seqan3/io/exception.hpp>
31 #include <seqan3/std/algorithm>
32 #include <seqan3/std/span>
33 
34 namespace seqan3::contrib
35 {
36 
40 inline static uint64_t bgzf_thread_count = std::thread::hardware_concurrency();
41 
42 // ============================================================================
43 // Forwards
44 // ============================================================================
45 
46 // ============================================================================
47 // Classes
48 // ============================================================================
49 
50 // Special end-of-file marker defined by the BGZF compression format.
51 // See: https://samtools.github.io/hts-specs/SAMv1.pdf
52 static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER {{'\x1f', '\x8b', '\x08', '\x04',
53  '\x00', '\x00', '\x00', '\x00',
54  '\x00', '\xff', '\x06', '\x00',
55  '\x42', '\x43', '\x02', '\x00',
56  '\x1b', '\x00', '\x03', '\x00',
57  '\x00', '\x00', '\x00', '\x00',
58  '\x00', '\x00', '\x00', '\x00'}};
59 
60 template <typename TAlgTag>
61 struct CompressionContext {};
62 
63 template <typename TAlgTag>
64 struct DefaultPageSize;
65 
66 template <>
67 struct CompressionContext<detail::gz_compression>
68 {
69  z_stream strm;
70 
71  CompressionContext()
72  {
73  std::memset(&strm, 0, sizeof(z_stream));
74  }
75 };
76 
77 template <>
78 struct CompressionContext<detail::bgzf_compression>:
79  CompressionContext<detail::gz_compression>
80 {
81  static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
82  unsigned char headerPos;
83 };
84 
85 template <>
86 struct DefaultPageSize<detail::bgzf_compression>
87 {
88  static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
89  static const unsigned BLOCK_FOOTER_LENGTH = 8;
90  // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
91  static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
92 
93  // Reduce the maximal input size, such that the compressed data
94  // always fits in one block even for level Z_NO_COMPRESSION.
95  enum { BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH };
96  static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
97 };
98 
99 // ============================================================================
100 // Functions
101 // ============================================================================
102 
103 // ----------------------------------------------------------------------------
104 // Function compressInit()
105 // ----------------------------------------------------------------------------
106 
107 inline void
108 compressInit(CompressionContext<detail::gz_compression> & ctx)
109 {
110  const int GZIP_WINDOW_BITS = -15; // no zlib header
111  const int Z_DEFAULT_MEM_LEVEL = 8;
112 
113  ctx.strm.zalloc = NULL;
114  ctx.strm.zfree = NULL;
115 
116  // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
117  // to be 2x faster and produces only 7% bigger output
118 // int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
119 // GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
120  int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
121  GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
122  if (status != Z_OK)
123  throw io_error("Calling deflateInit2() failed for gz file.");
124 }
125 
126 // ----------------------------------------------------------------------------
127 // Function compressInit()
128 // ----------------------------------------------------------------------------
129 
130 inline void
131 compressInit(CompressionContext<detail::bgzf_compression> & ctx)
132 {
133  compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
134  ctx.headerPos = 0;
135 }
136 
137 // ----------------------------------------------------------------------------
138 // Helper Function _bgzfUnpackXX()
139 // ----------------------------------------------------------------------------
140 
141 inline uint16_t
142 _bgzfUnpack16(char const * buffer)
143 {
144  uint16_t tmp;
145  std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
146  return detail::to_little_endian(tmp);
147 }
148 
149 inline uint32_t
150 _bgzfUnpack32(char const * buffer)
151 {
152  uint32_t tmp;
153  std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
154  return detail::to_little_endian(tmp);
155 }
156 
157 // ----------------------------------------------------------------------------
158 // Helper Function _bgzfPackXX()
159 // ----------------------------------------------------------------------------
160 
161 inline void
162 _bgzfPack16(char * buffer, uint16_t value)
163 {
164  value = detail::to_little_endian(value);
165  std::uninitialized_copy(reinterpret_cast<char *>(&value),
166  reinterpret_cast<char *>(&value) + sizeof(uint16_t),
167  buffer);
168 }
169 
170 inline void
171 _bgzfPack32(char * buffer, uint32_t value)
172 {
173  value = detail::to_little_endian(value);
174  std::uninitialized_copy(reinterpret_cast<char *>(&value),
175  reinterpret_cast<char *>(&value) + sizeof(uint32_t),
176  buffer);
177 }
178 
179 // ----------------------------------------------------------------------------
180 // Function _compressBlock()
181 // ----------------------------------------------------------------------------
182 
183 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
184 inline TDestCapacity
185 _compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
186  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
187 {
188  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
189  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
190 
191  assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
192  assert(sizeof(TDestValue) == 1u);
193  assert(sizeof(unsigned) == 4u);
194 
195  // 1. COPY HEADER
196  std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
197 
198  // 2. COMPRESS
199  compressInit(ctx);
200  ctx.strm.next_in = (Bytef *)(srcBegin);
201  ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
202  ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
203  ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
204 
205  int status = deflate(&ctx.strm, Z_FINISH);
206  if (status != Z_STREAM_END)
207  {
208  deflateEnd(&ctx.strm);
209  throw io_error("Deflation failed. Compressed BGZF data is too big.");
210  }
211 
212  status = deflateEnd(&ctx.strm);
213  if (status != Z_OK)
214  throw io_error("BGZF deflateEnd() failed.");
215 
216 
217  // 3. APPEND FOOTER
218 
219  // Set compressed length into buffer, compute CRC and write CRC into buffer.
220 
221  size_t len = dstCapacity - ctx.strm.avail_out;
222  _bgzfPack16(dstBegin + 16, len - 1);
223 
224  dstBegin += len - BLOCK_FOOTER_LENGTH;
225  _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
226  _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
227 
228  return dstCapacity - ctx.strm.avail_out;
229 }
230 
231 // ----------------------------------------------------------------------------
232 // Function decompressInit() - GZIP
233 // ----------------------------------------------------------------------------
234 
235 inline void
236 decompressInit(CompressionContext<detail::gz_compression> & ctx)
237 {
238  const int GZIP_WINDOW_BITS = -15; // no zlib header
239 
240  ctx.strm.zalloc = NULL;
241  ctx.strm.zfree = NULL;
242  int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
243  if (status != Z_OK)
244  throw io_error("GZip inflateInit2() failed.");
245 }
246 
247 // ----------------------------------------------------------------------------
248 // Function decompressInit() - BGZF
249 // ----------------------------------------------------------------------------
250 
251 inline void
252 decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
253 {
254  decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
255  ctx.headerPos = 0;
256 }
257 
258 // ----------------------------------------------------------------------------
259 // Function _decompressBlock()
260 // ----------------------------------------------------------------------------
261 
262 template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
263 inline TDestCapacity
264 _decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
265  TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
266 {
267  const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
268  const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
269 
270  assert(sizeof(TSourceValue) == 1u);
271  assert(sizeof(unsigned) == 4u);
272 
273  // 1. CHECK HEADER
274 
275  if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
276  throw io_error("BGZF block too short.");
277 
278  if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
279  throw io_error("Invalid BGZF block header.");
280 
281  size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
282  if (compressedLen != srcLength)
283  throw io_error("BGZF compressed size mismatch.");
284 
285 
286  // 2. DECOMPRESS
287 
288  decompressInit(ctx);
289  ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
290  ctx.strm.next_out = (Bytef *)(dstBegin);
291  ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
292  ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
293 
294  int status = inflate(&ctx.strm, Z_FINISH);
295  if (status != Z_STREAM_END)
296  {
297  inflateEnd(&ctx.strm);
298  throw io_error("Inflation failed. Decompressed BGZF data is too big.");
299  }
300 
301  status = inflateEnd(&ctx.strm);
302  if (status != Z_OK)
303  throw io_error("BGZF inflateEnd() failed.");
304 
305 
306  // 3. CHECK FOOTER
307 
308  // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
309 
310  unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
311 
312  srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
313  if (_bgzfUnpack32(srcBegin) != crc)
314  throw io_error("BGZF wrong checksum.");
315 
316  if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
317  throw io_error("BGZF size mismatch.");
318 
319  return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
320 }
321 
322 } // namespace seqan3::contrib
bit_manipulation.hpp
Provides utility functions for bit twiddling.
span
Provides std::span from the C++20 standard library.
cstring
std::uninitialized_copy
T uninitialized_copy(T... args)
algorithm
Adaptations of algorithms from the Ranges TS.
thread
std::thread::hardware_concurrency
T hardware_concurrency(T... args)
range.hpp
Provides various transformation traits used by the range module.
exception.hpp
Provides exceptions used in the I/O module.
array
magic_header.hpp
Provides seqan3::detail::magic_header.
memory
std::experimental::filesystem::status
T status(T... args)
std::memset
T memset(T... args)