SeqAn3 3.1.0
The Modern C++ library for sequence analysis.
bgzf_stream_util.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <seqan3/std/algorithm>
16#include <array>
17#include <cstring>
18#include <memory>
19#include <seqan3/std/span>
20#include <thread>
21
26
27#if !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
28# error "This file cannot be used when building without GZip-support."
29#endif // !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
30
31#if defined(SEQAN3_HAS_ZLIB)
32// Zlib headers
33#include <zlib.h>
34
35namespace seqan3::contrib
36{
37
41inline static uint64_t bgzf_thread_count = std::thread::hardware_concurrency();
42
43// ============================================================================
44// Forwards
45// ============================================================================
46
47// ============================================================================
48// Classes
49// ============================================================================
50
51// Special end-of-file marker defined by the BGZF compression format.
52// See: https://samtools.github.io/hts-specs/SAMv1.pdf
53static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER {{'\x1f', '\x8b', '\x08', '\x04',
54 '\x00', '\x00', '\x00', '\x00',
55 '\x00', '\xff', '\x06', '\x00',
56 '\x42', '\x43', '\x02', '\x00',
57 '\x1b', '\x00', '\x03', '\x00',
58 '\x00', '\x00', '\x00', '\x00',
59 '\x00', '\x00', '\x00', '\x00'}};
60
61template <typename TAlgTag>
62struct CompressionContext {};
63
64template <typename TAlgTag>
65struct DefaultPageSize;
66
67template <>
68struct CompressionContext<detail::gz_compression>
69{
70 z_stream strm;
71
72 CompressionContext()
73 {
74 std::memset(&strm, 0, sizeof(z_stream));
75 }
76};
77
78template <>
79struct CompressionContext<detail::bgzf_compression>:
80 CompressionContext<detail::gz_compression>
81{
82 static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
83 unsigned char headerPos;
84};
85
86template <>
87struct DefaultPageSize<detail::bgzf_compression>
88{
89 static const unsigned MAX_BLOCK_SIZE = 64 * 1024;
90 static const unsigned BLOCK_FOOTER_LENGTH = 8;
91 // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
92 static const unsigned ZLIB_BLOCK_OVERHEAD = 5;
93
94 // Reduce the maximal input size, such that the compressed data
95 // always fits in one block even for level Z_NO_COMPRESSION.
96 enum { BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH };
97 static const unsigned VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
98};
99
100// ============================================================================
101// Functions
102// ============================================================================
103
104// ----------------------------------------------------------------------------
105// Function compressInit()
106// ----------------------------------------------------------------------------
107
108inline void
109compressInit(CompressionContext<detail::gz_compression> & ctx)
110{
111 const int GZIP_WINDOW_BITS = -15; // no zlib header
112 const int Z_DEFAULT_MEM_LEVEL = 8;
113
114 ctx.strm.zalloc = NULL;
115 ctx.strm.zfree = NULL;
116
117 // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
118 // to be 2x faster and produces only 7% bigger output
119// int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
120// GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
121 int status = deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED,
122 GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
123 if (status != Z_OK)
124 throw io_error("Calling deflateInit2() failed for gz file.");
125}
126
127// ----------------------------------------------------------------------------
128// Function compressInit()
129// ----------------------------------------------------------------------------
130
131inline void
132compressInit(CompressionContext<detail::bgzf_compression> & ctx)
133{
134 compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
135 ctx.headerPos = 0;
136}
137
138// ----------------------------------------------------------------------------
139// Helper Function _bgzfUnpackXX()
140// ----------------------------------------------------------------------------
141
142inline uint16_t
143_bgzfUnpack16(char const * buffer)
144{
145 uint16_t tmp;
146 std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
147 return detail::to_little_endian(tmp);
148}
149
150inline uint32_t
151_bgzfUnpack32(char const * buffer)
152{
153 uint32_t tmp;
154 std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
155 return detail::to_little_endian(tmp);
156}
157
158// ----------------------------------------------------------------------------
159// Helper Function _bgzfPackXX()
160// ----------------------------------------------------------------------------
161
162inline void
163_bgzfPack16(char * buffer, uint16_t value)
164{
165 value = detail::to_little_endian(value);
166 std::uninitialized_copy(reinterpret_cast<char *>(&value),
167 reinterpret_cast<char *>(&value) + sizeof(uint16_t),
168 buffer);
169}
170
171inline void
172_bgzfPack32(char * buffer, uint32_t value)
173{
174 value = detail::to_little_endian(value);
175 std::uninitialized_copy(reinterpret_cast<char *>(&value),
176 reinterpret_cast<char *>(&value) + sizeof(uint32_t),
177 buffer);
178}
179
180// ----------------------------------------------------------------------------
181// Function _compressBlock()
182// ----------------------------------------------------------------------------
183
184template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
185inline TDestCapacity
186_compressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
187 TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
188{
189 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
190 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
191
192 assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
193 assert(sizeof(TDestValue) == 1u);
194 assert(sizeof(unsigned) == 4u);
195
196 // 1. COPY HEADER
197 std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
198
199 // 2. COMPRESS
200 compressInit(ctx);
201 ctx.strm.next_in = (Bytef *)(srcBegin);
202 ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
203 ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
204 ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
205
206 int status = deflate(&ctx.strm, Z_FINISH);
207 if (status != Z_STREAM_END)
208 {
209 deflateEnd(&ctx.strm);
210 throw io_error("Deflation failed. Compressed BGZF data is too big.");
211 }
212
213 status = deflateEnd(&ctx.strm);
214 if (status != Z_OK)
215 throw io_error("BGZF deflateEnd() failed.");
216
217
218 // 3. APPEND FOOTER
219
220 // Set compressed length into buffer, compute CRC and write CRC into buffer.
221
222 size_t len = dstCapacity - ctx.strm.avail_out;
223 _bgzfPack16(dstBegin + 16, len - 1);
224
225 dstBegin += len - BLOCK_FOOTER_LENGTH;
226 _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
227 _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
228
229 return dstCapacity - ctx.strm.avail_out;
230}
231
232// ----------------------------------------------------------------------------
233// Function decompressInit() - GZIP
234// ----------------------------------------------------------------------------
235
236inline void
237decompressInit(CompressionContext<detail::gz_compression> & ctx)
238{
239 const int GZIP_WINDOW_BITS = -15; // no zlib header
240
241 ctx.strm.zalloc = NULL;
242 ctx.strm.zfree = NULL;
243 int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
244 if (status != Z_OK)
245 throw io_error("GZip inflateInit2() failed.");
246}
247
248// ----------------------------------------------------------------------------
249// Function decompressInit() - BGZF
250// ----------------------------------------------------------------------------
251
252inline void
253decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
254{
255 decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
256 ctx.headerPos = 0;
257}
258
259// ----------------------------------------------------------------------------
260// Function _decompressBlock()
261// ----------------------------------------------------------------------------
262
263template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
264inline TDestCapacity
265_decompressBlock(TDestValue *dstBegin, TDestCapacity dstCapacity,
266 TSourceValue *srcBegin, TSourceLength srcLength, CompressionContext<detail::bgzf_compression> & ctx)
267{
268 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
269 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
270
271 assert(sizeof(TSourceValue) == 1u);
272 assert(sizeof(unsigned) == 4u);
273
274 // 1. CHECK HEADER
275
276 if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
277 throw io_error("BGZF block too short.");
278
279 if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
280 throw io_error("Invalid BGZF block header.");
281
282 size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
283 if (compressedLen != srcLength)
284 throw io_error("BGZF compressed size mismatch.");
285
286
287 // 2. DECOMPRESS
288
289 decompressInit(ctx);
290 ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
291 ctx.strm.next_out = (Bytef *)(dstBegin);
292 ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
293 ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
294
295 int status = inflate(&ctx.strm, Z_FINISH);
296 if (status != Z_STREAM_END)
297 {
298 inflateEnd(&ctx.strm);
299 throw io_error("Inflation failed. Decompressed BGZF data is too big.");
300 }
301
302 status = inflateEnd(&ctx.strm);
303 if (status != Z_OK)
304 throw io_error("BGZF inflateEnd() failed.");
305
306
307 // 3. CHECK FOOTER
308
309 // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
310
311 unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
312
313 srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
314 if (_bgzfUnpack32(srcBegin) != crc)
315 throw io_error("BGZF wrong checksum.");
316
317 if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
318 throw io_error("BGZF size mismatch.");
319
320 return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
321}
322
323} // namespace seqan3::contrib
324
325#endif // defined(SEQAN3_HAS_ZLIB)
The <algorithm> header from C++20's standard library.
Provides various transformation traits used by the range module.
T hardware_concurrency(T... args)
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
T memset(T... args)
Provides std::span from the C++20 standard library.
Provides utility functions for bit twiddling.
T uninitialized_copy(T... args)