SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
bgzf_stream_util.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <algorithm>
16#include <array>
17#include <cassert>
18#include <cstring>
19#include <memory>
20#include <span>
21#include <thread>
22
28
29#if !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
30# error "This file cannot be used when building without GZip-support."
31#endif // !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
32
33#if defined(SEQAN3_HAS_ZLIB)
34// Zlib headers
35# include <zlib.h>
36
37namespace seqan3::contrib
38{
39
40// ============================================================================
41// Forwards
42// ============================================================================
43
44// ============================================================================
45// Classes
46// ============================================================================
47
48// Special end-of-file marker defined by the BGZF compression format.
49// See: https://samtools.github.io/hts-specs/SAMv1.pdf
50static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER{
51 {'\x1f', '\x8b', '\x08', '\x04', '\x00', '\x00', '\x00', '\x00', '\x00', '\xff', '\x06', '\x00', '\x42', '\x43',
52 '\x02', '\x00', '\x1b', '\x00', '\x03', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}};
53
54template <typename TAlgTag>
55struct CompressionContext
56{};
57
58template <typename TAlgTag>
59struct DefaultPageSize;
60
61template <>
62struct CompressionContext<detail::gz_compression>
63{
64 z_stream strm;
65
66 CompressionContext()
67 {
68 std::memset(&strm, 0, sizeof(z_stream));
69 }
70};
71
72template <>
73struct CompressionContext<detail::bgzf_compression> : CompressionContext<detail::gz_compression>
74{
75 static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
76 unsigned char headerPos;
77};
78
79template <>
80struct DefaultPageSize<detail::bgzf_compression>
81{
82 static unsigned const MAX_BLOCK_SIZE = 64 * 1024;
83 static unsigned const BLOCK_FOOTER_LENGTH = 8;
84 // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
85 static unsigned const ZLIB_BLOCK_OVERHEAD = 5;
86
87 // Reduce the maximal input size, such that the compressed data
88 // always fits in one block even for level Z_NO_COMPRESSION.
89 enum
90 {
91 BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH
92 };
93 static unsigned const VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
94};
95
96// ============================================================================
97// Functions
98// ============================================================================
99
100// ----------------------------------------------------------------------------
101// Function compressInit()
102// ----------------------------------------------------------------------------
103
104inline void compressInit(CompressionContext<detail::gz_compression> & ctx)
105{
106 int const GZIP_WINDOW_BITS = -15; // no zlib header
107 int const Z_DEFAULT_MEM_LEVEL = 8;
108
109 ctx.strm.zalloc = NULL;
110 ctx.strm.zfree = NULL;
111
112 // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
113 // to be 2x faster and produces only 7% bigger output
114 // int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
115 // GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
116 int status =
117 deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
118 if (status != Z_OK)
119 throw io_error("Calling deflateInit2() failed for gz file.");
120}
121
122// ----------------------------------------------------------------------------
123// Function compressInit()
124// ----------------------------------------------------------------------------
125
126inline void compressInit(CompressionContext<detail::bgzf_compression> & ctx)
127{
128 compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
129 ctx.headerPos = 0;
130}
131
132// ----------------------------------------------------------------------------
133// Helper Function _bgzfUnpackXX()
134// ----------------------------------------------------------------------------
135
136inline uint16_t _bgzfUnpack16(char const * buffer)
137{
138 uint16_t tmp;
139 std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
140 return detail::to_little_endian(tmp);
141}
142
143inline uint32_t _bgzfUnpack32(char const * buffer)
144{
145 uint32_t tmp;
146 std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
147 return detail::to_little_endian(tmp);
148}
149
150// ----------------------------------------------------------------------------
151// Helper Function _bgzfPackXX()
152// ----------------------------------------------------------------------------
153
154inline void _bgzfPack16(char * buffer, uint16_t value)
155{
156 value = detail::to_little_endian(value);
157 std::uninitialized_copy(reinterpret_cast<char *>(&value),
158 reinterpret_cast<char *>(&value) + sizeof(uint16_t),
159 buffer);
160}
161
162inline void _bgzfPack32(char * buffer, uint32_t value)
163{
164 value = detail::to_little_endian(value);
165 std::uninitialized_copy(reinterpret_cast<char *>(&value),
166 reinterpret_cast<char *>(&value) + sizeof(uint32_t),
167 buffer);
168}
169
170// ----------------------------------------------------------------------------
171// Function _compressBlock()
172// ----------------------------------------------------------------------------
173
174template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
175inline TDestCapacity _compressBlock(TDestValue * dstBegin,
176 TDestCapacity dstCapacity,
177 TSourceValue * srcBegin,
178 TSourceLength srcLength,
179 CompressionContext<detail::bgzf_compression> & ctx)
180{
181 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
182 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
183
184 assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
185 assert(sizeof(TDestValue) == 1u);
186 assert(sizeof(unsigned) == 4u);
187
188 // 1. COPY HEADER
189 std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
190
191 // 2. COMPRESS
192 compressInit(ctx);
193 ctx.strm.next_in = (Bytef *)(srcBegin);
194 ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
195 ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
196 ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
197
198 int status = deflate(&ctx.strm, Z_FINISH);
199 if (status != Z_STREAM_END)
200 {
201 deflateEnd(&ctx.strm);
202 throw io_error("Deflation failed. Compressed BGZF data is too big.");
203 }
204
205 status = deflateEnd(&ctx.strm);
206 if (status != Z_OK)
207 throw io_error("BGZF deflateEnd() failed.");
208
209 // 3. APPEND FOOTER
210
211 // Set compressed length into buffer, compute CRC and write CRC into buffer.
212
213 size_t len = dstCapacity - ctx.strm.avail_out;
214 _bgzfPack16(dstBegin + 16, len - 1);
215
216 dstBegin += len - BLOCK_FOOTER_LENGTH;
217 _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
218 _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
219
220 return dstCapacity - ctx.strm.avail_out;
221}
222
223// ----------------------------------------------------------------------------
224// Function decompressInit() - GZIP
225// ----------------------------------------------------------------------------
226
227inline void decompressInit(CompressionContext<detail::gz_compression> & ctx)
228{
229 int const GZIP_WINDOW_BITS = -15; // no zlib header
230
231 ctx.strm.zalloc = NULL;
232 ctx.strm.zfree = NULL;
233 int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
234 if (status != Z_OK)
235 throw io_error("GZip inflateInit2() failed.");
236}
237
238// ----------------------------------------------------------------------------
239// Function decompressInit() - BGZF
240// ----------------------------------------------------------------------------
241
242inline void decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
243{
244 decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
245 ctx.headerPos = 0;
246}
247
248// ----------------------------------------------------------------------------
249// Function _decompressBlock()
250// ----------------------------------------------------------------------------
251
252template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
253inline TDestCapacity _decompressBlock(TDestValue * dstBegin,
254 TDestCapacity dstCapacity,
255 TSourceValue * srcBegin,
256 TSourceLength srcLength,
257 CompressionContext<detail::bgzf_compression> & ctx)
258{
259 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
260 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
261
262 assert(sizeof(TSourceValue) == 1u);
263 assert(sizeof(unsigned) == 4u);
264
265 // 1. CHECK HEADER
266
267 if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
268 throw io_error("BGZF block too short.");
269
270 if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
271 throw io_error("Invalid BGZF block header.");
272
273 size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
274 if (compressedLen != srcLength)
275 throw io_error("BGZF compressed size mismatch.");
276
277 // 2. DECOMPRESS
278
279 decompressInit(ctx);
280 ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
281 ctx.strm.next_out = (Bytef *)(dstBegin);
282 ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
283 ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
284
285 int status = inflate(&ctx.strm, Z_FINISH);
286 if (status != Z_STREAM_END)
287 {
288 inflateEnd(&ctx.strm);
289 throw io_error("Inflation failed. Decompressed BGZF data is too big.");
290 }
291
292 status = inflateEnd(&ctx.strm);
293 if (status != Z_OK)
294 throw io_error("BGZF inflateEnd() failed.");
295
296 // 3. CHECK FOOTER
297
298 // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
299
300 unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
301
302 srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
303 if (_bgzfUnpack32(srcBegin) != crc)
304 throw io_error("BGZF wrong checksum.");
305
306 if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
307 throw io_error("BGZF size mismatch.");
308
309 return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
310}
311
312} // namespace seqan3::contrib
313
314#endif // defined(SEQAN3_HAS_ZLIB)
Provides seqan3::contrib::bgzf_thread_count.
T copy(T... args)
Provides various transformation traits used by the range module.
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
T memset(T... args)
Provides utility functions for bit twiddling.
T uninitialized_copy(T... args)