SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
bgzf_stream_util.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <algorithm>
13#include <array>
14#include <cassert>
15#include <cstring>
16#include <memory>
17#include <span>
18#include <thread>
19
25
26#if !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
27# error "This file cannot be used when building without GZip-support."
28#endif // !defined(SEQAN3_HAS_ZLIB) && !defined(SEQAN3_HEADER_TEST)
29
30#if defined(SEQAN3_HAS_ZLIB)
31// Zlib headers
32# include <zlib.h>
33
34namespace seqan3::contrib
35{
36
37// ============================================================================
38// Forwards
39// ============================================================================
40
41// ============================================================================
42// Classes
43// ============================================================================
44
45// Special end-of-file marker defined by the BGZF compression format.
46// See: https://samtools.github.io/hts-specs/SAMv1.pdf
47static constexpr std::array<char, 28> BGZF_END_OF_FILE_MARKER{
48 {'\x1f', '\x8b', '\x08', '\x04', '\x00', '\x00', '\x00', '\x00', '\x00', '\xff', '\x06', '\x00', '\x42', '\x43',
49 '\x02', '\x00', '\x1b', '\x00', '\x03', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00', '\x00'}};
50
51template <typename TAlgTag>
52struct CompressionContext
53{};
54
55template <typename TAlgTag>
56struct DefaultPageSize;
57
58template <>
59struct CompressionContext<detail::gz_compression>
60{
61 z_stream strm;
62
63 CompressionContext()
64 {
65 std::memset(&strm, 0, sizeof(z_stream));
66 }
67};
68
69template <>
70struct CompressionContext<detail::bgzf_compression> : CompressionContext<detail::gz_compression>
71{
72 static constexpr size_t BLOCK_HEADER_LENGTH = detail::bgzf_compression::magic_header.size();
73 unsigned char headerPos;
74};
75
76template <>
77struct DefaultPageSize<detail::bgzf_compression>
78{
79 static unsigned const MAX_BLOCK_SIZE = 64 * 1024;
80 static unsigned const BLOCK_FOOTER_LENGTH = 8;
81 // 5 bytes block overhead (see 3.2.4. at https://tools.ietf.org/html/rfc1951)
82 static unsigned const ZLIB_BLOCK_OVERHEAD = 5;
83
84 // Reduce the maximal input size, such that the compressed data
85 // always fits in one block even for level Z_NO_COMPRESSION.
86 enum
87 {
88 BLOCK_HEADER_LENGTH = CompressionContext<detail::bgzf_compression>::BLOCK_HEADER_LENGTH
89 };
90 static unsigned const VALUE = MAX_BLOCK_SIZE - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH - ZLIB_BLOCK_OVERHEAD;
91};
92
93// ============================================================================
94// Functions
95// ============================================================================
96
97// ----------------------------------------------------------------------------
98// Function compressInit()
99// ----------------------------------------------------------------------------
100
101inline void compressInit(CompressionContext<detail::gz_compression> & ctx)
102{
103 int const GZIP_WINDOW_BITS = -15; // no zlib header
104 int const Z_DEFAULT_MEM_LEVEL = 8;
105
106 ctx.strm.zalloc = NULL;
107 ctx.strm.zfree = NULL;
108
109 // (weese:) We use Z_BEST_SPEED instead of Z_DEFAULT_COMPRESSION as it turned out
110 // to be 2x faster and produces only 7% bigger output
111 // int status = deflateInit2(&ctx.strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
112 // GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
113 int status =
114 deflateInit2(&ctx.strm, Z_BEST_SPEED, Z_DEFLATED, GZIP_WINDOW_BITS, Z_DEFAULT_MEM_LEVEL, Z_DEFAULT_STRATEGY);
115 if (status != Z_OK)
116 throw io_error("Calling deflateInit2() failed for gz file.");
117}
118
119// ----------------------------------------------------------------------------
120// Function compressInit()
121// ----------------------------------------------------------------------------
122
123inline void compressInit(CompressionContext<detail::bgzf_compression> & ctx)
124{
125 compressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
126 ctx.headerPos = 0;
127}
128
129// ----------------------------------------------------------------------------
130// Helper Function _bgzfUnpackXX()
131// ----------------------------------------------------------------------------
132
133inline uint16_t _bgzfUnpack16(char const * buffer)
134{
135 uint16_t tmp;
136 std::uninitialized_copy(buffer, buffer + sizeof(uint16_t), reinterpret_cast<char *>(&tmp));
137 return detail::to_little_endian(tmp);
138}
139
140inline uint32_t _bgzfUnpack32(char const * buffer)
141{
142 uint32_t tmp;
143 std::uninitialized_copy(buffer, buffer + sizeof(uint32_t), reinterpret_cast<char *>(&tmp));
144 return detail::to_little_endian(tmp);
145}
146
147// ----------------------------------------------------------------------------
148// Helper Function _bgzfPackXX()
149// ----------------------------------------------------------------------------
150
151inline void _bgzfPack16(char * buffer, uint16_t value)
152{
153 value = detail::to_little_endian(value);
154 std::uninitialized_copy(reinterpret_cast<char *>(&value),
155 reinterpret_cast<char *>(&value) + sizeof(uint16_t),
156 buffer);
157}
158
159inline void _bgzfPack32(char * buffer, uint32_t value)
160{
161 value = detail::to_little_endian(value);
162 std::uninitialized_copy(reinterpret_cast<char *>(&value),
163 reinterpret_cast<char *>(&value) + sizeof(uint32_t),
164 buffer);
165}
166
167// ----------------------------------------------------------------------------
168// Function _compressBlock()
169// ----------------------------------------------------------------------------
170
171template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
172inline TDestCapacity _compressBlock(TDestValue * dstBegin,
173 TDestCapacity dstCapacity,
174 TSourceValue * srcBegin,
175 TSourceLength srcLength,
176 CompressionContext<detail::bgzf_compression> & ctx)
177{
178 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
179 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
180
181 assert(dstCapacity > BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH);
182 assert(sizeof(TDestValue) == 1u);
183 assert(sizeof(unsigned) == 4u);
184
185 // 1. COPY HEADER
186 std::ranges::copy(detail::bgzf_compression::magic_header, dstBegin);
187
188 // 2. COMPRESS
189 compressInit(ctx);
190 ctx.strm.next_in = (Bytef *)(srcBegin);
191 ctx.strm.next_out = (Bytef *)(dstBegin + BLOCK_HEADER_LENGTH);
192 ctx.strm.avail_in = srcLength * sizeof(TSourceValue);
193 ctx.strm.avail_out = dstCapacity - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
194
195 int status = deflate(&ctx.strm, Z_FINISH);
196 if (status != Z_STREAM_END)
197 {
198 deflateEnd(&ctx.strm);
199 throw io_error("Deflation failed. Compressed BGZF data is too big.");
200 }
201
202 status = deflateEnd(&ctx.strm);
203 if (status != Z_OK)
204 throw io_error("BGZF deflateEnd() failed.");
205
206 // 3. APPEND FOOTER
207
208 // Set compressed length into buffer, compute CRC and write CRC into buffer.
209
210 size_t len = dstCapacity - ctx.strm.avail_out;
211 _bgzfPack16(dstBegin + 16, len - 1);
212
213 dstBegin += len - BLOCK_FOOTER_LENGTH;
214 _bgzfPack32(dstBegin, crc32(crc32(0u, NULL, 0u), (Bytef *)(srcBegin), srcLength * sizeof(TSourceValue)));
215 _bgzfPack32(dstBegin + 4, srcLength * sizeof(TSourceValue));
216
217 return dstCapacity - ctx.strm.avail_out;
218}
219
220// ----------------------------------------------------------------------------
221// Function decompressInit() - GZIP
222// ----------------------------------------------------------------------------
223
224inline void decompressInit(CompressionContext<detail::gz_compression> & ctx)
225{
226 int const GZIP_WINDOW_BITS = -15; // no zlib header
227
228 ctx.strm.zalloc = NULL;
229 ctx.strm.zfree = NULL;
230 int status = inflateInit2(&ctx.strm, GZIP_WINDOW_BITS);
231 if (status != Z_OK)
232 throw io_error("GZip inflateInit2() failed.");
233}
234
235// ----------------------------------------------------------------------------
236// Function decompressInit() - BGZF
237// ----------------------------------------------------------------------------
238
239inline void decompressInit(CompressionContext<detail::bgzf_compression> & ctx)
240{
241 decompressInit(static_cast<CompressionContext<detail::gz_compression> &>(ctx));
242 ctx.headerPos = 0;
243}
244
245// ----------------------------------------------------------------------------
246// Function _decompressBlock()
247// ----------------------------------------------------------------------------
248
249template <typename TDestValue, typename TDestCapacity, typename TSourceValue, typename TSourceLength>
250inline TDestCapacity _decompressBlock(TDestValue * dstBegin,
251 TDestCapacity dstCapacity,
252 TSourceValue * srcBegin,
253 TSourceLength srcLength,
254 CompressionContext<detail::bgzf_compression> & ctx)
255{
256 const size_t BLOCK_HEADER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_HEADER_LENGTH;
257 const size_t BLOCK_FOOTER_LENGTH = DefaultPageSize<detail::bgzf_compression>::BLOCK_FOOTER_LENGTH;
258
259 assert(sizeof(TSourceValue) == 1u);
260 assert(sizeof(unsigned) == 4u);
261
262 // 1. CHECK HEADER
263
264 if (srcLength <= BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH)
265 throw io_error("BGZF block too short.");
266
267 if (!detail::bgzf_compression::validate_header(std::span{srcBegin, srcLength}))
268 throw io_error("Invalid BGZF block header.");
269
270 size_t compressedLen = _bgzfUnpack16(srcBegin + 16) + 1u;
271 if (compressedLen != srcLength)
272 throw io_error("BGZF compressed size mismatch.");
273
274 // 2. DECOMPRESS
275
276 decompressInit(ctx);
277 ctx.strm.next_in = (Bytef *)(srcBegin + BLOCK_HEADER_LENGTH);
278 ctx.strm.next_out = (Bytef *)(dstBegin);
279 ctx.strm.avail_in = srcLength - BLOCK_HEADER_LENGTH - BLOCK_FOOTER_LENGTH;
280 ctx.strm.avail_out = dstCapacity * sizeof(TDestValue);
281
282 int status = inflate(&ctx.strm, Z_FINISH);
283 if (status != Z_STREAM_END)
284 {
285 inflateEnd(&ctx.strm);
286 throw io_error("Inflation failed. Decompressed BGZF data is too big.");
287 }
288
289 status = inflateEnd(&ctx.strm);
290 if (status != Z_OK)
291 throw io_error("BGZF inflateEnd() failed.");
292
293 // 3. CHECK FOOTER
294
295 // Check compressed length in buffer, compute CRC and compare with CRC in buffer.
296
297 unsigned crc = crc32(crc32(0u, NULL, 0u), (Bytef *)(dstBegin), dstCapacity - ctx.strm.avail_out);
298
299 srcBegin += compressedLen - BLOCK_FOOTER_LENGTH;
300 if (_bgzfUnpack32(srcBegin) != crc)
301 throw io_error("BGZF wrong checksum.");
302
303 if (_bgzfUnpack32(srcBegin + 4) != dstCapacity - ctx.strm.avail_out)
304 throw io_error("BGZF size mismatch.");
305
306 return (dstCapacity - ctx.strm.avail_out) / sizeof(TDestValue);
307}
308
309} // namespace seqan3::contrib
310
311#endif // defined(SEQAN3_HAS_ZLIB)
Provides seqan3::contrib::bgzf_thread_count.
T copy(T... args)
Provides various transformation traits used by the range module.
Provides exceptions used in the I/O module.
Provides seqan3::detail::magic_header.
T memset(T... args)
Provides utility functions for bit twiddling.
T uninitialized_copy(T... args)
Hide me