SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
simd_algorithm_avx512.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <array>
13
18
19//-----------------------------------------------------------------------------
20// forward declare avx512 simd algorithms that use avx512 intrinsics
21//-----------------------------------------------------------------------------
22
23namespace seqan3::detail
24{
28template <simd::simd_concept simd_t>
29constexpr simd_t load_avx512(void const * mem_addr);
30
34template <simd::simd_concept simd_t>
35constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec);
36
40template <simd::simd_concept simd_t>
42
46template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
47constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src);
48
52template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
53constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src);
54
58template <uint8_t index, simd::simd_concept simd_t>
59constexpr simd_t extract_half_avx512(simd_t const & src);
60
64template <uint8_t index, simd::simd_concept simd_t>
65constexpr simd_t extract_quarter_avx512(simd_t const & src);
66
70template <uint8_t index, simd::simd_concept simd_t>
71constexpr simd_t extract_eighth_avx512(simd_t const & src);
72
73} // namespace seqan3::detail
74
75//-----------------------------------------------------------------------------
76// implementation
77//-----------------------------------------------------------------------------
78
79#ifdef __AVX512F__
80
81namespace seqan3::detail
82{
83
84template <simd::simd_concept simd_t>
85constexpr simd_t load_avx512(void const * mem_addr)
86{
87 return reinterpret_cast<simd_t>(_mm512_loadu_si512(mem_addr));
88}
89
90template <simd::simd_concept simd_t>
91constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec)
92{
93 _mm512_storeu_si512(mem_addr, reinterpret_cast<__m512i const &>(simd_vec));
94}
95
96# if defined(__AVX512BW__)
97template <simd::simd_concept simd_t>
98inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
99{
100 // Transposing a 64x64 byte matrix in 6x64 instructions using AVX512 intrinsics.
101 // Step 1: Unpack 8-bit operands.
102
103 // Note: _mm512_unpack* operates on 128-bit lanes, i.e. the following pattern applies:
104 // * for lower half ('lo')
105 // d[127:0] = interleave(a[63:0], b[63:0])
106 // d[255:128] = interleave(a[191:128], b[191:128])
107 // d[383:256] = interleave(a[319:256], b[319:256])
108 // d[511:384] = interleave(a[447:384], b[447:384])
109 // * for higher half ('hi')
110 // d[127:0] = interleave(a[127:64], b[127:64])
111 // d[255:128] = interleave(a[255:192], b[255:192])
112 // d[383:256] = interleave(a[319:320], b[383:320])
113 // d[511:384] = interleave(a[511:448], b[511:448])
114 __m512i tmp1[64];
115 for (int i = 0; i < 32; ++i)
116 {
117 tmp1[i] = _mm512_unpacklo_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
118 reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
119 tmp1[i + 32] = _mm512_unpackhi_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
120 reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
121 }
122
123 // Step 2: Unpack 16-bit operands.
124 __m512i tmp2[64];
125 for (int i = 0; i < 32; ++i)
126 {
127 tmp2[i] = _mm512_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
128 tmp2[i + 32] = _mm512_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
129 }
130 // Step 3: Unpack 32-bit operands.
131 for (int i = 0; i < 32; ++i)
132 {
133 tmp1[i] = _mm512_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
134 tmp1[i + 32] = _mm512_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
135 }
136 // Step 4: Unpack 64-bit operands.
137 for (int i = 0; i < 32; ++i)
138 {
139 tmp2[i] = _mm512_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
140 tmp2[i + 32] = _mm512_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
141 }
142
143 // Step 5: Unpack 128-bit operands.
144 // Helper function to emulate unpack of 128-bit across lanes using _mm512_permutex2var_epi64.
145 auto _mm512_unpacklo_epi128 = [](__m512i const & a, __m512i const & b)
146 {
147 constexpr std::array<uint64_t, 8> lo_mask{0, 1, 8, 9, 2, 3, 10, 11};
148 return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*lo_mask.data()), b);
149 };
150
151 auto _mm521_unpackhi_epi128 = [](__m512i const & a, __m512i const & b)
152 {
153 constexpr std::array<uint64_t, 8> hi_mask{4, 5, 12, 13, 6, 7, 14, 15};
154 return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*hi_mask.data()), b);
155 };
156
157 for (int i = 0; i < 32; ++i)
158 {
159 tmp1[i] = _mm512_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
160 tmp1[i + 32] = _mm521_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
161 }
162 // Step 6: Unpack 128-bit operands.
163 // Helper function to emulate unpack of 256-bit across lanes using _mm512_shuffle_i64x2.
164 auto _mm512_unpacklo_epi256 = [](__m512i const & a, __m512i const & b)
165 {
166 return _mm512_shuffle_i64x2(a, b, 0b0100'0100);
167 };
168
169 auto _mm521_unpackhi_epi256 = [](__m512i const & a, __m512i const & b)
170 {
171 return _mm512_shuffle_i64x2(a, b, 0b1110'1110);
172 };
173
174 // A look-up table to place the final transposed vector to the correct position in the
175 // original matrix.
176 // clang-format off
177 constexpr std::array<uint32_t, 64> reverse_idx_mask{
178 // 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
179 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
180 // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
181 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
182 // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
183 32, 48, 40, 56, 36, 52, 44, 60, 34, 50, 42, 58, 38, 54, 46, 62,
184 // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
185 33, 49, 41, 57, 37, 53, 45, 61, 35, 51, 43, 59, 39, 55, 47, 63};
186 // clang-format on
187
188 for (int i = 0; i < 32; ++i)
189 {
190 int const idx = i * 2;
191 matrix[reverse_idx_mask[idx]] = reinterpret_cast<simd_t>(_mm512_unpacklo_epi256(tmp1[idx], tmp1[idx + 1]));
192 matrix[reverse_idx_mask[idx + 1]] = reinterpret_cast<simd_t>(_mm521_unpackhi_epi256(tmp1[idx], tmp1[idx + 1]));
193 }
194}
195# endif // defined(__AVX512BW__)
196
197template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
198constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src)
199{
200 __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
201 if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
202 {
203 if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
204 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(tmp)));
205 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
206 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi32(_mm512_castsi512_si128(tmp)));
207 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
208 return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi64(_mm512_castsi512_si128(tmp)));
209 }
210 else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
211 {
212 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
213 return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi32(_mm512_castsi512_si256(tmp)));
214 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
215 return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi64(_mm512_castsi512_si128(tmp)));
216 }
217 else // cast from epi32 to epi64
218 {
219 static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
220 return reinterpret_cast<target_simd_t>(_mm512_cvtepi32_epi64(_mm512_castsi512_si256(tmp)));
221 }
222}
223
224template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
225constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src)
226{
227 __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
228 if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
229 {
230 if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
231 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(tmp)));
232 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
233 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(tmp)));
234 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
235 return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi64(_mm512_castsi512_si128(tmp)));
236 }
237 else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
238 {
239 if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
240 return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp)));
241 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
242 return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi64(_mm512_castsi512_si128(tmp)));
243 }
244 else // cast from epi32 to epi64
245 {
246 static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
247 return reinterpret_cast<target_simd_t>(_mm512_cvtepu32_epi64(_mm512_castsi512_si256(tmp)));
248 }
249}
250
251template <uint8_t index, simd::simd_concept simd_t>
252constexpr simd_t extract_half_avx512(simd_t const & src)
253{
254 return reinterpret_cast<simd_t>(
255 _mm512_castsi256_si512(_mm512_extracti64x4_epi64(reinterpret_cast<__m512i const &>(src), index)));
256}
257
258# if defined(__AVX512DQ__)
259template <uint8_t index, simd::simd_concept simd_t>
260constexpr simd_t extract_quarter_avx512(simd_t const & src)
261{
262 return reinterpret_cast<simd_t>(
263 _mm512_castsi128_si512(_mm512_extracti64x2_epi64(reinterpret_cast<__m512i const &>(src), index)));
264}
265
266template <uint8_t index, simd::simd_concept simd_t>
267constexpr simd_t extract_eighth_avx512(simd_t const & src)
268{
269 __m512i tmp = reinterpret_cast<__m512i const &>(src);
270
271 // for uneven index exchange higher 64 bits with lower 64 bits for each 128 bit lane.
272 if constexpr (index % 2 == 1)
273 tmp = _mm512_shuffle_epi32(tmp, 0b0100'1110); // := [1, 0, 3, 2].
274
275 return reinterpret_cast<simd_t>(_mm512_castsi128_si512(_mm512_extracti64x2_epi64(tmp, index / 2)));
276}
277# endif // defined(__AVX512DQ__)
278
279} // namespace seqan3::detail
280
281#endif // __AVX512F__
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Defines the requirements of a matrix (e.g. score matrices, trace matrices).
Definition matrix_concept.hpp:58
The internal SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
void transpose_matrix_avx512(std::array< simd_t, simd_traits< simd_t >::length > &matrix)
Transposes the given simd vector matrix.
constexpr simd_t extract_half_avx512(simd_t const &src)
Extracts one half of the given simd vector and stores the result in the lower half of the target vect...
constexpr simd_t load_avx512(void const *mem_addr)
Load simd_t size bits of integral data from memory.
constexpr target_simd_t upcast_signed_avx512(source_simd_t const &src)
Upcasts the given vector into the target vector using signed extension of packed values.
constexpr void store_avx512(void *mem_addr, simd_t const &simd_vec)
Store simd_t size bits of integral data into memory.
constexpr simd_t extract_quarter_avx512(simd_t const &src)
Extracts one quarter of the given simd vector and stores it in the lower quarter of the target vector...
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const &src)
Upcasts the given vector into the target vector using unsigned extension of packed values.
constexpr simd_t extract_eighth_avx512(simd_t const &src)
Extracts one eighth of the given simd vector and stores it in the lower eighth of the target vector.
Provides seqan3::simd::simd_traits.
seqan3::simd::simd_traits is the trait class that provides uniform interface to the properties of sim...
Definition simd_traits.hpp:38
Provides seqan3::simd::simd_concept.
Hide me