// Specific algorithm implementations for the AVX512 instruction set.

#include <array>
#include <cstdint>

#include <seqan3/utility/simd/concept.hpp>
#include <seqan3/utility/simd/detail/builtin_simd.hpp>
#include <seqan3/utility/simd/detail/intrinsics.hpp>
#include <seqan3/utility/simd/simd_traits.hpp>

namespace seqan3::detail
{
//!\brief Load simd_t size bits of integral data from memory (unaligned).
template <simd::simd_concept simd_t>
constexpr simd_t load_avx512(void const * mem_addr);

//!\brief Store simd_t size bits of integral data to memory (unaligned).
template <simd::simd_concept simd_t>
constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec);

//!\brief Transposes the given simd vector matrix in-place.
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

//!\brief Upcasts the given vector into the target vector using sign extension of packed values.
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src);

//!\brief Upcasts the given vector into the target vector using zero extension of packed values.
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src);

//!\brief Extracts one half (256 bits) of the given simd vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx512(simd_t const & src);

//!\brief Extracts one quarter (128 bits) of the given simd vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx512(simd_t const & src);

//!\brief Extracts one eighth (64 bits) of the given simd vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx512(simd_t const & src);

} // namespace seqan3::detail
namespace seqan3::detail
{
template <simd::simd_concept simd_t>
constexpr simd_t load_avx512(void const * mem_addr)
{
    return reinterpret_cast<simd_t>(_mm512_loadu_si512(mem_addr));
}
template <simd::simd_concept simd_t>
constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec)
{
    _mm512_storeu_si512(mem_addr, reinterpret_cast<__m512i const &>(simd_vec));
}
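
// Example (illustrative sketch, not part of the original header): round-trips 64 bytes through
// load_avx512/store_avx512. The int8x64_t alias via builtin_simd is an assumption made for
// demonstration only.
inline void example_load_store_roundtrip()
{
    using int8x64_t = typename builtin_simd<int8_t, 64>::type;

    std::array<int8_t, 64> in{}, out{};
    in[0] = 42;

    // Both underlying intrinsics are unaligned, so plain arrays work as source and target.
    int8x64_t vec = load_avx512<int8x64_t>(in.data());
    store_avx512(out.data(), vec);
    // out[0] == 42 and all other elements are 0, exactly mirroring `in`.
}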
#if defined(__AVX512BW__)
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // Intermediate buffers; the transposition ping-pongs between tmp1 and tmp2.
    std::array<__m512i, 64> tmp1;
    std::array<__m512i, 64> tmp2;

    // Step 1: Interleave adjacent rows at 8-bit granularity.
    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
                                       reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
        tmp1[i + 32] = _mm512_unpackhi_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
                                            reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
    }
    // Step 2: Interleave at 16-bit granularity.
    for (int i = 0; i < 32; ++i)
    {
        tmp2[i] = _mm512_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 32] = _mm512_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
    }

    // Step 3: Interleave at 32-bit granularity.
    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 32] = _mm512_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
    }

    // Step 4: Interleave at 64-bit granularity.
    for (int i = 0; i < 32; ++i)
    {
        tmp2[i] = _mm512_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 32] = _mm512_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    // Step 5: Interleave at 128-bit granularity. AVX512 has no 128-bit unpack, so it is emulated
    // with a two-source 64-bit permute: a permute index i < 8 selects the i-th 64-bit element of
    // `a`, an index i >= 8 selects the (i - 8)-th 64-bit element of `b`.
    auto _mm512_unpacklo_epi128 = [](__m512i const & a, __m512i const & b)
    {
        constexpr std::array<uint64_t, 8> lo_mask{0, 1, 8, 9, 2, 3, 10, 11};
        return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*lo_mask.data()), b);
    };

    auto _mm512_unpackhi_epi128 = [](__m512i const & a, __m512i const & b)
    {
        constexpr std::array<uint64_t, 8> hi_mask{4, 5, 12, 13, 6, 7, 14, 15};
        return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*hi_mask.data()), b);
    };

    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 32] = _mm512_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
    }
    // Step 6: Interleave at 256-bit granularity via a shuffle of 128-bit blocks across lanes.
    auto _mm512_unpacklo_epi256 = [](__m512i const & a, __m512i const & b)
    {
        return _mm512_shuffle_i64x2(a, b, 0b0100'0100);
    };

    auto _mm512_unpackhi_epi256 = [](__m512i const & a, __m512i const & b)
    {
        return _mm512_shuffle_i64x2(a, b, 0b1110'1110);
    };

    // Maps the position of the i-th unpacked vector to its final row in the transposed matrix.
    constexpr std::array<uint32_t, 64> reverse_idx_mask{
        0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30,
        1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31,
        32, 48, 40, 56, 36, 52, 44, 60, 34, 50, 42, 58, 38, 54, 46, 62,
        33, 49, 41, 57, 37, 53, 45, 61, 35, 51, 43, 59, 39, 55, 47, 63};

    for (int i = 0; i < 32; ++i)
    {
        int const idx = i * 2;
        matrix[reverse_idx_mask[idx]] = reinterpret_cast<simd_t>(_mm512_unpacklo_epi256(tmp1[idx], tmp1[idx + 1]));
        matrix[reverse_idx_mask[idx + 1]] = reinterpret_cast<simd_t>(_mm512_unpackhi_epi256(tmp1[idx], tmp1[idx + 1]));
    }
}
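
// Example (illustrative sketch): transpose_matrix_avx512 swaps rows and columns of a 64x64 byte
// matrix stored as 64 vectors of 64 int8_t values each; afterwards matrix[i][j] holds the value
// that was at matrix[j][i]. The int8x64_t alias is an assumption made for demonstration only.
inline void example_transpose()
{
    using int8x64_t = typename builtin_simd<int8_t, 64>::type;

    std::array<int8x64_t, 64> matrix{};
    for (int j = 0; j < 64; ++j)
        matrix[3][j] = 7; // fill row 3

    transpose_matrix_avx512(matrix);
    // Now column 3 is filled instead: matrix[j][3] == 7 for every j.
}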
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src)
{
    __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
    if constexpr (simd_traits<source_simd_t>::length == 64) // cast from 8-bit scalars ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 32) // ... to 16-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 16) // ... to 32-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi32(_mm512_castsi512_si128(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // ... to 64-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi64(_mm512_castsi512_si128(tmp)));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from 16-bit scalars ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // ... to 32-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi32(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // ... to 64-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi64(_mm512_castsi512_si128(tmp)));
    }
    else // cast from 32-bit to 64-bit scalars
    {
        static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm512_cvtepi32_epi64(_mm512_castsi512_si256(tmp)));
    }
}
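
// Example (illustrative sketch): upcasting a 64x 8-bit vector to 32x 16-bit sign-extends each of
// the first 32 elements, so a -1 stays -1. The type aliases are assumptions made for
// demonstration only.
inline void example_upcast_signed()
{
    using int8x64_t = typename builtin_simd<int8_t, 64>::type;
    using int16x32_t = typename builtin_simd<int16_t, 32>::type;

    int8x64_t src{};
    src[0] = -1;
    src[2] = 1;

    int16x32_t dst = upcast_signed_avx512<int16x32_t>(src);
    // dst[0] == -1 (sign-extended), dst[2] == 1; only the lower 32 source elements survive.
}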
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src)
{
    __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
    if constexpr (simd_traits<source_simd_t>::length == 64) // cast from 8-bit scalars ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 32) // ... to 16-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 16) // ... to 32-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // ... to 64-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi64(_mm512_castsi512_si128(tmp)));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from 16-bit scalars ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // ... to 32-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // ... to 64-bit
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi64(_mm512_castsi512_si128(tmp)));
    }
    else // cast from 32-bit to 64-bit scalars
    {
        static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm512_cvtepu32_epi64(_mm512_castsi512_si256(tmp)));
    }
}
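
// Example (illustrative sketch): the unsigned variant zero-extends instead. The same source byte
// 0xFF becomes 255 here, whereas upcast_signed_avx512 would turn it into -1. The aliases are
// assumptions made for demonstration only.
inline void example_upcast_unsigned()
{
    using uint8x64_t = typename builtin_simd<uint8_t, 64>::type;
    using uint16x32_t = typename builtin_simd<uint16_t, 32>::type;

    uint8x64_t src{};
    src[0] = 0xFF;

    uint16x32_t dst = upcast_unsigned_avx512<uint16x32_t>(src);
    // dst[0] == 255, because _mm512_cvtepu8_epi16 zero-extends each byte.
}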
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx512(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm512_castsi256_si512(_mm512_extracti64x4_epi64(reinterpret_cast<__m512i const &>(src), index)));
}
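
// Example (illustrative sketch): extract_half_avx512<1> moves the upper 256 bits of a 512-bit
// vector into the lower 256 bits of the result, i.e. elements 32..63 of a 64x int8_t vector
// become elements 0..31. The alias is an assumption made for demonstration only.
inline void example_extract_half()
{
    using int8x64_t = typename builtin_simd<int8_t, 64>::type;

    int8x64_t src{};
    src[32] = 9; // first element of the upper half

    int8x64_t upper = extract_half_avx512<1>(src);
    // upper[0] == 9; the content of the upper 256 bits of `upper` is unspecified.
}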
#    if defined(__AVX512DQ__)
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx512(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm512_castsi128_si512(_mm512_extracti64x2_epi64(reinterpret_cast<__m512i const &>(src), index)));
}
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx512(simd_t const & src)
{
    __m512i tmp = reinterpret_cast<__m512i const &>(src);

    // An eighth of a 512-bit vector is 64 bits, but the smallest extractable unit is a 128-bit
    // lane. For odd indices, first swap the two 64-bit halves within each 128-bit lane so that
    // the requested eighth lands in the lower half of its lane.
    if constexpr (index % 2 == 1)
        tmp = _mm512_shuffle_epi32(tmp, static_cast<_MM_PERM_ENUM>(0b0100'1110));

    // Then extract the 128-bit lane that contains the requested eighth.
    return reinterpret_cast<simd_t>(_mm512_castsi128_si512(_mm512_extracti64x2_epi64(tmp, index / 2)));
}
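
// Example (illustrative sketch): for extract_eighth_avx512<3>, the requested 64 bits are the
// upper half of 128-bit lane 1 (bytes 24..31). The shuffle swaps the 64-bit halves within each
// lane, after which extracting quarter 3 / 2 == 1 yields the desired eighth in the low 64 bits
// of the result. The alias is an assumption made for demonstration only.
inline void example_extract_eighth()
{
    using int8x64_t = typename builtin_simd<int8_t, 64>::type;

    int8x64_t src{};
    src[24] = 5; // byte 24 is the first byte of eighth number 3 (bits 192..255)

    int8x64_t eighth = extract_eighth_avx512<3>(src);
    // eighth[0] == 5; bytes beyond the first 8 of `eighth` are unspecified.
}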
#    endif // defined(__AVX512DQ__)
#endif // defined(__AVX512BW__)

} // namespace seqan3::detail