namespace seqan3::detail
{

//!\brief Loads 256 bits of integral data from (possibly unaligned) memory into a simd vector.
//!\param mem_addr Points to at least 32 readable bytes; alignment is not required (unaligned load below).
template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr);

//!\brief Transposes the given byte matrix in place using AVX2 shuffle/unpack instructions.
//!\param matrix A square matrix of `simd_traits<simd_t>::length` simd rows.
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

//!\brief Upcasts the low lanes of `src` to a wider signed lane type (sign extension).
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);

//!\brief Upcasts the low lanes of `src` to a wider unsigned lane type (zero extension).
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);

//!\brief Extracts the `index`-th 128-bit half of the 256-bit vector `src`.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src);

//!\brief Extracts the `index`-th 64-bit quarter of the 256-bit vector `src`.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src);

//!\brief Extracts the `index`-th 32-bit eighth of the 256-bit vector `src`.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src);

} // namespace seqan3::detail
77 namespace seqan3::detail
80 template <simd::simd_concept simd_t>
81 constexpr simd_t load_avx2(
void const * mem_addr)
83 return reinterpret_cast<simd_t
>(_mm256_loadu_si256(
reinterpret_cast<__m256i
const *
>(mem_addr)));
//!\brief Transposes the given 32x32 byte matrix in place using AVX2 unpack instructions.
//!\tparam simd_t The simd row type; must model seqan3::simd::simd_concept.
//!\param matrix The matrix to transpose; one simd vector per row.
//!
//! Works as a cascade of interleaving unpacks: epi8 -> epi16 -> epi32 -> epi64 -> 128-bit
//! lanes. Each pass interleaves pairs of rows; after all passes the rows land in
//! bit-reversed order, which the final loop undoes via the `bit_rev` look-up table.
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // AVX2 has no 128-bit-lane unpack; emulate _mm256_unpacklo_epi128 with a lane permute
    // (0x20 selects the low lane of a and the low lane of b).
    auto _mm256_unpacklo_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x20);
    };

    // Emulated _mm256_unpackhi_epi128 (0x31 selects the high lane of a and the high lane of b).
    auto _mm256_unpackhi_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x31);
    };

    // Look-up table reversing the low 4 bits of an index (plus the +16 offset group),
    // used to permute the transposed rows back into their final positions.
    static const uint8_t bit_rev[] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15,
                                      16,24,20,28,18,26,22,30,17,25,21,29,19,27,23,31};

    // NOTE(review): the declarations of tmp1/tmp2 were not visible in this extracted chunk;
    // reconstructed as 32-entry __m256i scratch arrays, which the loops below require.
    __m256i tmp1[32];

    // Pass 1: interleave adjacent rows bytewise (epi8).
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi8(reinterpret_cast<const __m256i &>(matrix[2*i]),
                                       reinterpret_cast<const __m256i &>(matrix[2*i+1]));
        tmp1[i+16] = _mm256_unpackhi_epi8(reinterpret_cast<const __m256i &>(matrix[2*i]),
                                          reinterpret_cast<const __m256i &>(matrix[2*i+1]));
    }

    __m256i tmp2[32];

    // Pass 2: interleave 16-bit chunks.
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }

    // Pass 3: interleave 32-bit chunks.
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+16] = _mm256_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }

    // Pass 4: interleave 64-bit chunks.
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]);
    }

    // Pass 5: interleave 128-bit lanes and scatter rows to their bit-reversed targets.
    for (int i = 0; i < 16; ++i)
    {
        matrix[bit_rev[i]] = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2*i],tmp2[2*i+1]));
        matrix[bit_rev[i+16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2*i],tmp2[2*i+1]));
    }
}
140 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
141 constexpr target_simd_t upcast_signed_avx2(source_simd_t
const & src)
143 __m128i
const & tmp = _mm256_castsi256_si128(
reinterpret_cast<__m256i
const &
>(src));
144 if constexpr (simd_traits<source_simd_t>::length == 32)
146 if constexpr (simd_traits<target_simd_t>::length == 16)
147 return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
148 if constexpr (simd_traits<target_simd_t>::length == 8)
149 return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
150 if constexpr (simd_traits<target_simd_t>::length == 4)
151 return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
153 else if constexpr (simd_traits<source_simd_t>::length == 16)
155 if constexpr (simd_traits<target_simd_t>::length == 8)
156 return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
157 if constexpr (simd_traits<target_simd_t>::length == 4)
158 return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
162 static_assert(simd_traits<source_simd_t>::length == 8,
"Expected 32 bit scalar type.");
163 return reinterpret_cast<target_simd_t
>(_mm256_cvtepi32_epi64(tmp));
167 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
168 constexpr target_simd_t upcast_unsigned_avx2(source_simd_t
const & src)
170 __m128i
const & tmp = _mm256_castsi256_si128(
reinterpret_cast<__m256i
const &
>(src));
171 if constexpr (simd_traits<source_simd_t>::length == 32)
173 if constexpr (simd_traits<target_simd_t>::length == 16)
174 return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
175 if constexpr (simd_traits<target_simd_t>::length == 8)
176 return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
177 if constexpr (simd_traits<target_simd_t>::length == 4)
178 return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
180 else if constexpr (simd_traits<source_simd_t>::length == 16)
182 if constexpr (simd_traits<target_simd_t>::length == 8)
183 return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
184 if constexpr (simd_traits<target_simd_t>::length == 4)
185 return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
189 static_assert(simd_traits<source_simd_t>::length == 8,
"Expected 32 bit scalar type.");
190 return reinterpret_cast<target_simd_t
>(_mm256_cvtepu32_epi64(tmp));
194 template <u
int8_t index, simd::simd_concept simd_t>
195 constexpr simd_t extract_half_avx2(simd_t
const & src)
197 return reinterpret_cast<simd_t
>(_mm256_castsi128_si256(
198 _mm256_extracti128_si256(
reinterpret_cast<__m256i
const &
>(src), index)));
201 template <u
int8_t index, simd::simd_concept simd_t>
202 constexpr simd_t extract_quarter_avx2(simd_t
const & src)
204 return reinterpret_cast<simd_t
>(_mm256_castsi128_si256(
205 _mm_cvtsi64x_si128(_mm256_extract_epi64(
reinterpret_cast<__m256i
const &
>(src), index))));
208 template <u
int8_t index, simd::simd_concept simd_t>
209 constexpr simd_t extract_eighth_avx2(simd_t
const & src)
211 return reinterpret_cast<simd_t
>(_mm256_castsi128_si256(
212 _mm_cvtsi32_si128(_mm256_extract_epi32(
reinterpret_cast<__m256i
const &
>(src), index))));