SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
simd_algorithm_avx2.hpp
// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
// SPDX-License-Identifier: BSD-3-Clause

#pragma once

#include <array>

#include <seqan3/utility/simd/concept.hpp>
#include <seqan3/utility/simd/detail/builtin_simd.hpp>
#include <seqan3/utility/simd/detail/builtin_simd_intrinsics.hpp>
#include <seqan3/utility/simd/simd_traits.hpp>

//-----------------------------------------------------------------------------
// forward declare avx2 simd algorithms that use avx2 intrinsics
//-----------------------------------------------------------------------------

namespace seqan3::detail
{

//!\brief Load simd_t size bits of integral data from memory.
template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr);

//!\brief Store simd_t size bits of integral data into memory.
template <simd::simd_concept simd_t>
constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec);

//!\brief Transposes the given simd vector matrix.
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

//!\brief Upcasts the given vector into the target vector using signed extension of packed values.
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);

//!\brief Upcasts the given vector into the target vector using unsigned extension of packed values.
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);

//!\brief Extracts one half of the given simd vector and stores the result in the lower half of the target vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src);

//!\brief Extracts one quarter of the given simd vector and stores it in the lower quarter of the target vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src);

//!\brief Extracts one eighth of the given simd vector and stores it in the lower eighth of the target vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src);

} // namespace seqan3::detail

//-----------------------------------------------------------------------------
// implementation
//-----------------------------------------------------------------------------

#ifdef __AVX2__

namespace seqan3::detail
{

template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr)
{
    return reinterpret_cast<simd_t>(_mm256_loadu_si256(reinterpret_cast<__m256i const *>(mem_addr)));
}

template <simd::simd_concept simd_t>
constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem_addr), reinterpret_cast<__m256i const &>(simd_vec));
}
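
// Implementation note: load_avx2/store_avx2 wrap the unaligned 256-bit intrinsics
// _mm256_loadu_si256/_mm256_storeu_si256, so mem_addr does not need to be 32-byte aligned.
// A minimal round-trip sketch (assuming the public seqan3::simd::simd_type_t alias):
//
//     using int16x16_t = seqan3::simd::simd_type_t<int16_t, 16>;
//     std::array<int16_t, 16> buffer{};
//     int16x16_t vec = seqan3::detail::load_avx2<int16x16_t>(buffer.data());
//     seqan3::detail::store_avx2(buffer.data(), vec);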

template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
    auto _mm256_unpacklo_epi128 = [](__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x20);
    };

    auto _mm256_unpackhi_epi128 = [](__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x31);
    };

    // A look-up table to reverse the lowest 4 bits in order to permute the transposed rows.
    static const uint8_t bit_rev[] = {0,  8,  4,  12, 2,  10, 6,  14, 1,  9,  5,  13, 3,  11, 7,  15,
                                      16, 24, 20, 28, 18, 26, 22, 30, 17, 25, 21, 29, 19, 27, 23, 31};

    // transpose a 32x32 byte matrix
    __m256i tmp1[32];
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
                                       reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
        tmp1[i + 16] = _mm256_unpackhi_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
                                            reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
    }
    __m256i tmp2[32];
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 16] = _mm256_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 16] = _mm256_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 16] = _mm256_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        matrix[bit_rev[i]] = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
        matrix[bit_rev[i + 16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
    }
}
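
// Implementation note: the transpose above is a butterfly network. Each of the five rounds
// interleaves register pairs at twice the previous element width (8, 16, 32, 64 and the
// emulated 128-bit unpack); after log2(32) = 5 rounds every register holds one column of the
// original 32x32 byte matrix, but the columns come out in bit-reversed row order, which the
// bit_rev look-up table undoes in the final loop.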

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src)
{
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepi32_epi64(tmp));
    }
}
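
// Implementation note: only the lower 128-bit lane of src is consumed
// (_mm256_castsi256_si128); the _mm256_cvtepi*_epi* intrinsics then sign-extend as many
// packed source elements as fit into the 256-bit target. Upcasting a 32x int8_t vector to
// 16x int16_t, for example, widens elements 0..15 and discards elements 16..31.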

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src)
{
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepu32_epi64(tmp));
    }
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm256_castsi128_si256(_mm256_extracti128_si256(reinterpret_cast<__m256i const &>(src), index)));
}
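
// Implementation note: extract_half_avx2 selects the lower (index == 0) or upper (index == 1)
// 128-bit lane via _mm256_extracti128_si256 and places it in the lower half of the returned
// vector; the upper half after _mm256_castsi128_si256 is left unspecified by the intrinsic.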

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm256_castsi128_si256(_mm_cvtsi64_si128(_mm256_extract_epi64(reinterpret_cast<__m256i const &>(src), index))));
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm256_castsi128_si256(_mm_cvtsi32_si128(_mm256_extract_epi32(reinterpret_cast<__m256i const &>(src), index))));
}
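
// Implementation note: extract_quarter_avx2 and extract_eighth_avx2 follow the same pattern
// one level down: they pull a single 64-bit or 32-bit chunk out of src
// (_mm256_extract_epi64/_mm256_extract_epi32) and move it into the lowest element of a fresh
// vector via _mm_cvtsi64_si128/_mm_cvtsi32_si128.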

} // namespace seqan3::detail

#endif // __AVX2__
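
// Usage sketch (illustrative only; assumes an AVX2 build and the public
// seqan3::simd::simd_type_t alias from <seqan3/utility/simd/simd.hpp>):
//
//     #include <array>
//     #include <seqan3/utility/simd/simd.hpp>
//
//     using int8x32_t = seqan3::simd::simd_type_t<int8_t, 32>;
//
//     void transpose_block(std::array<int8x32_t, 32> & block)
//     {
//     #ifdef __AVX2__
//         seqan3::detail::transpose_matrix_avx2(block); // in-place 32x32 byte transpose
//     #endif
//     }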