simd_algorithm_avx2.hpp
// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

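/*!\file
 * \brief Provides algorithm implementations that use AVX2 intrinsics.
 */
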
#pragma once

#include <array>

#include <seqan3/utility/simd/concept.hpp>                        // seqan3::simd::simd_concept
#include <seqan3/utility/simd/detail/builtin_simd.hpp>            // seqan3::detail::builtin_simd
#include <seqan3/utility/simd/detail/builtin_simd_intrinsics.hpp> // intrinsics include for builtin simd
#include <seqan3/utility/simd/simd_traits.hpp>                    // seqan3::simd::simd_traits

//-----------------------------------------------------------------------------
// forward declare avx2 simd algorithms that use avx2 intrinsics
//-----------------------------------------------------------------------------

namespace seqan3::detail
{
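//!\brief Loads 256 bits of integral data from the given (possibly unaligned) memory address into a simd_t vector.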
template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr);

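//!\brief Stores the contents of the given simd_t vector to the given (possibly unaligned) memory address.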
template <simd::simd_concept simd_t>
constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec);

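//!\brief Transposes the given simd vector matrix in place; the AVX2 implementation handles a 32x32 byte matrix.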
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

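//!\brief Up-casts (sign-extends) the lower elements of src to the wider scalar type of target_simd_t.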
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);

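//!\brief Up-casts (zero-extends) the lower elements of src to the wider scalar type of target_simd_t.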
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);

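//!\brief Extracts the index-th 128-bit half of src and returns it in the lower half of the result vector.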
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src);

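//!\brief Extracts the index-th 64-bit quarter of src and returns it in the lowest 64 bits of the result vector.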
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src);

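//!\brief Extracts the index-th 32-bit eighth of src and returns it in the lowest 32 bits of the result vector.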
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src);

} // namespace seqan3::detail

//-----------------------------------------------------------------------------
// implementation
//-----------------------------------------------------------------------------

#ifdef __AVX2__

namespace seqan3::detail
{

template <simd::simd_concept simd_t>
constexpr simd_t load_avx2(void const * mem_addr)
{
    return reinterpret_cast<simd_t>(_mm256_loadu_si256(reinterpret_cast<__m256i const *>(mem_addr)));
}

template <simd::simd_concept simd_t>
constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem_addr), reinterpret_cast<__m256i const &>(simd_vec));
}

template <simd::simd_concept simd_t>
inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
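    // _mm256_permute2x128_si256 selects one 128-bit lane from each operand via the control byte:
    // 0x20 combines the low lanes of a and b, 0x31 combines the high lanes.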
    auto _mm256_unpacklo_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x20);
    };

    auto _mm256_unpackhi_epi128 = [] (__m256i const & a, __m256i const & b)
    {
        return _mm256_permute2x128_si256(a, b, 0x31);
    };

    // A look-up table to reverse the lowest 4 bits in order to permute the transposed rows.
    static const uint8_t bit_rev[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                                      16, 24, 20, 28, 18, 26, 22, 30, 17, 25, 21, 29, 19, 27, 23, 31};
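    // The five unpack rounds below interleave pairs of rows at successively wider element widths,
    // which leaves the transposed rows in bit-reversed order; bit_rev maps the loop index to the
    // destination row when the results are written back into matrix.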

    // transpose a 32x32 byte matrix
    __m256i tmp1[32];
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi8(
            reinterpret_cast<const __m256i &>(matrix[2*i]),
            reinterpret_cast<const __m256i &>(matrix[2*i+1])
        );
        tmp1[i+16] = _mm256_unpackhi_epi8(
            reinterpret_cast<const __m256i &>(matrix[2*i]),
            reinterpret_cast<const __m256i &>(matrix[2*i+1])
        );
    }
    __m256i tmp2[32];
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        tmp1[i] = _mm256_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+16] = _mm256_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        tmp2[i] = _mm256_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+16] = _mm256_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]);
    }
    for (int i = 0; i < 16; ++i)
    {
        matrix[bit_rev[i]] = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2*i], tmp2[2*i+1]));
        matrix[bit_rev[i+16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2*i], tmp2[2*i+1]));
    }
}

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src)
{
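    // Only the lower 128 bits of src are needed: all elements that fit into the wider target vector
    // reside in this lane.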
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepi32_epi64(tmp));
    }
}

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src)
{
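    // Same layout as in upcast_signed_avx2; the epu intrinsics zero-extend instead of sign-extend.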
    __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
    if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
        if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
            return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm256_cvtepu32_epi64(tmp));
    }
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm256_extracti128_si256(reinterpret_cast<__m256i const &>(src), index)));
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm_cvtsi64x_si128(_mm256_extract_epi64(reinterpret_cast<__m256i const &>(src), index))));
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx2(simd_t const & src)
{
    return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
        _mm_cvtsi32_si128(_mm256_extract_epi32(reinterpret_cast<__m256i const &>(src), index))));
}

} // namespace seqan3::detail

#endif // __AVX2__
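
A minimal usage sketch (not part of the header above). It assumes a build with AVX2 enabled and uses seqan3::simd::simd_type_t to obtain native 256-bit vector types; the variable names buffer and out are illustrative. The seqan3::detail functions shown here are internal and would normally be reached through the public seqan3::simd::load, seqan3::simd::store and seqan3::simd::upcast wrappers.

#include <cstdint>

#include <seqan3/utility/simd/all.hpp>

int main()
{
    using int8x32_t = seqan3::simd::simd_type_t<int8_t>;   // 32 x 8 bit elements with AVX2
    using int16x16_t = seqan3::simd::simd_type_t<int16_t>; // 16 x 16 bit elements with AVX2

    int8_t buffer[32]{};
    buffer[0] = -1;

    // Load 256 bits from (possibly unaligned) memory ...
    int8x32_t vec = seqan3::detail::load_avx2<int8x32_t>(buffer);

    // ... sign-extend the lower 16 elements to 16 bit ...
    int16x16_t wide = seqan3::detail::upcast_signed_avx2<int16x16_t>(vec);

    // ... and store the widened vector back to memory.
    int16_t out[16]{};
    seqan3::detail::store_avx2(out, wide);
}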