SeqAn3  3.0.2
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
simd_algorithm_avx2.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 
20 
21 //-----------------------------------------------------------------------------
22 // forward declare avx2 simd algorithms that use avx2 intrinsics
23 //-----------------------------------------------------------------------------
24 
// Forward declarations of the AVX2 backends. The matching definitions further down in this file are only
// compiled when the macro __AVX2__ is defined.
25 namespace seqan3::detail
26 {
/*!\brief Loads data from `mem_addr` into a simd vector via `_mm256_loadu_si256`.
 * \tparam simd_t The simd type to return; must model seqan3::simd::simd_concept.
 * \param[in] mem_addr The address to load from.
 * \returns The loaded register reinterpreted as `simd_t`.
 */
30 template <simd::simd_concept simd_t>
31 constexpr simd_t load_avx2(void const * mem_addr);
32 
/*!\brief Transposes the given matrix in place; the implementation treats it as a 32x32 byte matrix.
 * \tparam simd_t The simd type of one matrix row; must model seqan3::simd::simd_concept.
 * \param[in,out] matrix The rows of the matrix; overwritten with the transposed rows.
 */
36 template <simd::simd_concept simd_t>
37 inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);
38 
/*!\brief Widens the elements in the lower 128 bits of `src` with the sign-extending `_mm256_cvtepi*` intrinsics.
 * \tparam target_simd_t The simd type to convert to; must model seqan3::simd::simd_concept.
 * \tparam source_simd_t The simd type to convert from; must model seqan3::simd::simd_concept.
 * \param[in] src The vector whose lower 128 bits are converted.
 * \returns The converted elements reinterpreted as `target_simd_t`.
 */
42 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
43 constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);
44 
/*!\brief Widens the elements in the lower 128 bits of `src` with the zero-extending `_mm256_cvtepu*` intrinsics.
 * \tparam target_simd_t The simd type to convert to; must model seqan3::simd::simd_concept.
 * \tparam source_simd_t The simd type to convert from; must model seqan3::simd::simd_concept.
 * \param[in] src The vector whose lower 128 bits are converted.
 * \returns The converted elements reinterpreted as `target_simd_t`.
 */
48 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
49 constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);
50 
/*!\brief Extracts the 128 bit half of `src` selected by `index` (via `_mm256_extracti128_si256`).
 * \tparam index Selects the half to extract.
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 * \param[in] src The vector to extract from.
 * \returns A vector of type `simd_t` carrying the selected half in its lower 128 bits.
 */
54 template <uint8_t index, simd::simd_concept simd_t>
55 constexpr simd_t extract_half_avx2(simd_t const & src);
56 
/*!\brief Extracts the 64 bit quarter of `src` selected by `index` (via `_mm256_extract_epi64`).
 * \tparam index Selects the quarter to extract.
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 * \param[in] src The vector to extract from.
 * \returns A vector of type `simd_t` carrying the selected quarter in its lowest 64 bits.
 */
60 template <uint8_t index, simd::simd_concept simd_t>
61 constexpr simd_t extract_quarter_avx2(simd_t const & src);
62 
/*!\brief Extracts the 32 bit eighth of `src` selected by `index` (via `_mm256_extract_epi32`).
 * \tparam index Selects the eighth to extract.
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 * \param[in] src The vector to extract from.
 * \returns A vector of type `simd_t` carrying the selected eighth in its lowest 32 bits.
 */
66 template <uint8_t index, simd::simd_concept simd_t>
67 constexpr simd_t extract_eighth_avx2(simd_t const & src);
68 
69 }
70 
71 //-----------------------------------------------------------------------------
72 // implementation
73 //-----------------------------------------------------------------------------
74 
75 #ifdef __AVX2__
76 
77 namespace seqan3::detail
78 {
79 
80 template <simd::simd_concept simd_t>
81 constexpr simd_t load_avx2(void const * mem_addr)
82 {
83  return reinterpret_cast<simd_t>(_mm256_loadu_si256(reinterpret_cast<__m256i const *>(mem_addr)));
84 }
85 
86 template <simd::simd_concept simd_t>
87 inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
88 {
89  // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
90  auto _mm256_unpacklo_epi128 = [] (__m256i const & a, __m256i const & b)
91  {
92  return _mm256_permute2x128_si256(a, b, 0x20);
93  };
94 
95  auto _mm256_unpackhi_epi128 = [] (__m256i const & a, __m256i const & b)
96  {
97  return _mm256_permute2x128_si256(a, b, 0x31);
98  };
99 
100  // A look-up table to reverse the lowest 4 bits in order to permute the transposed rows.
101  static const uint8_t bit_rev[] = { 0, 8, 4,12, 2,10, 6,14, 1, 9, 5,13, 3,11, 7,15,
102  16,24,20,28,18,26,22,30,17,25,21,29,19,27,23,31};
103 
104  // transpose a 32x32 byte matrix
105  __m256i tmp1[32];
106  for (int i = 0; i < 16; ++i)
107  {
108  tmp1[i] = _mm256_unpacklo_epi8(
109  reinterpret_cast<const __m256i &>(matrix[2*i]),
110  reinterpret_cast<const __m256i &>(matrix[2*i+1])
111  );
112  tmp1[i+16] = _mm256_unpackhi_epi8(
113  reinterpret_cast<const __m256i &>(matrix[2*i]),
114  reinterpret_cast<const __m256i &>(matrix[2*i+1])
115  );
116  }
117  __m256i tmp2[32];
118  for (int i = 0; i < 16; ++i)
119  {
120  tmp2[i] = _mm256_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
121  tmp2[i+16] = _mm256_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
122  }
123  for (int i = 0; i < 16; ++i)
124  {
125  tmp1[i] = _mm256_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
126  tmp1[i+16] = _mm256_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
127  }
128  for (int i = 0; i < 16; ++i)
129  {
130  tmp2[i] = _mm256_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]);
131  tmp2[i+16] = _mm256_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]);
132  }
133  for (int i = 0; i < 16; ++i)
134  {
135  matrix[bit_rev[i]] = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2*i],tmp2[2*i+1]));
136  matrix[bit_rev[i+16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2*i],tmp2[2*i+1]));
137  }
138 }
139 
140 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
141 constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src)
142 {
143  __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
144  if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
145  {
146  if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
147  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
148  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
149  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
150  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
151  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
152  }
153  else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
154  {
155  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
156  return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
157  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
158  return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
159  }
160  else // cast from epi32 to epi64
161  {
162  static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
163  return reinterpret_cast<target_simd_t>(_mm256_cvtepi32_epi64(tmp));
164  }
165 }
166 
167 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
168 constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src)
169 {
170  __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
171  if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
172  {
173  if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
174  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
175  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
176  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
177  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
178  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
179  }
180  else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
181  {
182  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
183  return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
184  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
185  return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
186  }
187  else // cast from epi32 to epi64
188  {
189  static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
190  return reinterpret_cast<target_simd_t>(_mm256_cvtepu32_epi64(tmp));
191  }
192 }
193 
194 template <uint8_t index, simd::simd_concept simd_t>
195 constexpr simd_t extract_half_avx2(simd_t const & src)
196 {
197  return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
198  _mm256_extracti128_si256(reinterpret_cast<__m256i const &>(src), index)));
199 }
200 
201 template <uint8_t index, simd::simd_concept simd_t>
202 constexpr simd_t extract_quarter_avx2(simd_t const & src)
203 {
204  return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
205  _mm_cvtsi64x_si128(_mm256_extract_epi64(reinterpret_cast<__m256i const &>(src), index))));
206 }
207 
208 template <uint8_t index, simd::simd_concept simd_t>
209 constexpr simd_t extract_eighth_avx2(simd_t const & src)
210 {
211  return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
212  _mm_cvtsi32_si128(_mm256_extract_epi32(reinterpret_cast<__m256i const &>(src), index))));
213 }
214 
215 } // namespace seqan3::detail
216 
217 #endif // __AVX2__
concept.hpp
Provides seqan3::simd::simd_concept.
simd_traits.hpp
Provides seqan3::simd::simd_traits.
builtin_simd_intrinsics.hpp
Provides intrinsics include for builtin simd.
array