SeqAn3  3.0.3
The Modern C++ library for sequence analysis.
simd_algorithm_sse4.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 
21 
22 //-----------------------------------------------------------------------------
23 // forward declare sse4 simd algorithms that use sse4 intrinsics
24 //-----------------------------------------------------------------------------
25 
namespace seqan3::detail
{

//!\brief Load 128 bit of integer data from a (possibly unaligned) memory address into a simd vector.
template <simd::simd_concept simd_t>
constexpr simd_t load_sse4(void const * mem_addr);

//!\brief Store the contents of a simd vector to a (possibly unaligned) memory address.
template <simd::simd_concept simd_t>
constexpr void store_sse4(void * mem_addr, simd_t const & simd_vec);

//!\brief Transpose a 16x16 byte matrix given as an array of 16 simd vectors, in place.
template <simd::simd_concept simd_t>
inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

//!\brief Upcast the scalar elements of `src` to a wider signed scalar type (sign extension).
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src);

//!\brief Upcast the scalar elements of `src` to a wider unsigned scalar type (zero extension).
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src);

//!\brief Extract the `index`-th half (64 bit) of `src`, moved to the low end of the vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_sse4(simd_t const & src);

//!\brief Extract the `index`-th quarter (32 bit) of `src`, moved to the low end of the vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_sse4(simd_t const & src);

//!\brief Extract the `index`-th eighth (16 bit) of `src`, moved to the low end of the vector.
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_sse4(simd_t const & src);

} // namespace seqan3::detail
77 
78 //-----------------------------------------------------------------------------
79 // implementation
80 //-----------------------------------------------------------------------------
81 
82 #ifdef __SSE4_2__
83 
84 namespace seqan3::detail
85 {
86 
87 template <simd::simd_concept simd_t>
88 constexpr simd_t load_sse4(void const * mem_addr)
89 {
90  return reinterpret_cast<simd_t>(_mm_loadu_si128(reinterpret_cast<__m128i const *>(mem_addr)));
91 }
92 
93 template <simd::simd_concept simd_t>
94 constexpr simd_t store_sse4(void * mem_addr, simd_t const & simd_vec)
95 {
96  _mm_storeu_si128(reinterpret_cast<__m128i *>(mem_addr), reinterpret_cast<__m128i const &>(simd_vec));
97 }
98 
//!\brief Transpose the given 16x16 byte matrix (16 simd vectors of 16 bytes each) in place.
//!\param matrix Array of 16 row vectors; on return row r, column c holds the former row c, column r.
template <simd::simd_concept simd_t>
inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    static_assert(simd_traits<simd_t>::length == simd_traits<simd_t>::max_length, "Expects byte scalar type.");
    static_assert(is_native_builtin_simd_v<simd_t>, "The passed simd vector is not a native SSE4 simd vector type.");
    static_assert(is_builtin_simd_v<simd_t>, "The passed simd vector is not a builtin vector type.");

    // we need a look-up table that reverses the lowest 4 bits
    // in order to place the transposed rows at their final index in `matrix`
    constexpr std::array<char, 16> bit_reverse{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};

    // transpose a 16x16 byte matrix
    //
    // matrix =
    // A0 A1 A2 ... Ae Af
    // B0 B1 B2 ... Be Bf
    // ...
    // P0 P1 P2 ... Pe Pf
    //
    // Stage 1: interleave adjacent row pairs at byte granularity.
    __m128i tmp1[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
                                    reinterpret_cast<__m128i &>(matrix[2*i+1]));
        tmp1[i+8] = _mm_unpackhi_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
                                      reinterpret_cast<__m128i &>(matrix[2*i+1]));
    }
    // tmp1[0] = A0 B0 A1 B1 ... A7 B7
    // tmp1[1] = C0 D0 C1 D1 ... C7 D7
    // ...
    // tmp1[7] = O0 P0 O1 P1 ... O7 P7
    // tmp1[8] = A8 B8 A9 B9 ... Af Bf
    // ...
    // tmp1[15] = O8 P8 O9 P9 ... Of Pf
    //
    // Stage 2: interleave the stage-1 pairs at 16 bit granularity (groups of 4 source rows).
    __m128i tmp2[16];
    for (int i = 0; i < 8; ++i)
    {
        tmp2[i] = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // tmp2[0] = A0 B0 C0 D0 ... A3 B3 C3 D3
    // tmp2[1] = E0 F0 G0 H0 ... E3 F3 G3 H3
    // ...
    // tmp2[3] = M0 N0 O0 P0 ... M3 N3 O3 P3
    // tmp2[4] = A8 B8 C8 D8 ... Ab Bb Cb Db
    // ...
    // tmp2[7] = M8 N8 O8 P8 ... Mb Nb Ob Pb
    // tmp2[8] = A4 B4 C4 D4 ... A7 B7 C7 D7
    // ..
    // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
    // ...
    // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
    //
    // Stage 3: interleave at 32 bit granularity (groups of 8 source rows), reusing tmp1.
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // tmp1[0] = A0 B0 .... H0 A1 B1 .... H1
    // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
    // ...
    //
    // Stage 4: interleave at 64 bit granularity; each result is one full transposed row.
    // The bit-reversed index maps the unpack-network output order back to row order.
    for (int i = 0; i < 8; ++i)
    {
        matrix[bit_reverse[i]] = reinterpret_cast<simd_t>(_mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
        matrix[bit_reverse[i+8]] = reinterpret_cast<simd_t>(_mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
    }
}
166 
167 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
168 constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src)
169 {
170  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
171  {
172  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
173  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi16(reinterpret_cast<__m128i const &>(src)));
174  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
175  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi32(reinterpret_cast<__m128i const &>(src)));
176  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
177  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi64(reinterpret_cast<__m128i const &>(src)));
178  }
179  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
180  {
181  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
182  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi32(reinterpret_cast<__m128i const &>(src)));
183  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
184  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi64(reinterpret_cast<__m128i const &>(src)));
185  }
186  else // cast from epi32 to epi64
187  {
188  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
189  return reinterpret_cast<target_simd_t>(_mm_cvtepi32_epi64(reinterpret_cast<__m128i const &>(src)));
190  }
191 }
192 
193 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
194 constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src)
195 {
196  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
197  {
198  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
199  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi16(reinterpret_cast<__m128i const &>(src)));
200  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
201  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi32(reinterpret_cast<__m128i const &>(src)));
202  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
203  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi64(reinterpret_cast<__m128i const &>(src)));
204  }
205  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
206  {
207  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
208  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi32(reinterpret_cast<__m128i const &>(src)));
209  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
210  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi64(reinterpret_cast<__m128i const &>(src)));
211  }
212  else // cast from epi32 to epi64
213  {
214  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
215  return reinterpret_cast<target_simd_t>(_mm_cvtepu32_epi64(reinterpret_cast<__m128i const &>(src)));
216  }
217 }
218 
219 template <uint8_t index, simd::simd_concept simd_t>
220 constexpr simd_t extract_half_sse4(simd_t const & src)
221 {
222  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), (index) << 3));
223 }
224 
225 template <uint8_t index, simd::simd_concept simd_t>
226 constexpr simd_t extract_quarter_sse4(simd_t const & src)
227 {
228  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 2));
229 }
230 
231 template <uint8_t index, simd::simd_concept simd_t>
232 constexpr simd_t extract_eighth_sse4(simd_t const & src)
233 {
234  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 1));
235 }
236 
237 } // namespace seqan3::detail
238 
239 #endif // __SSE4_2__
Provides seqan3::simd::simd_concept.
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Provides seqan3::simd::simd_traits.