SeqAn3  3.0.1
The Modern C++ library for sequence analysis.
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
simd_algorithm_sse4.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2020, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2020, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
18 
19 //-----------------------------------------------------------------------------
20 // forward declare sse4 simd algorithms that use sse4 intrinsics
21 //-----------------------------------------------------------------------------
22 
23 namespace seqan3::detail
24 {
28 template <simd::simd_concept simd_t>
29 constexpr simd_t load_sse4(void const * mem_addr);
30 
34 template <simd::simd_concept simd_t>
35 inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix);
36 
40 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
41 constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src);
42 
46 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
47 constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src);
48 
52 template <uint8_t index, simd::simd_concept simd_t>
53 constexpr simd_t extract_halve_sse4(simd_t const & src);
54 
58 template <uint8_t index, simd::simd_concept simd_t>
59 constexpr simd_t extract_quarter_sse4(simd_t const & src);
60 
64 template <uint8_t index, simd::simd_concept simd_t>
65 constexpr simd_t extract_eighth_sse4(simd_t const & src);
66 
67 }
68 
69 //-----------------------------------------------------------------------------
70 // implementation
71 //-----------------------------------------------------------------------------
72 
73 #ifdef __SSE4_2__
74 
75 namespace seqan3::detail
76 {
77 
78 template <simd::simd_concept simd_t>
79 constexpr simd_t load_sse4(void const * mem_addr)
80 {
81  return reinterpret_cast<simd_t>(_mm_loadu_si128(reinterpret_cast<__m128i const *>(mem_addr)));
82 }
83 
84 template <simd::simd_concept simd_t>
85 inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
86 {
87  static_assert(simd_traits<simd_t>::length == simd_traits<simd_t>::max_length, "Expects byte scalar type.");
88  static_assert(is_native_builtin_simd_v<simd_t>, "The passed simd vector is not a native SSE4 simd vector type.");
89  static_assert(is_builtin_simd_v<simd_t>, "The passed simd vector is not a builtin vector type.");
90 
91  // we need a look-up table to reverse the lowest 4 bits
92  // in order to place the permute the transposed rows
93  constexpr std::array<char, 16> bit_reverse{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
94 
95  // transpose a 16x16 byte matrix
96  //
97  // matrix =
98  // A0 A1 A2 ... Ae Af
99  // B0 B1 B2 ... Be Bf
100  // ...
101  // P0 P1 P2 ... Pe Pf
102  __m128i tmp1[16];
103  for (int i = 0; i < 8; ++i)
104  {
105  tmp1[i] = _mm_unpacklo_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
106  reinterpret_cast<__m128i &>(matrix[2*i+1]));
107  tmp1[i+8] = _mm_unpackhi_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
108  reinterpret_cast<__m128i &>(matrix[2*i+1]));
109  }
110  // tmp1[0] = A0 B0 A1 B1 ... A7 B7
111  // tmp1[1] = C0 D0 C1 D1 ... C7 D7
112  // ...
113  // tmp1[7] = O0 P0 O1 P1 ... O7 P7
114  // tmp1[8] = A8 B8 A9 B9 ... Af Bf
115  // ...
116  // tmp1[15] = O8 P8 O9 P9 ... Of Pf
117  __m128i tmp2[16];
118  for (int i = 0; i < 8; ++i)
119  {
120  tmp2[i] = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
121  tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
122  }
123  // tmp2[0] = A0 B0 C0 D0 ... A3 B3 C3 D3
124  // tmp2[1] = E0 F0 G0 H0 ... E3 F3 G3 H3
125  // ...
126  // tmp2[3] = M0 N0 O0 P0 ... M3 N3 O3 P3
127  // tmp2[4] = A8 B8 C8 D8 ... Ab Bb Cb Db
128  // ...
129  // tmp2[7] = M8 N8 O8 P8 ... Mb Nb Ob Pb
130  // tmp2[8] = A4 B4 C4 D4 ... A7 B7 C7 D7
131  // ..
132  // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
133  // ...
134  // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
135  for (int i = 0; i < 8; ++i)
136  {
137  tmp1[i] = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
138  tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
139  }
140  // tmp1[0] = A0 B0 .... H0 A1 B1 .... H1
141  // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
142  // ...
143  // tmp1[4] = A0 B0 .... H0 A1 B1 .... H1
144  // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
145  for (int i = 0; i < 8; ++i)
146  {
147  matrix[bit_reverse[i]] = reinterpret_cast<simd_t>(_mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
148  matrix[bit_reverse[i+8]] = reinterpret_cast<simd_t>(_mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
149  }
150 }
151 
152 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
153 constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src)
154 {
155  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
156  {
157  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
158  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi16(reinterpret_cast<__m128i const &>(src)));
159  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
160  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi32(reinterpret_cast<__m128i const &>(src)));
161  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
162  return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi64(reinterpret_cast<__m128i const &>(src)));
163  }
164  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
165  {
166  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
167  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi32(reinterpret_cast<__m128i const &>(src)));
168  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
169  return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi64(reinterpret_cast<__m128i const &>(src)));
170  }
171  else // cast from epi32 to epi64
172  {
173  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
174  return reinterpret_cast<target_simd_t>(_mm_cvtepi32_epi64(reinterpret_cast<__m128i const &>(src)));
175  }
176 }
177 
178 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
179 constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src)
180 {
181  if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
182  {
183  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
184  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi16(reinterpret_cast<__m128i const &>(src)));
185  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
186  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi32(reinterpret_cast<__m128i const &>(src)));
187  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
188  return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi64(reinterpret_cast<__m128i const &>(src)));
189  }
190  else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
191  {
192  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
193  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi32(reinterpret_cast<__m128i const &>(src)));
194  if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
195  return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi64(reinterpret_cast<__m128i const &>(src)));
196  }
197  else // cast from epi32 to epi64
198  {
199  static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
200  return reinterpret_cast<target_simd_t>(_mm_cvtepu32_epi64(reinterpret_cast<__m128i const &>(src)));
201  }
202 }
203 
204 template <uint8_t index, simd::simd_concept simd_t>
205 constexpr simd_t extract_halve_sse4(simd_t const & src)
206 {
207  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), (index) << 3));
208 }
209 
210 template <uint8_t index, simd::simd_concept simd_t>
211 constexpr simd_t extract_quarter_sse4(simd_t const & src)
212 {
213  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 2));
214 }
215 
216 template <uint8_t index, simd::simd_concept simd_t>
217 constexpr simd_t extract_eighth_sse4(simd_t const & src)
218 {
219  return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 1));
220 }
221 
222 } // namespace seqan3::detail
223 
224 #endif // __SSE4_2__
concept.hpp
Provides seqan3::simd::simd_concept.
simd_traits.hpp
Provides seqan3::simd::simd_traits.
builtin_simd_intrinsics.hpp
Provides intrinsics include for builtin simd.
std::array