SeqAn3 3.1.0 — the modern C++ library for sequence analysis.
File: simd_algorithm_sse4.hpp (documentation page for this file).
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2021, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2021, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <array>
16
21
22//-----------------------------------------------------------------------------
23// forward declare sse4 simd algorithms that use sse4 intrinsics
24//-----------------------------------------------------------------------------
25
namespace seqan3::detail
{

/*!\brief Load 128 bit of integral data into a simd vector (unaligned load).
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 * \param[in] mem_addr Pointer to at least 16 bytes of readable memory; no alignment requirement.
 * \returns The loaded simd vector.
 */
template <simd::simd_concept simd_t>
constexpr simd_t load_sse4(void const * mem_addr);

/*!\brief Store a simd vector as 128 bit of integral data to memory (unaligned store).
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 * \param[out] mem_addr Pointer to at least 16 bytes of writable memory; no alignment requirement.
 * \param[in] simd_vec The simd vector to store.
 */
template <simd::simd_concept simd_t>
constexpr void store_sse4(void * mem_addr, simd_t const & simd_vec);

/*!\brief Transpose the given 16x16 byte matrix in place.
 * \tparam simd_t The simd type; its scalar type must be 8 bit wide (length == max_length).
 * \param[in,out] matrix An array of 16 simd vectors, each holding one 16-byte row.
 */
template <simd::simd_concept simd_t>
inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

/*!\brief Upcast a simd vector to a vector with a wider scalar type using sign extension.
 * \tparam target_simd_t The simd type to convert to (fewer, wider elements).
 * \tparam source_simd_t The simd type to convert from.
 * \param[in] src The vector to upcast; only its low elements are converted.
 * \returns The sign-extended vector.
 */
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src);

/*!\brief Upcast a simd vector to a vector with a wider scalar type using zero extension.
 * \tparam target_simd_t The simd type to convert to (fewer, wider elements).
 * \tparam source_simd_t The simd type to convert from.
 * \param[in] src The vector to upcast; only its low elements are converted.
 * \returns The zero-extended vector.
 */
template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src);

/*!\brief Extract one 64-bit half of the vector; it is returned in the lower half of the result.
 * \tparam index Which half to extract: 0 (lower) or 1 (upper).
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 */
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_sse4(simd_t const & src);

/*!\brief Extract one 32-bit quarter of the vector; it is returned in the lower quarter of the result.
 * \tparam index Which quarter to extract: 0 (lowest) to 3 (highest).
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 */
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_sse4(simd_t const & src);

/*!\brief Extract one 16-bit eighth of the vector; it is returned in the lower eighth of the result.
 * \tparam index Which eighth to extract: 0 (lowest) to 7 (highest).
 * \tparam simd_t The simd type; must model seqan3::simd::simd_concept.
 */
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_sse4(simd_t const & src);

} // namespace seqan3::detail
77
78//-----------------------------------------------------------------------------
79// implementation
80//-----------------------------------------------------------------------------
81
82#ifdef __SSE4_2__
83
84namespace seqan3::detail
85{
86
87template <simd::simd_concept simd_t>
88constexpr simd_t load_sse4(void const * mem_addr)
89{
90 return reinterpret_cast<simd_t>(_mm_loadu_si128(reinterpret_cast<__m128i const *>(mem_addr)));
91}
92
93template <simd::simd_concept simd_t>
94constexpr simd_t store_sse4(void * mem_addr, simd_t const & simd_vec)
95{
96 _mm_storeu_si128(reinterpret_cast<__m128i *>(mem_addr), reinterpret_cast<__m128i const &>(simd_vec));
97}
98
/*!\brief Transpose the given 16x16 byte matrix in place.
 * \tparam simd_t The simd type; its scalar type must be 8 bit wide (length == max_length).
 * \param[in,out] matrix An array of 16 simd vectors, each holding one 16-byte row; on return,
 *                       row j holds the former column j.
 *
 * Works in four interleaving passes (epi8 -> epi16 -> epi32 -> epi64); each pass doubles the
 * number of consecutive rows interleaved. The final pass writes the rows through a bit-reversal
 * permutation to land each transposed row at its correct index.
 */
template <simd::simd_concept simd_t>
inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    static_assert(simd_traits<simd_t>::length == simd_traits<simd_t>::max_length, "Expects byte scalar type.");
    static_assert(is_native_builtin_simd_v<simd_t>, "The passed simd vector is not a native SSE4 simd vector type.");
    static_assert(is_builtin_simd_v<simd_t>, "The passed simd vector is not a builtin vector type.");

    // we need a look-up table that reverses the lowest 4 bits
    // in order to place the transposed rows at their correct target position
    constexpr std::array<char, 16> bit_reverse{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};

    // transpose a 16x16 byte matrix
    //
    // matrix =
    // A0 A1 A2 ... Ae Af
    // B0 B1 B2 ... Be Bf
    // ...
    // P0 P1 P2 ... Pe Pf
    __m128i tmp1[16];
    // pass 1: interleave bytes of adjacent row pairs
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
                                    reinterpret_cast<__m128i &>(matrix[2*i+1]));
        tmp1[i+8] = _mm_unpackhi_epi8(reinterpret_cast<__m128i &>(matrix[2*i]),
                                      reinterpret_cast<__m128i &>(matrix[2*i+1]));
    }
    // tmp1[0] = A0 B0 A1 B1 ... A7 B7
    // tmp1[1] = C0 D0 C1 D1 ... C7 D7
    // ...
    // tmp1[7] = O0 P0 O1 P1 ... O7 P7
    // tmp1[8] = A8 B8 A9 B9 ... Af Bf
    // ...
    // tmp1[15] = O8 P8 O9 P9 ... Of Pf
    __m128i tmp2[16];
    // pass 2: interleave 16-bit pairs, giving 4-row groups per element position
    for (int i = 0; i < 8; ++i)
    {
        tmp2[i] = _mm_unpacklo_epi16(tmp1[2*i], tmp1[2*i+1]);
        tmp2[i+8] = _mm_unpackhi_epi16(tmp1[2*i], tmp1[2*i+1]);
    }
    // tmp2[0] = A0 B0 C0 D0 ... A3 B3 C3 D3
    // tmp2[1] = E0 F0 G0 H0 ... E3 F3 G3 H3
    // ...
    // tmp2[3] = M0 N0 O0 P0 ... M3 N3 O3 P3
    // tmp2[4] = A8 B8 C8 D8 ... Ab Bb Cb Db
    // ...
    // tmp2[7] = M8 N8 O8 P8 ... Mb Nb Ob Pb
    // tmp2[8] = A4 B4 C4 D4 ... A7 B7 C7 D7
    // ..
    // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
    // ...
    // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
    // pass 3: interleave 32-bit groups, giving 8-row groups per element position
    for (int i = 0; i < 8; ++i)
    {
        tmp1[i] = _mm_unpacklo_epi32(tmp2[2*i], tmp2[2*i+1]);
        tmp1[i+8] = _mm_unpackhi_epi32(tmp2[2*i], tmp2[2*i+1]);
    }
    // tmp1[0] = A0 B0 ... H0 A1 B1 ... H1
    // tmp1[1] = I0 J0 ... P0 I1 J1 ... P1
    // ...
    // tmp1[8] = A2 B2 ... H2 A3 B3 ... H3
    // tmp1[9] = I2 J2 ... P2 I3 J3 ... P3
    // pass 4: combine 64-bit halves to full 16-byte columns; bit_reverse maps the
    // interleaved order back to the natural row index
    for (int i = 0; i < 8; ++i)
    {
        matrix[bit_reverse[i]] = reinterpret_cast<simd_t>(_mm_unpacklo_epi64(tmp1[2*i], tmp1[2*i+1]));
        matrix[bit_reverse[i+8]] = reinterpret_cast<simd_t>(_mm_unpackhi_epi64(tmp1[2*i], tmp1[2*i+1]));
    }
}
166
167template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
168constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src)
169{
170 if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
171 {
172 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
173 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi16(reinterpret_cast<__m128i const &>(src)));
174 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
175 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi32(reinterpret_cast<__m128i const &>(src)));
176 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
177 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi64(reinterpret_cast<__m128i const &>(src)));
178 }
179 else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
180 {
181 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
182 return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi32(reinterpret_cast<__m128i const &>(src)));
183 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
184 return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi64(reinterpret_cast<__m128i const &>(src)));
185 }
186 else // cast from epi32 to epi64
187 {
188 static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
189 return reinterpret_cast<target_simd_t>(_mm_cvtepi32_epi64(reinterpret_cast<__m128i const &>(src)));
190 }
191}
192
193template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
194constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src)
195{
196 if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
197 {
198 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
199 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi16(reinterpret_cast<__m128i const &>(src)));
200 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
201 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi32(reinterpret_cast<__m128i const &>(src)));
202 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
203 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi64(reinterpret_cast<__m128i const &>(src)));
204 }
205 else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
206 {
207 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
208 return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi32(reinterpret_cast<__m128i const &>(src)));
209 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
210 return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi64(reinterpret_cast<__m128i const &>(src)));
211 }
212 else // cast from epi32 to epi64
213 {
214 static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
215 return reinterpret_cast<target_simd_t>(_mm_cvtepu32_epi64(reinterpret_cast<__m128i const &>(src)));
216 }
217}
218
219template <uint8_t index, simd::simd_concept simd_t>
220constexpr simd_t extract_half_sse4(simd_t const & src)
221{
222 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), (index) << 3));
223}
224
225template <uint8_t index, simd::simd_concept simd_t>
226constexpr simd_t extract_quarter_sse4(simd_t const & src)
227{
228 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 2));
229}
230
231template <uint8_t index, simd::simd_concept simd_t>
232constexpr simd_t extract_eighth_sse4(simd_t const & src)
233{
234 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 1));
235}
236
237} // namespace seqan3::detail
238
239#endif // __SSE4_2__
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<builtin_simd_t>.
Provides intrinsics include for builtin simd.
Provides seqan3::simd::simd_traits.
Provides seqan3::simd::simd_concept.