SeqAn3 3.4.0-rc.1
The Modern C++ library for sequence analysis.
Loading...
Searching...
No Matches
simd_algorithm_sse4.hpp
Go to the documentation of this file.
1// SPDX-FileCopyrightText: 2006-2024 Knut Reinert & Freie Universität Berlin
2// SPDX-FileCopyrightText: 2016-2024 Knut Reinert & MPI für molekulare Genetik
3// SPDX-License-Identifier: BSD-3-Clause
4
10#pragma once
11
12#include <array>
13
18
19//-----------------------------------------------------------------------------
20// forward declare sse4 simd algorithms that use sse4 intrinsics
21//-----------------------------------------------------------------------------
22
23namespace seqan3::detail
24{
28template <simd::simd_concept simd_t>
29constexpr simd_t load_sse4(void const * mem_addr);
30
34template <simd::simd_concept simd_t>
35constexpr void store_sse4(void * mem_addr, simd_t const & simd_vec);
36
40template <simd::simd_concept simd_t>
42
46template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
47constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src);
48
52template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
53constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src);
54
58template <uint8_t index, simd::simd_concept simd_t>
59constexpr simd_t extract_half_sse4(simd_t const & src);
60
64template <uint8_t index, simd::simd_concept simd_t>
65constexpr simd_t extract_quarter_sse4(simd_t const & src);
66
70template <uint8_t index, simd::simd_concept simd_t>
71constexpr simd_t extract_eighth_sse4(simd_t const & src);
72
73} // namespace seqan3::detail
74
75//-----------------------------------------------------------------------------
76// implementation
77//-----------------------------------------------------------------------------
78
79#ifdef __SSE4_2__
80
81namespace seqan3::detail
82{
83
84template <simd::simd_concept simd_t>
85constexpr simd_t load_sse4(void const * mem_addr)
86{
87 return reinterpret_cast<simd_t>(_mm_loadu_si128(reinterpret_cast<__m128i const *>(mem_addr)));
88}
89
90template <simd::simd_concept simd_t>
91constexpr void store_sse4(void * mem_addr, simd_t const & simd_vec)
92{
93 _mm_storeu_si128(reinterpret_cast<__m128i *>(mem_addr), reinterpret_cast<__m128i const &>(simd_vec));
94}
95
96template <simd::simd_concept simd_t>
97inline void transpose_matrix_sse4(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
98{
99 static_assert(simd_traits<simd_t>::length == simd_traits<simd_t>::max_length, "Expects byte scalar type.");
100 static_assert(is_native_builtin_simd_v<simd_t>, "The passed simd vector is not a native SSE4 simd vector type.");
101 static_assert(is_builtin_simd_v<simd_t>, "The passed simd vector is not a builtin vector type.");
102
103 // we need a look-up table to reverse the lowest 4 bits
104 // in order to place the permute the transposed rows
105 constexpr std::array<char, 16> bit_reverse{0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15};
106
107 // transpose a 16x16 byte matrix
108 //
109 // matrix =
110 // A0 A1 A2 ... Ae Af
111 // B0 B1 B2 ... Be Bf
112 // ...
113 // P0 P1 P2 ... Pe Pf
114 __m128i tmp1[16];
115 for (int i = 0; i < 8; ++i)
116 {
117 tmp1[i] = _mm_unpacklo_epi8(reinterpret_cast<__m128i &>(matrix[2 * i]),
118 reinterpret_cast<__m128i &>(matrix[2 * i + 1]));
119 tmp1[i + 8] = _mm_unpackhi_epi8(reinterpret_cast<__m128i &>(matrix[2 * i]),
120 reinterpret_cast<__m128i &>(matrix[2 * i + 1]));
121 }
122 // tmp1[0] = A0 B0 A1 B1 ... A7 B7
123 // tmp1[1] = C0 D0 C1 D1 ... C7 D7
124 // ...
125 // tmp1[7] = O0 P0 O1 P1 ... O7 P7
126 // tmp1[8] = A8 B8 A9 B9 ... Af Bf
127 // ...
128 // tmp1[15] = O8 P8 O9 P9 ... Of Pf
129 __m128i tmp2[16];
130 for (int i = 0; i < 8; ++i)
131 {
132 tmp2[i] = _mm_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
133 tmp2[i + 8] = _mm_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
134 }
135 // tmp2[0] = A0 B0 C0 D0 ... A3 B3 C3 D3
136 // tmp2[1] = E0 F0 G0 H0 ... E3 F3 G3 H3
137 // ...
138 // tmp2[3] = M0 N0 O0 P0 ... M3 N3 O3 P3
139 // tmp2[4] = A8 B8 C8 D8 ... Ab Bb Cb Db
140 // ...
141 // tmp2[7] = M8 N8 O8 P8 ... Mb Nb Ob Pb
142 // tmp2[8] = A4 B4 C4 D4 ... A7 B7 C7 D7
143 // ..
144 // tmp2[12] = Ac Bc Cc Dc ... Af Bf Cf Df
145 // ...
146 // tmp2[15] = Mc Nc Oc Pc ... Mf Nf Of Pf
147 for (int i = 0; i < 8; ++i)
148 {
149 tmp1[i] = _mm_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
150 tmp1[i + 8] = _mm_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
151 }
152 // tmp1[0] = A0 B0 .... H0 A1 B1 .... H1
153 // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
154 // ...
155 // tmp1[4] = A0 B0 .... H0 A1 B1 .... H1
156 // tmp1[1] = I0 J0 .... P0 I1 J1 .... P1
157 for (int i = 0; i < 8; ++i)
158 {
159 matrix[bit_reverse[i]] = reinterpret_cast<simd_t>(_mm_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]));
160 matrix[bit_reverse[i + 8]] = reinterpret_cast<simd_t>(_mm_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]));
161 }
162}
163
164template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
165constexpr target_simd_t upcast_signed_sse4(source_simd_t const & src)
166{
167 if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
168 {
169 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
170 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi16(reinterpret_cast<__m128i const &>(src)));
171 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
172 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi32(reinterpret_cast<__m128i const &>(src)));
173 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
174 return reinterpret_cast<target_simd_t>(_mm_cvtepi8_epi64(reinterpret_cast<__m128i const &>(src)));
175 }
176 else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
177 {
178 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
179 return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi32(reinterpret_cast<__m128i const &>(src)));
180 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
181 return reinterpret_cast<target_simd_t>(_mm_cvtepi16_epi64(reinterpret_cast<__m128i const &>(src)));
182 }
183 else // cast from epi32 to epi64
184 {
185 static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
186 return reinterpret_cast<target_simd_t>(_mm_cvtepi32_epi64(reinterpret_cast<__m128i const &>(src)));
187 }
188}
189
190template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
191constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const & src)
192{
193 if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi8 ...
194 {
195 if constexpr (simd_traits<target_simd_t>::length == 8) // to epi16
196 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi16(reinterpret_cast<__m128i const &>(src)));
197 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
198 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi32(reinterpret_cast<__m128i const &>(src)));
199 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
200 return reinterpret_cast<target_simd_t>(_mm_cvtepu8_epi64(reinterpret_cast<__m128i const &>(src)));
201 }
202 else if constexpr (simd_traits<source_simd_t>::length == 8) // cast from epi16 ...
203 {
204 if constexpr (simd_traits<target_simd_t>::length == 4) // to epi32
205 return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi32(reinterpret_cast<__m128i const &>(src)));
206 if constexpr (simd_traits<target_simd_t>::length == 2) // to epi64
207 return reinterpret_cast<target_simd_t>(_mm_cvtepu16_epi64(reinterpret_cast<__m128i const &>(src)));
208 }
209 else // cast from epi32 to epi64
210 {
211 static_assert(simd_traits<source_simd_t>::length == 4, "Expected 32 bit scalar type.");
212 return reinterpret_cast<target_simd_t>(_mm_cvtepu32_epi64(reinterpret_cast<__m128i const &>(src)));
213 }
214}
215
216template <uint8_t index, simd::simd_concept simd_t>
217constexpr simd_t extract_half_sse4(simd_t const & src)
218{
219 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), (index) << 3));
220}
221
222template <uint8_t index, simd::simd_concept simd_t>
223constexpr simd_t extract_quarter_sse4(simd_t const & src)
224{
225 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 2));
226}
227
228template <uint8_t index, simd::simd_concept simd_t>
229constexpr simd_t extract_eighth_sse4(simd_t const & src)
230{
231 return reinterpret_cast<simd_t>(_mm_srli_si128(reinterpret_cast<__m128i const &>(src), index << 1));
232}
233
234} // namespace seqan3::detail
235
236#endif // __SSE4_2__
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Defines the requirements of a matrix (e.g. score matrices, trace matrices).
Definition matrix_concept.hpp:58
The internal SeqAn3 namespace.
Definition aligned_sequence_concept.hpp:26
void transpose_matrix_sse4(std::array< simd_t, simd_traits< simd_t >::length > &matrix)
Transposes the given simd vector matrix.
constexpr simd_t extract_eighth_sse4(simd_t const &src)
Extracts one eighth of the given simd vector and stores it in the lower eighth of the target vector.
constexpr simd_t extract_half_sse4(simd_t const &src)
Extracts one half of the given simd vector and stores the result in the lower half of the target vector.
constexpr target_simd_t upcast_signed_sse4(source_simd_t const &src)
Upcasts the given vector into the target vector using signed extension of packed values.
constexpr void store_sse4(void *mem_addr, simd_t const &simd_vec)
Store simd_t size bits of integral data into memory.
constexpr target_simd_t upcast_unsigned_sse4(source_simd_t const &src)
Upcasts the given vector into the target vector using unsigned extension of packed values.
constexpr simd_t extract_quarter_sse4(simd_t const &src)
Extracts one quarter of the given simd vector and stores it in the lower quarter of the target vector.
constexpr simd_t load_sse4(void const *mem_addr)
Load simd_t size bits of integral data from memory.
Provides seqan3::simd::simd_traits.
seqan3::simd::simd_traits is the trait class that provides uniform interface to the properties of sim...
Definition simd_traits.hpp:38
Provides seqan3::simd::simd_concept.
Hide me