LLVM 16.0.0git
blake3_avx2.c
#include "blake3_impl.h"

#include <immintrin.h>

#define DEGREE 8

INLINE __m256i loadu(const uint8_t src[32]) {
  return _mm256_loadu_si256((const __m256i *)src);
}

INLINE void storeu(__m256i src, uint8_t dest[32]) {
  _mm256_storeu_si256((__m256i *)dest, src);
}

INLINE __m256i addv(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m256i xorv(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }

INLINE __m256i set1(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }

INLINE __m256i rot16(__m256i x) {
  return _mm256_shuffle_epi8(
      x, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
                         13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}

INLINE __m256i rot12(__m256i x) {
  return _mm256_or_si256(_mm256_srli_epi32(x, 12),
                         _mm256_slli_epi32(x, 32 - 12));
}

INLINE __m256i rot8(__m256i x) {
  return _mm256_shuffle_epi8(
      x, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
                         12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}

INLINE __m256i rot7(__m256i x) {
  return _mm256_or_si256(_mm256_srli_epi32(x, 7),
                         _mm256_slli_epi32(x, 32 - 7));
}
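
// The shuffle-based rotations (rot16, rot8) work because rotating a 32-bit
// lane by a whole number of bytes is a byte permutation, which vpshufb does in
// a single instruction; rot12 and rot7 are not byte-aligned, so they fall back
// to the classic shift/shift/or idiom. A scalar reference rotation, handy for
// checking the shuffle masks above (an illustrative helper, not part of the
// upstream file):
INLINE uint32_t rotr32_scalar(uint32_t w, uint32_t c) {
  return (w >> c) | (w << (32 - c));
}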

INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);

  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}
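
// round_fn is eight copies of the standard BLAKE3 round run in parallel: each
// __m256i lane carries the same state word for eight independent inputs. The
// first half above is the column step and the second half is the diagonal
// step, fully unrolled. A scalar sketch of the quarter-round g that each group
// of four statements replicates (illustrative only, not part of the upstream
// file):
INLINE void g_scalar(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                     uint32_t mx, uint32_t my) {
  *a = *a + *b + mx;               // a += b + m[2i]
  *d = rotr32_scalar(*d ^ *a, 16);
  *c = *c + *d;
  *b = rotr32_scalar(*b ^ *c, 12);
  *a = *a + *b + my;               // a += b + m[2i+1]
  *d = rotr32_scalar(*d ^ *a, 8);
  *c = *c + *d;
  *b = rotr32_scalar(*b ^ *c, 7);
}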

INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
  // is 22/33/66/77.
  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);

  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
  // 11/33.
  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);

  // Interleave 128-bit lanes.
  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
}
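
// Semantically, transpose_vecs treats its eight vectors as an 8x8 matrix of
// 32-bit words and transposes it, using three passes of progressively wider
// interleaves instead of 64 scalar moves. A scalar model of the net effect
// (illustrative only):
//
//   uint32_t in[8][8], out[8][8];
//   for (size_t i = 0; i < 8; i++)
//     for (size_t j = 0; j < 8; j++)
//       out[i][j] = in[j][i];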

INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m256i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
  out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
  out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
  out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
  out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
  out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
  out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
  out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
  out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
  out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
  out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
  out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
  out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
  for (size_t i = 0; i < 8; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[8]);
}
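
// After the two transpose_vecs calls, out[w] holds little-endian message word
// w of the current 64-byte block for all eight inputs, which is exactly the
// layout round_fn expects for its m[] argument. The _mm_prefetch hint pulls in
// data 256 bytes (four blocks) ahead of the block being loaded, hiding memory
// latency across iterations of the caller's block loop.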

INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m256i *out_lo, __m256i *out_hi) {
  const __m256i mask = _mm256_set1_epi32(-(int32_t)increment_counter);
  const __m256i add0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256i add1 = _mm256_and_si256(mask, add0);
  __m256i l = _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), add1);
  __m256i carry =
      _mm256_cmpgt_epi32(_mm256_xor_si256(add1, _mm256_set1_epi32(0x80000000)),
                         _mm256_xor_si256(l, _mm256_set1_epi32(0x80000000)));
  __m256i h =
      _mm256_sub_epi32(_mm256_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}
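
// load_counters builds the per-lane 64-bit counters counter+0..counter+7 (or
// eight copies of counter when increment_counter is false), split into 32-bit
// halves. AVX2 has no unsigned 32-bit compare, so the carry out of the low
// half is detected with a signed compare after flipping both sign bits:
// (l < add1) as unsigned  <=>  (l ^ 0x80000000) < (add1 ^ 0x80000000) as
// signed. The compare yields -1 in lanes that carried, so subtracting it adds
// one to the high half. Scalar model for one lane i (illustrative only):
//
//   uint32_t l = (uint32_t)counter + i;
//   uint32_t h = (uint32_t)(counter >> 32) + (l < i); // carry into high half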

static
void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
                       const uint32_t key[8], uint64_t counter,
                       bool increment_counter, uint8_t flags,
                       uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  __m256i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m256i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m256i block_flags_vec = set1(block_flags);
    __m256i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m256i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(h_vecs);
  storeu(h_vecs[0], &out[0 * sizeof(__m256i)]);
  storeu(h_vecs[1], &out[1 * sizeof(__m256i)]);
  storeu(h_vecs[2], &out[2 * sizeof(__m256i)]);
  storeu(h_vecs[3], &out[3 * sizeof(__m256i)]);
  storeu(h_vecs[4], &out[4 * sizeof(__m256i)]);
  storeu(h_vecs[5], &out[5 * sizeof(__m256i)]);
  storeu(h_vecs[6], &out[6 * sizeof(__m256i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m256i)]);
}
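
// Throughout the loop above, h_vecs[w] holds chaining-value word w for all
// eight inputs. The final transpose_vecs flips that layout so h_vecs[i] holds
// the full 8-word (32-byte) output of input i, letting each storeu write one
// input's hash contiguously into out.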

#if !defined(BLAKE3_NO_SSE41)
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out);
#else
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
                               size_t blocks, const uint32_t key[8],
                               uint64_t counter, bool increment_counter,
                               uint8_t flags, uint8_t flags_start,
                               uint8_t flags_end, uint8_t *out);
#endif

void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= DEGREE) {
    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
#if !defined(BLAKE3_NO_SSE41)
  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                         increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
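
// Example driver (illustrative only, not part of the upstream file): hashing
// eight single-block chunks in one call might look roughly like this, assuming
// the caller has already verified AVX2 support and filled the hypothetical
// blocks[] buffers with data. CHUNK_START/CHUNK_END are the chunk-boundary
// flags from blake3_impl.h.
//
//   uint8_t blocks[8][BLAKE3_BLOCK_LEN];
//   const uint8_t *inputs[8] = {blocks[0], blocks[1], blocks[2], blocks[3],
//                               blocks[4], blocks[5], blocks[6], blocks[7]};
//   uint8_t out[8 * BLAKE3_OUT_LEN];
//   blake3_hash_many_avx2(inputs, /*num_inputs=*/8, /*blocks=*/1, IV,
//                         /*counter=*/0, /*increment_counter=*/true,
//                         /*flags=*/0, /*flags_start=*/CHUNK_START,
//                         /*flags_end=*/CHUNK_END, out);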