blake3_sse41.c
#include "blake3_impl.h"

#include <immintrin.h>

#define DEGREE 4

#define _mm_shuffle_ps2(a, b, c)                                              \
  (_mm_castps_si128(                                                          \
      _mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

INLINE __m128i loadu(const uint8_t src[16]) {
  return _mm_loadu_si128((const __m128i *)src);
}

INLINE void storeu(__m128i src, uint8_t dest[16]) {
  _mm_storeu_si128((__m128i *)dest, src);
}

INLINE __m128i addv(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }

// Note that clang-format doesn't like the name "xor" for some reason.
INLINE __m128i xorv(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }

INLINE __m128i set1(uint32_t x) { return _mm_set1_epi32((int32_t)x); }

INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
}

INLINE __m128i rot16(__m128i x) {
  return _mm_shuffle_epi8(
      x, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2));
}

INLINE __m128i rot12(__m128i x) {
  return xorv(_mm_srli_epi32(x, 12), _mm_slli_epi32(x, 32 - 12));
}

INLINE __m128i rot8(__m128i x) {
  return _mm_shuffle_epi8(
      x, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1));
}

INLINE __m128i rot7(__m128i x) {
  return xorv(_mm_srli_epi32(x, 7), _mm_slli_epi32(x, 32 - 7));
}
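
// For reference, each rotN above is a 32-bit right-rotation by N applied to
// every lane: rot16 and rot8 use a pshufb byte shuffle, while rot12 and rot7
// use a shift/shift/xor pair. The scalar helper below is an illustrative
// sketch only (it is not part of the original file and is unused elsewhere);
// for every 32-bit lane x, rot16(x) computes rotr32_ref(x, 16), and so on.
INLINE uint32_t rotr32_ref(uint32_t w, uint32_t c) {
  return (w >> c) | (w << (32 - c));
}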

INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot16(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot12(*row1);
}

INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
               __m128i m) {
  *row0 = addv(addv(*row0, m), *row1);
  *row3 = xorv(*row3, *row0);
  *row3 = rot8(*row3);
  *row2 = addv(*row2, *row3);
  *row1 = xorv(*row1, *row2);
  *row1 = rot7(*row1);
}
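
// For orientation: g1 and g2 together form one BLAKE3 quarter-round (the "G"
// function), applied to four columns of the state at once, with each row
// vector holding one state word of all four columns. A scalar sketch of the
// same quarter-round, written with the illustrative rotr32_ref helper above
// (this function is a sketch only and is not part of the original
// implementation):
INLINE void g_ref(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d,
                  uint32_t mx, uint32_t my) {
  *a = *a + *b + mx; // the g1 half, mixing in message word mx
  *d = rotr32_ref(*d ^ *a, 16);
  *c = *c + *d;
  *b = rotr32_ref(*b ^ *c, 12);
  *a = *a + *b + my; // the g2 half, mixing in message word my
  *d = rotr32_ref(*d ^ *a, 8);
  *c = *c + *d;
  *b = rotr32_ref(*b ^ *c, 7);
}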

// Note the optimization here of leaving row1 as the unrotated row, rather than
// row0. All the message loads below are adjusted to compensate for this. See
// discussion at https://github.com/sneves/blake2-avx2/pull/4
INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
}

INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
  *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
  *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
  *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
}
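
// diagonalize() rotates rows 0, 2 and 3 of the 4x4 state so that the four
// diagonals line up as columns; g1/g2 can then mix the diagonals with the
// same column-parallel code, and undiagonalize() rotates the rows back
// (row1 stays fixed, per the note above). A scalar sketch of one such row
// rotation (illustrative only, not part of the original file), where a
// positive n moves each word n lanes toward the higher indices:
INLINE void rotate_row_ref(uint32_t row[4], int n) {
  uint32_t tmp[4];
  for (int i = 0; i < 4; i++) {
    tmp[(i + n) & 3] = row[i];
  }
  memcpy(row, tmp, sizeof(tmp));
}
// diagonalize   ~ rotate_row_ref(row0, 1); rotate_row_ref(row3, 2);
//                 rotate_row_ref(row2, 3);
// undiagonalize ~ the inverse rotations (3, 2 and 1), restoring the layout.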

INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags) {
  rows[0] = loadu((uint8_t *)&cv[0]);
  rows[1] = loadu((uint8_t *)&cv[4]);
  rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
  rows[3] = set4(counter_low(counter), counter_high(counter),
                 (uint32_t)block_len, (uint32_t)flags);

  __m128i m0 = loadu(&block[sizeof(__m128i) * 0]);
  __m128i m1 = loadu(&block[sizeof(__m128i) * 1]);
  __m128i m2 = loadu(&block[sizeof(__m128i) * 2]);
  __m128i m3 = loadu(&block[sizeof(__m128i) * 3]);

  __m128i t0, t1, t2, t3, tt;

  // Round 1. The first round permutes the message words from the original
  // input order, into the groups that get mixed in parallel.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
  t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
  t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 2. This round and all following rounds apply a fixed permutation
  // to the message words from the round before.
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 3
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 4
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 5
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 6
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
  m0 = t0;
  m1 = t1;
  m2 = t2;
  m3 = t3;

  // Round 7
  t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
  t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
  t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
  tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
  t1 = _mm_blend_epi16(tt, t1, 0xCC);
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
  diagonalize(&rows[0], &rows[2], &rows[3]);
  t2 = _mm_unpacklo_epi64(m3, m1);
  tt = _mm_blend_epi16(t2, m2, 0xC0);
  t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
  g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
  t3 = _mm_unpackhi_epi32(m1, m3);
  tt = _mm_unpacklo_epi32(m2, t3);
  t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
  g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
  undiagonalize(&rows[0], &rows[2], &rows[3]);
}
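
// Rounds 2-7 above all apply the same fixed permutation to the message words
// of the previous round; the shuffle/blend sequences bake that permutation
// into vector registers (combined with the load-order adjustment described
// above diagonalize()). As an illustrative sketch only (not part of the
// original file): the same permutation in scalar form, from which the
// MSG_SCHEDULE table in blake3_impl.h can be regenerated by starting from the
// identity order 0..15 and applying permute_ref once per round.
static const uint8_t MSG_PERMUTATION_REF[16] = {2, 6,  3,  10, 7,  0, 4,  13,
                                                1, 11, 12, 5,  9,  14, 15, 8};
INLINE void permute_ref(uint32_t m[16]) {
  uint32_t permuted[16];
  for (size_t i = 0; i < 16; i++) {
    permuted[i] = m[MSG_PERMUTATION_REF[i]];
  }
  memcpy(m, permuted, sizeof(permuted));
}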

void blake3_compress_in_place_sse41(uint32_t cv[8],
                                    const uint8_t block[BLAKE3_BLOCK_LEN],
                                    uint8_t block_len, uint64_t counter,
                                    uint8_t flags) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), (uint8_t *)&cv[0]);
  storeu(xorv(rows[1], rows[3]), (uint8_t *)&cv[4]);
}

void blake3_compress_xof_sse41(const uint32_t cv[8],
                               const uint8_t block[BLAKE3_BLOCK_LEN],
                               uint8_t block_len, uint64_t counter,
                               uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu(xorv(rows[0], rows[2]), &out[0]);
  storeu(xorv(rows[1], rows[3]), &out[16]);
  storeu(xorv(rows[2], loadu((uint8_t *)&cv[0])), &out[32]);
  storeu(xorv(rows[3], loadu((uint8_t *)&cv[4])), &out[48]);
}
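
// A minimal usage sketch for the two entry points above (illustrative only;
// the helper name is hypothetical, and it assumes the IV table and the
// CHUNK_START/CHUNK_END/ROOT flag bits declared in blake3_impl.h): hashing an
// input of at most BLAKE3_BLOCK_LEN bytes as the single block of the root
// chunk. The first 32 bytes of `out` are then the regular BLAKE3-256 digest.
INLINE void example_hash_one_block(const uint8_t *input, uint8_t input_len,
                                   uint8_t out[64]) {
  uint32_t cv[8];
  memcpy(cv, IV, sizeof(cv));            // unkeyed hashing starts from the IV
  uint8_t block[BLAKE3_BLOCK_LEN] = {0}; // partial blocks are zero-padded
  memcpy(block, input, input_len);
  blake3_compress_xof_sse41(cv, block, input_len, 0,
                            CHUNK_START | CHUNK_END | ROOT, out);
}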

INLINE void round_fn(__m128i v[16], __m128i m[16], size_t r) {
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[15] = rot16(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot12(v[4]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
  v[0] = addv(v[0], v[4]);
  v[1] = addv(v[1], v[5]);
  v[2] = addv(v[2], v[6]);
  v[3] = addv(v[3], v[7]);
  v[12] = xorv(v[12], v[0]);
  v[13] = xorv(v[13], v[1]);
  v[14] = xorv(v[14], v[2]);
  v[15] = xorv(v[15], v[3]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[15] = rot8(v[15]);
  v[8] = addv(v[8], v[12]);
  v[9] = addv(v[9], v[13]);
  v[10] = addv(v[10], v[14]);
  v[11] = addv(v[11], v[15]);
  v[4] = xorv(v[4], v[8]);
  v[5] = xorv(v[5], v[9]);
  v[6] = xorv(v[6], v[10]);
  v[7] = xorv(v[7], v[11]);
  v[4] = rot7(v[4]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);

  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot16(v[15]);
  v[12] = rot16(v[12]);
  v[13] = rot16(v[13]);
  v[14] = rot16(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot12(v[5]);
  v[6] = rot12(v[6]);
  v[7] = rot12(v[7]);
  v[4] = rot12(v[4]);
  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
  v[0] = addv(v[0], v[5]);
  v[1] = addv(v[1], v[6]);
  v[2] = addv(v[2], v[7]);
  v[3] = addv(v[3], v[4]);
  v[15] = xorv(v[15], v[0]);
  v[12] = xorv(v[12], v[1]);
  v[13] = xorv(v[13], v[2]);
  v[14] = xorv(v[14], v[3]);
  v[15] = rot8(v[15]);
  v[12] = rot8(v[12]);
  v[13] = rot8(v[13]);
  v[14] = rot8(v[14]);
  v[10] = addv(v[10], v[15]);
  v[11] = addv(v[11], v[12]);
  v[8] = addv(v[8], v[13]);
  v[9] = addv(v[9], v[14]);
  v[5] = xorv(v[5], v[10]);
  v[6] = xorv(v[6], v[11]);
  v[7] = xorv(v[7], v[8]);
  v[4] = xorv(v[4], v[9]);
  v[5] = rot7(v[5]);
  v[6] = rot7(v[6]);
  v[7] = rot7(v[7]);
  v[4] = rot7(v[4]);
}

INLINE void transpose_vecs(__m128i vecs[DEGREE]) {
  // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
  // 22/33. Note that this doesn't split the vector into two lanes, as the
  // AVX2 counterparts do.
  __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
  __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
  __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
  __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);

  // Interleave 64-bit lanes.
  __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
  __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
  __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
  __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);

  vecs[0] = abcd_0;
  vecs[1] = abcd_1;
  vecs[2] = abcd_2;
  vecs[3] = abcd_3;
}
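
// What transpose_vecs computes, as a scalar sketch (illustrative only, not
// part of the original file): a 4x4 transpose of 32-bit words, so that
// vecs[i] afterwards holds word i from each of the four original vectors.
INLINE void transpose_vecs_ref(uint32_t v[4][4]) {
  for (size_t i = 0; i < 4; i++) {
    for (size_t j = i + 1; j < 4; j++) {
      uint32_t t = v[i][j];
      v[i][j] = v[j][i];
      v[j][i] = t;
    }
  }
}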

INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
                               size_t block_offset, __m128i out[16]) {
  out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
  out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
  out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
  out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
  out[4] = loadu(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
  out[5] = loadu(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
  out[6] = loadu(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
  out[7] = loadu(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
  out[8] = loadu(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
  out[9] = loadu(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
  out[10] = loadu(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
  out[11] = loadu(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
  out[12] = loadu(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
  out[13] = loadu(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
  out[14] = loadu(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
  out[15] = loadu(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
  for (size_t i = 0; i < 4; ++i) {
    _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
  }
  transpose_vecs(&out[0]);
  transpose_vecs(&out[4]);
  transpose_vecs(&out[8]);
  transpose_vecs(&out[12]);
}

INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m128i *out_lo, __m128i *out_hi) {
  const __m128i mask = _mm_set1_epi32(-(int32_t)increment_counter);
  const __m128i add0 = _mm_set_epi32(3, 2, 1, 0);
  const __m128i add1 = _mm_and_si128(mask, add0);
  __m128i l = _mm_add_epi32(_mm_set1_epi32((int32_t)counter), add1);
  __m128i carry =
      _mm_cmpgt_epi32(_mm_xor_si128(add1, _mm_set1_epi32(0x80000000)),
                      _mm_xor_si128(l, _mm_set1_epi32(0x80000000)));
  __m128i h = _mm_sub_epi32(_mm_set1_epi32((int32_t)(counter >> 32)), carry);
  *out_lo = l;
  *out_hi = h;
}
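
// load_counters produces, for each lane i (0..3), the low and high halves of
// counter + i (or of counter alone when increment_counter is false). SSE has
// no unsigned 32-bit compare, so the carry out of the low half is detected
// with a signed compare after flipping the sign bit of both operands; where
// the addition wrapped, the compare yields an all-ones mask (-1), and
// subtracting that mask adds 1 to the high half. A scalar sketch of the same
// computation (illustrative only, not part of the original file):
INLINE void load_counters_ref(uint64_t counter, bool increment_counter,
                              uint32_t lo[4], uint32_t hi[4]) {
  for (uint32_t i = 0; i < 4; i++) {
    uint64_t c = counter + (increment_counter ? i : 0);
    lo[i] = (uint32_t)c;
    hi[i] = (uint32_t)(c >> 32);
  }
}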

static void blake3_hash4_sse41(const uint8_t *const *inputs, size_t blocks,
                               const uint32_t key[8], uint64_t counter,
                               bool increment_counter, uint8_t flags,
                               uint8_t flags_start, uint8_t flags_end,
                               uint8_t *out) {
  __m128i h_vecs[8] = {
      set1(key[0]), set1(key[1]), set1(key[2]), set1(key[3]),
      set1(key[4]), set1(key[5]), set1(key[6]), set1(key[7]),
  };
  __m128i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);
  uint8_t block_flags = flags | flags_start;

  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }
    __m128i block_len_vec = set1(BLAKE3_BLOCK_LEN);
    __m128i block_flags_vec = set1(block_flags);
    __m128i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    __m128i v[16] = {
        h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
        h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
        set1(IV[0]), set1(IV[1]), set1(IV[2]), set1(IV[3]),
        counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
    };
    round_fn(v, msg_vecs, 0);
    round_fn(v, msg_vecs, 1);
    round_fn(v, msg_vecs, 2);
    round_fn(v, msg_vecs, 3);
    round_fn(v, msg_vecs, 4);
    round_fn(v, msg_vecs, 5);
    round_fn(v, msg_vecs, 6);
    h_vecs[0] = xorv(v[0], v[8]);
    h_vecs[1] = xorv(v[1], v[9]);
    h_vecs[2] = xorv(v[2], v[10]);
    h_vecs[3] = xorv(v[3], v[11]);
    h_vecs[4] = xorv(v[4], v[12]);
    h_vecs[5] = xorv(v[5], v[13]);
    h_vecs[6] = xorv(v[6], v[14]);
    h_vecs[7] = xorv(v[7], v[15]);

    block_flags = flags;
  }

  transpose_vecs(&h_vecs[0]);
  transpose_vecs(&h_vecs[4]);
  // The first four vecs now contain the first half of each output, and the
  // second four vecs contain the second half of each output.
  storeu(h_vecs[0], &out[0 * sizeof(__m128i)]);
  storeu(h_vecs[4], &out[1 * sizeof(__m128i)]);
  storeu(h_vecs[1], &out[2 * sizeof(__m128i)]);
  storeu(h_vecs[5], &out[3 * sizeof(__m128i)]);
  storeu(h_vecs[2], &out[4 * sizeof(__m128i)]);
  storeu(h_vecs[6], &out[5 * sizeof(__m128i)]);
  storeu(h_vecs[3], &out[6 * sizeof(__m128i)]);
  storeu(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

INLINE void hash_one_sse41(const uint8_t *input, size_t blocks,
                           const uint32_t key[8], uint64_t counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
  uint32_t cv[8];
  memcpy(cv, key, BLAKE3_KEY_LEN);
  uint8_t block_flags = flags | flags_start;
  while (blocks > 0) {
    if (blocks == 1) {
      block_flags |= flags_end;
    }
    blake3_compress_in_place_sse41(cv, input, BLAKE3_BLOCK_LEN, counter,
                                   block_flags);
    input = &input[BLAKE3_BLOCK_LEN];
    blocks -= 1;
    block_flags = flags;
  }
  memcpy(out, cv, BLAKE3_OUT_LEN);
}

void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
                            size_t blocks, const uint32_t key[8],
                            uint64_t counter, bool increment_counter,
                            uint8_t flags, uint8_t flags_start,
                            uint8_t flags_end, uint8_t *out) {
  while (num_inputs >= DEGREE) {
    blake3_hash4_sse41(inputs, blocks, key, counter, increment_counter, flags,
                       flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    num_inputs -= DEGREE;
    out = &out[DEGREE * BLAKE3_OUT_LEN];
  }
  while (num_inputs > 0) {
    hash_one_sse41(inputs[0], blocks, key, counter, flags, flags_start,
                   flags_end, out);
    if (increment_counter) {
      counter += 1;
    }
    inputs += 1;
    num_inputs -= 1;
    out = &out[BLAKE3_OUT_LEN];
  }
}
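
// A usage sketch for the exported entry point above (illustrative only; the
// helper name is hypothetical, and the call pattern is an assumption based on
// how the portable blake3.c dispatcher drives hash_many implementations,
// using BLAKE3_CHUNK_LEN and the CHUNK_START/CHUNK_END flags from
// blake3_impl.h): hash four whole 1024-byte chunks, numbered chunk_counter
// through chunk_counter + 3, into four consecutive 32-byte chaining values.
INLINE void example_hash_four_chunks(const uint8_t *const chunks[4],
                                     const uint32_t key[8],
                                     uint64_t chunk_counter,
                                     uint8_t out[4 * BLAKE3_OUT_LEN]) {
  blake3_hash_many_sse41(chunks, 4, BLAKE3_CHUNK_LEN / BLAKE3_BLOCK_LEN, key,
                         chunk_counter, true, 0, CHUNK_START, CHUNK_END, out);
}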