LLVM  16.0.0git
blake3_portable.c
Go to the documentation of this file.
1 #include "blake3_impl.h"
2 #include <string.h>
3 
5  return (w >> c) | (w << (32 - c));
6 }
7 
8 INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d,
9  uint32_t x, uint32_t y) {
10  state[a] = state[a] + state[b] + x;
11  state[d] = rotr32(state[d] ^ state[a], 16);
12  state[c] = state[c] + state[d];
13  state[b] = rotr32(state[b] ^ state[c], 12);
14  state[a] = state[a] + state[b] + y;
15  state[d] = rotr32(state[d] ^ state[a], 8);
16  state[c] = state[c] + state[d];
17  state[b] = rotr32(state[b] ^ state[c], 7);
18 }
19 
20 INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round) {
21  // Select the message schedule based on the round.
22  const uint8_t *schedule = MSG_SCHEDULE[round];
23 
24  // Mix the columns.
25  g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]);
26  g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]);
27  g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]);
28  g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]);
29 
30  // Mix the rows.
31  g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]);
32  g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]);
33  g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]);
34  g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]);
35 }
36 
37 INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8],
38  const uint8_t block[BLAKE3_BLOCK_LEN],
39  uint8_t block_len, uint64_t counter, uint8_t flags) {
40  uint32_t block_words[16];
41  block_words[0] = load32(block + 4 * 0);
42  block_words[1] = load32(block + 4 * 1);
43  block_words[2] = load32(block + 4 * 2);
44  block_words[3] = load32(block + 4 * 3);
45  block_words[4] = load32(block + 4 * 4);
46  block_words[5] = load32(block + 4 * 5);
47  block_words[6] = load32(block + 4 * 6);
48  block_words[7] = load32(block + 4 * 7);
49  block_words[8] = load32(block + 4 * 8);
50  block_words[9] = load32(block + 4 * 9);
51  block_words[10] = load32(block + 4 * 10);
52  block_words[11] = load32(block + 4 * 11);
53  block_words[12] = load32(block + 4 * 12);
54  block_words[13] = load32(block + 4 * 13);
55  block_words[14] = load32(block + 4 * 14);
56  block_words[15] = load32(block + 4 * 15);
57 
58  state[0] = cv[0];
59  state[1] = cv[1];
60  state[2] = cv[2];
61  state[3] = cv[3];
62  state[4] = cv[4];
63  state[5] = cv[5];
64  state[6] = cv[6];
65  state[7] = cv[7];
66  state[8] = IV[0];
67  state[9] = IV[1];
68  state[10] = IV[2];
69  state[11] = IV[3];
70  state[12] = counter_low(counter);
71  state[13] = counter_high(counter);
72  state[14] = (uint32_t)block_len;
73  state[15] = (uint32_t)flags;
74 
75  round_fn(state, &block_words[0], 0);
76  round_fn(state, &block_words[0], 1);
77  round_fn(state, &block_words[0], 2);
78  round_fn(state, &block_words[0], 3);
79  round_fn(state, &block_words[0], 4);
80  round_fn(state, &block_words[0], 5);
81  round_fn(state, &block_words[0], 6);
82 }
83 
85  const uint8_t block[BLAKE3_BLOCK_LEN],
86  uint8_t block_len, uint64_t counter,
87  uint8_t flags) {
88  uint32_t state[16];
89  compress_pre(state, cv, block, block_len, counter, flags);
90  cv[0] = state[0] ^ state[8];
91  cv[1] = state[1] ^ state[9];
92  cv[2] = state[2] ^ state[10];
93  cv[3] = state[3] ^ state[11];
94  cv[4] = state[4] ^ state[12];
95  cv[5] = state[5] ^ state[13];
96  cv[6] = state[6] ^ state[14];
97  cv[7] = state[7] ^ state[15];
98 }
99 
101  const uint8_t block[BLAKE3_BLOCK_LEN],
102  uint8_t block_len, uint64_t counter,
103  uint8_t flags, uint8_t out[64]) {
104  uint32_t state[16];
105  compress_pre(state, cv, block, block_len, counter, flags);
106 
107  store32(&out[0 * 4], state[0] ^ state[8]);
108  store32(&out[1 * 4], state[1] ^ state[9]);
109  store32(&out[2 * 4], state[2] ^ state[10]);
110  store32(&out[3 * 4], state[3] ^ state[11]);
111  store32(&out[4 * 4], state[4] ^ state[12]);
112  store32(&out[5 * 4], state[5] ^ state[13]);
113  store32(&out[6 * 4], state[6] ^ state[14]);
114  store32(&out[7 * 4], state[7] ^ state[15]);
115  store32(&out[8 * 4], state[8] ^ cv[0]);
116  store32(&out[9 * 4], state[9] ^ cv[1]);
117  store32(&out[10 * 4], state[10] ^ cv[2]);
118  store32(&out[11 * 4], state[11] ^ cv[3]);
119  store32(&out[12 * 4], state[12] ^ cv[4]);
120  store32(&out[13 * 4], state[13] ^ cv[5]);
121  store32(&out[14 * 4], state[14] ^ cv[6]);
122  store32(&out[15 * 4], state[15] ^ cv[7]);
123 }
124 
125 INLINE void hash_one_portable(const uint8_t *input, size_t blocks,
126  const uint32_t key[8], uint64_t counter,
127  uint8_t flags, uint8_t flags_start,
128  uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
129  uint32_t cv[8];
130  memcpy(cv, key, BLAKE3_KEY_LEN);
131  uint8_t block_flags = flags | flags_start;
132  while (blocks > 0) {
133  if (blocks == 1) {
134  block_flags |= flags_end;
135  }
137  block_flags);
139  blocks -= 1;
140  block_flags = flags;
141  }
142  store_cv_words(out, cv);
143 }
144 
145 void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
146  size_t blocks, const uint32_t key[8],
147  uint64_t counter, bool increment_counter,
148  uint8_t flags, uint8_t flags_start,
149  uint8_t flags_end, uint8_t *out) {
150  while (num_inputs > 0) {
151  hash_one_portable(inputs[0], blocks, key, counter, flags, flags_start,
152  flags_end, out);
153  if (increment_counter) {
154  counter += 1;
155  }
156  inputs += 1;
157  num_inputs -= 1;
158  out = &out[BLAKE3_OUT_LEN];
159  }
160 }
block
we get the following basic block
Definition: README_ALTIVEC.txt:95
BLAKE3_KEY_LEN
#define BLAKE3_KEY_LEN
Definition: blake3_impl.h:16
g
INLINE void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, uint32_t x, uint32_t y)
Definition: blake3_portable.c:8
counter_high
INLINE uint32_t counter_high(uint64_t counter)
Definition: blake3_impl.h:152
counter_low
INLINE uint32_t counter_low(uint64_t counter)
Definition: blake3_impl.h:150
round_fn
INLINE void round_fn(uint32_t state[16], const uint32_t *msg, size_t round)
Definition: blake3_portable.c:20
BLAKE3_BLOCK_LEN
#define BLAKE3_BLOCK_LEN
Definition: blake3_impl.h:18
blake3_compress_xof_portable
void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
Definition: blake3_portable.c:100
a
=0.0 ? 0.0 :(a > 0.0 ? 1.0 :-1.0) a
Definition: README.txt:489
blake3_hash_many_portable
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
Definition: blake3_portable.c:145
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:56
BLAKE3_OUT_LEN
#define BLAKE3_OUT_LEN
Definition: blake3_impl.h:17
b
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int b
Definition: README.txt:418
load32
INLINE uint32_t load32(const void *src)
Definition: blake3_impl.h:156
input
The initial backend is deliberately restricted to z10 We should add support for later architectures at some point If an asm ties an i32 r result to an i64 input
Definition: README.txt:10
c
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int c
Definition: README.txt:418
MSG_SCHEDULE
static const uint8_t MSG_SCHEDULE[7][16]
Definition: blake3_impl.h:89
hash_one_portable
INLINE void hash_one_portable(const uint8_t *input, size_t blocks, const uint32_t key[8], uint64_t counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN])
Definition: blake3_portable.c:125
uint64_t
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
store32
INLINE void store32(void *dst, uint32_t w)
Definition: blake3_impl.h:174
blake3_compress_in_place_portable
void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_portable.c:84
uint32_t
rotr32
INLINE uint32_t rotr32(uint32_t w, uint32_t c)
Definition: blake3_portable.c:4
x
TODO unsigned x
Definition: README.txt:10
y
into llvm powi allowing the code generator to produce balanced multiplication trees the intrinsic needs to be extended to support and second the code generator needs to be enhanced to lower these to multiplication trees Interesting testcase for add shift mul int y
Definition: README.txt:61
compress_pre
INLINE void compress_pre(uint32_t state[16], const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
Definition: blake3_portable.c:37
IV
static const uint32_t IV[8]
Definition: blake3_impl.h:85
d
the resulting code requires compare and branches when and if the revised code is with conditional branches instead of More there is a byte word extend before each where there should be only and the condition codes are not remembered when the same two values are compared twice More LSR enhancements i8 and i32 load store addressing modes are identical int int int d
Definition: README.txt:418
blake3_impl.h
store_cv_words
INLINE void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8])
Definition: blake3_impl.h:182
INLINE
#define INLINE
Definition: blake3_impl.h:40