#include "blake3_impl.h"

#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <immintrin.h>
#else
#error "Unimplemented!"
#endif

#define MAYBE_UNUSED(x) (void)((x))
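/* MAYBE_UNUSED(x) expands to (void)((x)); the dispatchers below apply it to
   `features` so the variable does not trigger -Wunused-variable when all
   SIMD code paths are compiled out via the BLAKE3_NO_* macros. */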
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
  return _xgetbv(0);
#else
  uint32_t eax = 0, edx = 0;
  __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0));
  return ((uint64_t)edx << 32) | eax;
#endif
}
static void cpuid(uint32_t out[4], uint32_t id) {
#if defined(_MSC_VER)
  __cpuid((int *)out, id);
#elif defined(__i386__) || defined(_M_IX86)
  /* On i386, %ebx may be reserved as the PIC base register, so save and
     restore it around the cpuid instruction. */
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id));
#endif
}
static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
#if defined(_MSC_VER)
  __cpuidex((int *)out, id, sid);
#elif defined(__i386__) || defined(_M_IX86)
  __asm__ __volatile__("movl %%ebx, %1\n"
                       "cpuid\n"
                       "xchgl %1, %%ebx\n"
                       : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#else
  __asm__ __volatile__("cpuid\n"
                       : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3])
                       : "a"(id), "c"(sid));
#endif
}

enum cpu_feature {
  SSE2 = 1 << 0,
  SSSE3 = 1 << 1,
  SSE41 = 1 << 2,
  AVX = 1 << 3,
  AVX2 = 1 << 4,
  AVX512F = 1 << 5,
  AVX512VL = 1 << 6,
  UNDEFINED = 1 << 30
};
#if !defined(BLAKE3_TESTING)
static
#endif
    enum cpu_feature g_cpu_features = UNDEFINED;

LLVM_ATTRIBUTE_USED
#if !defined(BLAKE3_TESTING)
static
#endif
    enum cpu_feature
    get_cpu_features(void) {
  /* Detection runs once; the result is cached in g_cpu_features. */
  if (g_cpu_features != UNDEFINED)
    return g_cpu_features;
  uint32_t regs[4] = {0};
  uint32_t *eax = &regs[0], *ebx = &regs[1], *ecx = &regs[2], *edx = &regs[3];
  enum cpu_feature features = 0;
  cpuid(regs, 0);
  const int max_id = *eax;
  cpuid(regs, 1);
#if defined(__amd64__) || defined(_M_X64)
  features |= SSE2; /* SSE2 is architectural on x86-64 */
#else
  if (*edx & (1UL << 26))
    features |= SSE2;
#endif
  if (*ecx & (1UL << 0))
    features |= SSSE3;
  if (*ecx & (1UL << 19))
    features |= SSE41;
  if (*ecx & (1UL << 27)) { /* OSXSAVE: xgetbv is available */
    const uint64_t mask = xgetbv();
    if ((mask & 6) == 6) { /* OS saves XMM and YMM state */
      if (*ecx & (1UL << 28))
        features |= AVX;
      if (max_id >= 7) {
        cpuidex(regs, 7, 0);
        if (*ebx & (1UL << 5))
          features |= AVX2;
        if ((mask & 224) == 224) { /* OS saves AVX-512 state */
          if (*ebx & (1UL << 31))
            features |= AVX512VL;
          if (*ebx & (1UL << 16))
            features |= AVX512F;
        }
      }
    }
  }
  g_cpu_features = features;
  return features;
}
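/* CPUID bit positions tested above (Intel SDM, CPUID instruction):
   leaf 1 EDX bit 26 = SSE2; leaf 1 ECX bit 19 = SSE4.1, bit 27 = OSXSAVE,
   bit 28 = AVX; leaf 7 subleaf 0 EBX bit 5 = AVX2, bit 16 = AVX512F,
   bit 31 = AVX512VL. */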
void blake3_compress_in_place(uint32_t cv[8],
                              const uint8_t block[BLAKE3_BLOCK_LEN],
                              uint8_t block_len, uint64_t counter,
                              uint8_t flags) {
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_in_place_avx512(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_in_place_sse41(cv, block, block_len, counter, flags);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_in_place_sse2(cv, block, block_len, counter, flags);
    return;
  }
#endif
  blake3_compress_in_place_portable(cv, block, block_len, counter, flags);
}
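/* Every dispatcher below follows the same priority order: AVX-512, then
   SSE4.1, then SSE2, then the portable fallback (with AVX2 as an extra tier
   in the many-input routine). Each BLAKE3_NO_* macro compiles one tier out. */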
void blake3_compress_xof(const uint32_t cv[8],
                         const uint8_t block[BLAKE3_BLOCK_LEN],
                         uint8_t block_len, uint64_t counter, uint8_t flags,
                         uint8_t out[64]) {
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if (features & AVX512VL) {
    blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_compress_xof_sse41(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_compress_xof_sse2(cv, block, block_len, counter, flags, out);
    return;
  }
#endif
  blake3_compress_xof_portable(cv, block, block_len, counter, flags, out);
}
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
                      size_t blocks, const uint32_t key[8], uint64_t counter,
                      bool increment_counter, uint8_t flags,
                      uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL)) {
    blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2) {
    blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41) {
    blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                           increment_counter, flags, flags_start, flags_end,
                           out);
    return;
  }
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2) {
    blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter,
                          increment_counter, flags, flags_start, flags_end,
                          out);
    return;
  }
#endif
#if BLAKE3_USE_NEON == 1
  blake3_hash_many_neon(inputs, num_inputs, blocks, key, counter,
                        increment_counter, flags, flags_start, flags_end, out);
  return;
#endif
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
}
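/* Illustrative sketch (hypothetical, not part of this file): hashing two
   full 16-block chunks in one batch. CHUNK_START/CHUNK_END and
   BLAKE3_OUT_LEN come from blake3_impl.h; chunk0, chunk1, and key_words are
   assumed to exist. Each input yields 32 bytes of output; flags_start and
   flags_end are OR'd into the first and last block of each input. */
#if 0
const uint8_t *inputs[2] = {chunk0, chunk1};
uint8_t out[2 * BLAKE3_OUT_LEN];
blake3_hash_many(inputs, /*num_inputs=*/2, /*blocks=*/16, key_words,
                 /*counter=*/0, /*increment_counter=*/true, /*flags=*/0,
                 CHUNK_START, CHUNK_END, out);
#endif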
size_t blake3_simd_degree(void) {
  const enum cpu_feature features = get_cpu_features();
  MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_AVX512)
  if ((features & (AVX512F | AVX512VL)) == (AVX512F | AVX512VL))
    return 16;
#endif
#if !defined(BLAKE3_NO_AVX2)
  if (features & AVX2)
    return 8;
#endif
#if !defined(BLAKE3_NO_SSE41)
  if (features & SSE41)
    return 4;
#endif
#if !defined(BLAKE3_NO_SSE2)
  if (features & SSE2)
    return 4;
#endif
#if BLAKE3_USE_NEON == 1
  return 4;
#endif
  return 1;
}
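/* Illustrative sketch (hypothetical, not part of this file): callers use the
   degree to split work into full SIMD batches; num_chunks is assumed. */
#if 0
size_t degree = blake3_simd_degree();      /* 16, 8, 4, or 1 */
size_t full_batches = num_chunks / degree; /* batches that fill every lane */
size_t remainder = num_chunks % degree;    /* handled by narrower paths */
#endif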
/* Referenced declarations (from blake3_impl.h and llvm/Support/Compiler.h): */
#define LLVM_ATTRIBUTE_USED
void blake3_compress_xof_sse2(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
LLVM_LIBRARY_VISIBILITY void blake3_compress_in_place_portable(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
void blake3_compress_in_place_avx512(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
static LLVM_ATTRIBUTE_USED enum cpu_feature get_cpu_features(void)
void blake3_compress_xof_avx512(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
void blake3_hash_many_sse2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_compress_in_place_sse2(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
static enum cpu_feature g_cpu_features
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_compress_xof(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
LLVM_LIBRARY_VISIBILITY void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_compress_xof_sse41(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
void blake3_compress_in_place_sse41(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
LLVM_LIBRARY_VISIBILITY void blake3_compress_xof_portable(const uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags, uint8_t out[64])
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, bool increment_counter, uint8_t flags, uint8_t flags_start, uint8_t flags_end, uint8_t *out)
void blake3_compress_in_place(uint32_t cv[8], const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, uint64_t counter, uint8_t flags)
size_t blake3_simd_degree(void)