// NOTE(review): this is a fragment. The enclosing function header, the
// definitions of rows and m0..m3, and the statements that first assign
// t0..t3 in each group are outside this view.
//
// Visible pattern: g1 and g2 (defined elsewhere) are called alternately, each
// receiving pointers to rows[0..3] plus one 128-bit message vector (t0, t1,
// t2, t3 in that order). The sequence g1, g2, g1, g2 appears seven times.
134 __m128i t0, t1, t2, t3, tt;
139 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
141 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
// _MM_SHUFFLE(2, 1, 0, 3): result lanes are {src3, src0, src1, src2}, i.e.
// every 32-bit element moves up one lane and the top element wraps to lane 0.
144 t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));
145 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
147 t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));
148 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- second visible group ----
// _MM_SHUFFLE(0, 3, 2, 1): result lanes are {src1, src2, src3, src0}, the
// opposite rotation direction to the one used in the first group.
158 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
159 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
// tt = {m0[3], m0[3], m0[0], m0[0]}; mask 0xCC then takes 32-bit lanes 1 and 3
// from the previous t1 and lanes 0 and 2 from tt.
161 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
162 t1 = _mm_blend_epi16(tt, t1, 0xCC);
163 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
// Low 64 bits of m3 followed by low 64 bits of m1; mask 0xC0 replaces 32-bit
// lane 3 with m2[3]; the shuffle reorders lanes to {0, 2, 3, 1}.
165 t2 = _mm_unpacklo_epi64(m3, m1);
166 tt = _mm_blend_epi16(t2, m2, 0xC0);
167 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
168 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
// t3 = {m1[2], m3[2], m1[3], m3[3]}; tt = {m2[0], t3[0], m2[1], t3[1]};
// the final shuffle reorders tt's lanes to {2, 3, 1, 0}.
169 t3 = _mm_unpackhi_epi32(m1, m3);
170 tt = _mm_unpacklo_epi32(m2, t3);
171 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
172 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- third visible group: the visible statements repeat the second group.
// NOTE(review): the lines between groups are not shown; presumably they
// recompute t0 and update m0..m3 -- confirm against the full file.
181 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
182 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
184 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
185 t1 = _mm_blend_epi16(tt, t1, 0xCC);
186 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
188 t2 = _mm_unpacklo_epi64(m3, m1);
189 tt = _mm_blend_epi16(t2, m2, 0xC0);
190 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
191 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
192 t3 = _mm_unpackhi_epi32(m1, m3);
193 tt = _mm_unpacklo_epi32(m2, t3);
194 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
195 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- fourth visible group (same visible statements) ----
204 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
205 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
207 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
208 t1 = _mm_blend_epi16(tt, t1, 0xCC);
209 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
211 t2 = _mm_unpacklo_epi64(m3, m1);
212 tt = _mm_blend_epi16(t2, m2, 0xC0);
213 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
214 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
215 t3 = _mm_unpackhi_epi32(m1, m3);
216 tt = _mm_unpacklo_epi32(m2, t3);
217 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
218 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- fifth visible group (same visible statements) ----
227 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
228 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
230 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
231 t1 = _mm_blend_epi16(tt, t1, 0xCC);
232 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
234 t2 = _mm_unpacklo_epi64(m3, m1);
235 tt = _mm_blend_epi16(t2, m2, 0xC0);
236 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
237 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
238 t3 = _mm_unpackhi_epi32(m1, m3);
239 tt = _mm_unpacklo_epi32(m2, t3);
240 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
241 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- sixth visible group (same visible statements) ----
250 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
251 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
253 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
254 t1 = _mm_blend_epi16(tt, t1, 0xCC);
255 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
257 t2 = _mm_unpacklo_epi64(m3, m1);
258 tt = _mm_blend_epi16(t2, m2, 0xC0);
259 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
260 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
261 t3 = _mm_unpackhi_epi32(m1, m3);
262 tt = _mm_unpacklo_epi32(m2, t3);
263 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
264 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// ---- seventh visible group (same visible statements) ----
273 t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
274 g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
276 tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
277 t1 = _mm_blend_epi16(tt, t1, 0xCC);
278 g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
280 t2 = _mm_unpacklo_epi64(m3, m1);
281 tt = _mm_blend_epi16(t2, m2, 0xC0);
282 t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
283 g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
284 t3 = _mm_unpackhi_epi32(m1, m3);
285 tt = _mm_unpacklo_epi32(m2, t3);
286 t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
287 g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
// NOTE(review): this is a fragment. The enclosing function header and whatever
// consumes abcd_*/efgh_*/ijkl_*/mnop_* are outside this view. The two
// stages look like the start of a 16x16 transpose of 32-bit words -- confirm
// against the full function.
//
// Stage 1: interleave 32-bit elements of each adjacent pair of input vectors
// (vecs[0]/vecs[1], vecs[2]/vecs[3], ..., vecs[14]/vecs[15]). Within every
// 128-bit lane, the *_0 result takes the low two elements of both inputs and
// the *_2 result takes the high two elements.
1041 __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
1042 __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
1043 __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
1044 __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
1045 __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
1046 __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
1047 __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
1048 __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
1049 __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
1050 __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
1051 __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
1052 __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
1053 __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
1054 __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
1055 __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
1056 __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
// Stage 2: interleave 64-bit elements of neighbouring stage-1 results, so each
// 128-bit lane of an abcd_*/efgh_*/ijkl_*/mnop_* vector holds one 32-bit
// element from each of four consecutive input vectors. Suffixes _0/_1 come
// from the *_0 inputs (low/high 64 bits), _2/_3 from the *_2 inputs.
1063 __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
1064 __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
1065 __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
1066 __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
1067 __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
1068 __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
1069 __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
1070 __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
1071 __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
1072 __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
1073 __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
1074 __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
1075 __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
1076 __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
1077 __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
1078 __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
// NOTE(review): this is a fragment. The function name, the leading
// parameters, loop/branch headers, several initializer bodies and the closing
// braces are outside this view. The comments below describe only the visible
// statements.
//
// Tail of the parameter list.
1171 bool increment_counter,
uint8_t flags,
// Eight 512-bit state vectors; their initial values are not visible here.
1174 __m512i h_vecs[8] = {
1178 __m512i counter_low_vec, counter_high_vec;
// Flags for the current block start as the caller's flags plus flags_start.
1181 uint8_t block_flags = flags | flags_start;
// flags_end is OR-ed in; the guarding condition is not visible
// (presumably "last block only" -- confirm against the full file).
1185 block_flags |= flags_end;
// set1_512 is defined elsewhere; presumably it broadcasts the value to every
// 32-bit lane -- confirm.
1188 __m512i block_flags_vec =
set1_512(block_flags);
1189 __m512i msg_vecs[16];
// Entries of a list whose opening line is not visible: the eight state
// vectors, then the counter, block-length and flag vectors.
1193 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1194 h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
1196 counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
// New state: h_vecs[i] = v[i] XOR v[i + 8] for i = 0..7.
1205 h_vecs[0] =
xor_512(v[0], v[8]);
1206 h_vecs[1] =
xor_512(v[1], v[9]);
1207 h_vecs[2] =
xor_512(v[2], v[10]);
1208 h_vecs[3] =
xor_512(v[3], v[11]);
1209 h_vecs[4] =
xor_512(v[4], v[12]);
1210 h_vecs[5] =
xor_512(v[5], v[13]);
1211 h_vecs[6] =
xor_512(v[6], v[14]);
1212 h_vecs[7] =
xor_512(v[7], v[15]);
// Reset to the caller's flags, discarding any flags_start / flags_end bits
// added above.
1214 block_flags = flags;
// 16-entry array whose first eight entries are the state vectors; the
// remaining initializers and any processing of padded[] before the stores
// are not visible here.
1220 __m512i padded[16] = {
1221 h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
1222 h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
// Store the low 256 bits of each padded[i] to out + i * sizeof(__m256i).
// The mask (__mmask8)-1 has all eight bits set, so all eight 32-bit elements
// are written; the storeu form does not require an aligned destination.
1227 _mm256_mask_storeu_epi32(&out[0 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
1228 _mm256_mask_storeu_epi32(&out[1 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
1229 _mm256_mask_storeu_epi32(&out[2 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
1230 _mm256_mask_storeu_epi32(&out[3 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
1231 _mm256_mask_storeu_epi32(&out[4 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
1232 _mm256_mask_storeu_epi32(&out[5 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
1233 _mm256_mask_storeu_epi32(&out[6 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
1234 _mm256_mask_storeu_epi32(&out[7 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
1235 _mm256_mask_storeu_epi32(&out[8 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
1236 _mm256_mask_storeu_epi32(&out[9 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
1237 _mm256_mask_storeu_epi32(&out[10 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
1238 _mm256_mask_storeu_epi32(&out[11 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
1239 _mm256_mask_storeu_epi32(&out[12 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
1240 _mm256_mask_storeu_epi32(&out[13 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
1241 _mm256_mask_storeu_epi32(&out[14 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
1242 _mm256_mask_storeu_epi32(&out[15 *
sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));