7#include <botan/internal/chacha.h>
9#include <botan/internal/simd_avx2.h>
// Computes ChaCha output for eight consecutive counter values at once, one
// per 32-bit lane of each SIMD_8x32 register, and writes it to `output`.
//
// output: destination buffer, declared as 64 * 8 = 512 bytes.
// state:  the 16-word input state; words 12 and 13 are treated below as the
//         low and high halves of a block counter.
// rounds: round count; the loop below runs rounds / 2 iterations.
//
// NOTE(review): the integer at the start of every line looks like a line
// number carried over from an extraction of the upstream file, and the gaps
// in that numbering (41 -> 112, 115 -> 127, 130 -> 163) suggest that most of
// the round loop body is not part of this excerpt. The closing braces of the
// loop and of the function are not visible here either. Confirm against the
// upstream source before relying on this text.
15void ChaCha::chacha_avx2_x8(uint8_t output[64 * 8], uint32_t state[16],
size_t rounds) {
// NOTE(review): reset_registers() is an opaque helper here; presumably it
// clears SIMD register state before the AVX2 work starts - confirm.
16 SIMD_8x32::reset_registers();
// Per-lane offsets 0..7 that get added to the low counter word state[12].
19 const SIMD_8x32 CTR0 = SIMD_8x32(0, 1, 2, 3, 4, 5, 6, 7);
// C is the distance from state[12] to the 32-bit maximum, so (C < i) is 1
// exactly when state[12] + i would exceed 0xFFFFFFFF.
21 const uint32_t C = 0xFFFFFFFF - state[12];
// Per-lane carry (0 or 1) into the high counter word state[13], in the same
// argument positions as the offsets of CTR0.
22 const SIMD_8x32 CTR1 = SIMD_8x32(0, C < 1, C < 2, C < 3, C < 4, C < 5, C < 6, C < 7);
// Broadcast each state word into its own register: R00..R15 hold words
// 0..15. Only words 12 and 13 differ between lanes (counter + offset/carry).
24 SIMD_8x32 R00 = SIMD_8x32::splat(state[0]);
25 SIMD_8x32 R01 = SIMD_8x32::splat(state[1]);
26 SIMD_8x32 R02 = SIMD_8x32::splat(state[2]);
27 SIMD_8x32 R03 = SIMD_8x32::splat(state[3]);
28 SIMD_8x32 R04 = SIMD_8x32::splat(state[4]);
29 SIMD_8x32 R05 = SIMD_8x32::splat(state[5]);
30 SIMD_8x32 R06 = SIMD_8x32::splat(state[6]);
31 SIMD_8x32 R07 = SIMD_8x32::splat(state[7]);
32 SIMD_8x32 R08 = SIMD_8x32::splat(state[8]);
33 SIMD_8x32 R09 = SIMD_8x32::splat(state[9]);
34 SIMD_8x32 R10 = SIMD_8x32::splat(state[10]);
35 SIMD_8x32 R11 = SIMD_8x32::splat(state[11]);
36 SIMD_8x32 R12 = SIMD_8x32::splat(state[12]) + CTR0;
37 SIMD_8x32 R13 = SIMD_8x32::splat(state[13]) + CTR1;
38 SIMD_8x32 R14 = SIMD_8x32::splat(state[14]);
39 SIMD_8x32 R15 = SIMD_8x32::splat(state[15]);
// Each iteration presumably performs two ChaCha rounds (hence rounds / 2);
// only a few statements of the body are visible in this excerpt.
41 for(
size_t r = 0; r != rounds / 2; ++r) {
// rotl<16> on words 12..15; presumably a per-lane left rotation by 16 bits
// (16 and 12 are two of the ChaCha quarter-round rotation amounts).
112 R15 = R15.rotl<16>();
113 R12 = R12.rotl<16>();
114 R13 = R13.rotl<16>();
115 R14 = R14.rotl<16>();
// rotl<12> on words 4..7.
127 R05 = R05.rotl<12>();
128 R06 = R06.rotl<12>();
129 R07 = R07.rotl<12>();
130 R04 = R04.rotl<12>();
// Add the original input words back onto the working registers, using the
// same per-lane counter offsets and carries as the initial load above.
// NOTE(review): presumably this runs after the round loop has closed; the
// closing brace is not visible in this excerpt - confirm.
163 R00 += SIMD_8x32::splat(state[0]);
164 R01 += SIMD_8x32::splat(state[1]);
165 R02 += SIMD_8x32::splat(state[2]);
166 R03 += SIMD_8x32::splat(state[3]);
167 R04 += SIMD_8x32::splat(state[4]);
168 R05 += SIMD_8x32::splat(state[5]);
169 R06 += SIMD_8x32::splat(state[6]);
170 R07 += SIMD_8x32::splat(state[7]);
171 R08 += SIMD_8x32::splat(state[8]);
172 R09 += SIMD_8x32::splat(state[9]);
173 R10 += SIMD_8x32::splat(state[10]);
174 R11 += SIMD_8x32::splat(state[11]);
175 R12 += SIMD_8x32::splat(state[12]) + CTR0;
176 R13 += SIMD_8x32::splat(state[13]) + CTR1;
177 R14 += SIMD_8x32::splat(state[14]);
178 R15 += SIMD_8x32::splat(state[15]);
// Transpose the two groups of eight registers; presumably this turns the
// "one word index per register" layout into "one block half per register"
// so each register can be stored contiguously - confirm against SIMD_8x32.
180 SIMD_8x32::transpose(R00, R01, R02, R03, R04, R05, R06, R07);
181 SIMD_8x32::transpose(R08, R09, R10, R11, R12, R13, R14, R15);
// Stores alternate between the two transposed groups (R00, R08, R01, R09,
// ...) at consecutive 32-byte offsets. Sixteen stores at a 32-byte stride
// cover the 512-byte output, assuming each store_le writes 32 bytes.
183 R00.store_le(output);
184 R08.store_le(output + 32 * 1);
185 R01.store_le(output + 32 * 2);
186 R09.store_le(output + 32 * 3);
187 R02.store_le(output + 32 * 4);
188 R10.store_le(output + 32 * 5);
189 R03.store_le(output + 32 * 6);
190 R11.store_le(output + 32 * 7);
191 R04.store_le(output + 32 * 8);
192 R12.store_le(output + 32 * 9);
193 R05.store_le(output + 32 * 10);
194 R13.store_le(output + 32 * 11);
195 R06.store_le(output + 32 * 12);
196 R14.store_le(output + 32 * 13);
197 R07.store_le(output + 32 * 14);
198 R15.store_le(output + 32 * 15);
// NOTE(review): zero_registers() is an opaque helper here; presumably it
// clears the SIMD registers so keystream material does not linger - confirm.
200 SIMD_8x32::zero_registers();
// Defines BOTAN_ASSERT as a two-argument function-like macro with an empty
// replacement list, so every use after this point expands to nothing.
// NOTE(review): it is unclear from this excerpt why the assertion macro is
// stubbed out here - confirm that this line actually belongs to this file.
#define BOTAN_ASSERT(expr, assertion_made)