// Constant (affine addend) byte shared by the two "pre" GF(2^8) affine
// transforms below (the pre123_a / pre4_a gf2p8affine calls).
// 0x45 == 0b01000101.
constexpr uint8_t pre_c = 0x45;
   // NOTE(review): fragment of a larger function — the enclosing signature
   // and some interior lines (original numbering gaps) are not in this
   // excerpt. `x`, `sx`, and the pre123_a/pre4_a/post*_a matrix constants
   // are declared outside this view.

   // Apply the "pre" byte-wise GF(2^8) affine transform (matrix pre123_a
   // resp. pre4_a, constant byte pre_c) to every byte of x.
   const __m256i xr = x.raw();
   const auto y123 = _mm256_gf2p8affine_epi64_epi8(xr, _mm256_set1_epi64x(pre123_a), pre_c);
   const auto y4 = _mm256_gf2p8affine_epi64_epi8(xr, _mm256_set1_epi64x(pre4_a), pre_c);

   // GF(2^8) inversion fused with a "post" affine transform. The four
   // results presumably correspond to Camellia's four s-boxes s1..s4
   // (s2/s3 are byte rotations of s1, s4 uses a different input mapping)
   // — TODO confirm against the full source.
   const auto s1 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post14_a), 0x6E);
   const auto s2 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post2_a), 0xDC);
   const auto s3 = _mm256_gf2p8affineinv_epi64_epi8(y123, _mm256_set1_epi64x(post3_a), 0x37);
   const auto s4 = _mm256_gf2p8affineinv_epi64_epi8(y4, _mm256_set1_epi64x(post14_a), 0x6E);

   // Per-byte selection masks: blendv picks the source byte wherever the
   // mask byte's top bit is set, so the 0xFF bytes mark which lanes of each
   // 64-bit half take the s2/s3/s4 outputs.
   const auto mask_s2 = _mm256_set1_epi64x(0x00FF0000FF000000);
   const auto mask_s3 = _mm256_set1_epi64x(0x0000FF0000FF0000);
   const auto mask_s4 = _mm256_set1_epi64x(0x000000FF0000FF00);

   // NOTE(review): `sx` is initialized on a line missing from this excerpt
   // (presumably starting from s1) — confirm before relying on this.
   sx = _mm256_blendv_epi8(sx, s2, mask_s2);
   sx = _mm256_blendv_epi8(sx, s3, mask_s3);
   sx = _mm256_blendv_epi8(sx, s4, mask_s4);

   // Six pshufb index tables; 0xFF entries (in P6) zero the corresponding
   // output bytes per _mm256_shuffle_epi8 semantics.
   const auto P1 = _mm256_set_epi64x(0x0808080908080809, 0x0000000100000001, 0x0808080908080809, 0x0000000100000001);
   const auto P2 = _mm256_set_epi64x(0x09090A0A09090A0A, 0x0101020201010202, 0x09090A0A09090A0A, 0x0101020201010202);
   const auto P3 = _mm256_set_epi64x(0x0A0B0B0B0A0B0B0B, 0x0203030302030303, 0x0A0B0B0B0A0B0B0B, 0x0203030302030303);
   const auto P4 = _mm256_set_epi64x(0x0C0C0D0C0E0D0C0C, 0x0404050406050404, 0x0C0C0D0C0E0D0C0C, 0x0404050406050404);
   const auto P5 = _mm256_set_epi64x(0x0D0E0E0D0F0E0D0F, 0x0506060507060507, 0x0D0E0E0D0F0E0D0F, 0x0506060507060507);
   const auto P6 = _mm256_set_epi64x(0x0F0F0F0EFFFFFFFF, 0x07070706FFFFFFFF, 0x0F0F0F0EFFFFFFFF, 0x07070706FFFFFFFF);

   // XORing the six byte-shuffles implements a byte-level linear mixing of
   // sx (presumably Camellia's P function — TODO confirm).
   const auto t1 = SIMD_4x64(_mm256_shuffle_epi8(sx, P1));
   const auto t2 = SIMD_4x64(_mm256_shuffle_epi8(sx, P2));
   const auto t3 = SIMD_4x64(_mm256_shuffle_epi8(sx, P3));
   const auto t4 = SIMD_4x64(_mm256_shuffle_epi8(sx, P4));
   const auto t5 = SIMD_4x64(_mm256_shuffle_epi8(sx, P5));
   const auto t6 = SIMD_4x64(_mm256_shuffle_epi8(sx, P6));

   return (t1 ^ t2 ^ t3 ^ t4 ^ t5 ^ t6);
   // NOTE(review): fragment of a function (signature not in this excerpt);
   // A/B appear to be the loaded inputs and L/R the deinterleaved outputs.

   // imm 0b11'01'10'00 reorders the four 64-bit qwords as (0, 2, 1, 3),
   // i.e. swaps the two middle qwords of each register.
   auto Ap = _mm256_permute4x64_epi64(A.raw(), 0b11'01'10'00);
   auto Bp = _mm256_permute4x64_epi64(B.raw(), 0b11'01'10'00);

   // 0x20 concatenates the low 128-bit lanes of Ap and Bp, 0x31 the high
   // lanes: L receives the even-indexed qwords of both inputs, R the odd
   // ones — i.e. the left/right Camellia halves of all four blocks.
   L = SIMD_4x64(_mm256_permute2x128_si256(Ap, Bp, 0x20));
   R = SIMD_4x64(_mm256_permute2x128_si256(Ap, Bp, 0x31));
   // NOTE(review): fragment of a function — the signature and (presumably)
   // the `A.store_be(out)` for the first 32 bytes are outside this excerpt.

   // Inverse of the deinterleave step. R is passed as the first source,
   // presumably realizing the final L/R swap of the Feistel network —
   // TODO confirm against the full source.
   // 0x20: low 128-bit lanes of R then L; 0x31: high lanes.
   auto T1 = _mm256_permute2x128_si256(R.raw(), L.raw(), 0x20);
   auto T2 = _mm256_permute2x128_si256(R.raw(), L.raw(), 0x31);

   // Undo the (0, 2, 1, 3) qword reorder; that permutation is its own
   // inverse.
   auto A = SIMD_4x64(_mm256_permute4x64_epi64(T1, 0b11'01'10'00));
   auto B = SIMD_4x64(_mm256_permute4x64_epi64(T2, 0b11'01'10'00));

   B.store_be(out + 32);
   // Rotate each 32-bit lane of t left by one bit (shift-left-1 OR
   // shift-right-31 per lane).
   return SIMD_4x64(_mm256_or_si256(_mm256_slli_epi32(t.raw(), 1), _mm256_srli_epi32(t.raw(), 31)));
   // NOTE(review): fragment, presumably of Camellia's FL function
   // (signature and the lines computing/mixing x2 are outside this
   // excerpt, which is why k1/k2 appear unused below).

   // Split the 64-bit subkey into high (k1) and low (k2) 32-bit halves.
   const uint32_t k1 = static_cast<uint32_t>(K >> 32);
   const uint32_t k2 = static_cast<uint32_t>(K & 0xFFFFFFFF);

   // x1 = upper 32-bit half of each 64-bit element of v.
   auto x1 = v.shr<32>();

   // Recombine the two 32-bit halves into 64-bit elements.
   return x1.shl<32>() | x2;
   // NOTE(review): fragment, presumably of Camellia's FL^-1 (FLINV)
   // function — structurally parallel to the FL fragment above; the lines
   // computing/mixing x2 with k1/k2 are outside this excerpt.

   // Split the 64-bit subkey into high (k1) and low (k2) 32-bit halves.
   const uint32_t k1 = static_cast<uint32_t>(K >> 32);
   const uint32_t k2 = static_cast<uint32_t>(K & 0xFFFFFFFF);

   // x1 = upper 32-bit half of each 64-bit element of v.
   auto x1 = v.shr<32>();

   // Recombine the two 32-bit halves into 64-bit elements.
   return x1.shl<32>() | x2;
// Encrypt four 16-byte blocks in parallel with the 18-round Camellia
// variant (128-bit keys) using AVX2+GFNI. `in`/`out` are 64 bytes; SK is
// the schedule of 64-bit round-key words.
//
// NOTE(review): this excerpt omits interior lines of the body — the
// declarations of L/R, the pre/post-whitening key XORs, the FL_4 calls
// paired with each FLINV_4, and the closing brace. Reconstruct from the
// full source before editing.
BOTAN_FN_ISA_AVX2_GFNI
void camellia_encrypt_x4_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   // Split the four blocks into left/right Feistel halves.
   load_and_deinterleave(in, L, R);

   // Six encryption rounds keyed starting at SK[2].
   six_e_rounds(L, R, SK.subspan(2));

   // FL/FL^-1 layer; only the FLINV half (SK[9]) is visible here.
   R = FLINV_4(R, SK[9]);

   // Six more rounds keyed starting at SK[10].
   six_e_rounds(L, R, SK.subspan(10));

   R = FLINV_4(R, SK[17]);

   // Final six rounds keyed starting at SK[18].
   six_e_rounds(L, R, SK.subspan(18));

   // Recombine halves and write the four ciphertext blocks.
   interleave_and_store(out, L, R);
// Decrypt four 16-byte blocks in parallel with the 18-round Camellia
// variant (128-bit keys) using AVX2+GFNI. Mirrors camellia_encrypt_x4_18r
// with the round-key groups consumed in reverse order.
//
// NOTE(review): this excerpt omits interior lines of the body — the
// declarations of L/R, the whitening key XORs, the FL_4 calls paired with
// each FLINV_4, and the closing brace.
BOTAN_FN_ISA_AVX2_GFNI
void camellia_decrypt_x4_18r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   load_and_deinterleave(in, L, R);

   // Six decryption rounds keyed starting at SK[18] (last encryption group
   // first).
   six_d_rounds(L, R, SK.subspan(18));

   R = FLINV_4(R, SK[16]);

   six_d_rounds(L, R, SK.subspan(10));

   R = FLINV_4(R, SK[8]);

   six_d_rounds(L, R, SK.subspan(2));

   interleave_and_store(out, L, R);
// Encrypt four 16-byte blocks in parallel with the 24-round Camellia
// variant (192/256-bit keys) using AVX2+GFNI: four groups of six rounds
// separated by three FL/FL^-1 layers.
//
// NOTE(review): this excerpt omits interior lines of the body — the
// declarations of L/R, the whitening key XORs, the FL_4 calls paired with
// each FLINV_4, and the closing brace.
BOTAN_FN_ISA_AVX2_GFNI
void camellia_encrypt_x4_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   load_and_deinterleave(in, L, R);

   // Rounds keyed from SK[2..], then FL layers at SK[8..9], SK[16..17],
   // SK[24..25] (only the FLINV halves are visible here).
   six_e_rounds(L, R, SK.subspan(2));

   R = FLINV_4(R, SK[9]);

   six_e_rounds(L, R, SK.subspan(10));

   R = FLINV_4(R, SK[17]);

   six_e_rounds(L, R, SK.subspan(18));

   R = FLINV_4(R, SK[25]);

   six_e_rounds(L, R, SK.subspan(26));

   interleave_and_store(out, L, R);
// Decrypt four 16-byte blocks in parallel with the 24-round Camellia
// variant (192/256-bit keys) using AVX2+GFNI. Mirrors
// camellia_encrypt_x4_24r with the round-key groups consumed in reverse.
//
// NOTE(review): this excerpt omits interior lines of the body — the
// declarations of L/R, the whitening key XORs, the FL_4 calls paired with
// each FLINV_4, and the closing brace (the function runs past the end of
// this view).
BOTAN_FN_ISA_AVX2_GFNI
void camellia_decrypt_x4_24r(const uint8_t in[], uint8_t out[], std::span<const uint64_t> SK) {
   load_and_deinterleave(in, L, R);

   // Last encryption group (SK[26..]) first, FL layers at SK[24], SK[16],
   // SK[8] in between.
   six_d_rounds(L, R, SK.subspan(26));

   R = FLINV_4(R, SK[24]);

   six_d_rounds(L, R, SK.subspan(18));

   R = FLINV_4(R, SK[16]);

   six_d_rounds(L, R, SK.subspan(10));

   R = FLINV_4(R, SK[8]);

   six_d_rounds(L, R, SK.subspan(2));

   interleave_and_store(out, L, R);