7#include <botan/internal/keccak_perm.h>
9#include <botan/internal/isa_extn.h>
// Thin wrapper around one __m512i (m_v) in which 64-bit lanes 0..4 carry data.
// load() and store() use the lane mask 0b00011111, so exactly five uint64_t
// values are read from / written to memory.
// NOTE(review): in this excerpt every line starts with a fused integer (looks
// like a source line number), and the closing braces and the declaration of
// m_v are not visible -- confirm against the upstream file before building.
16class SIMD_5x64 final {
// Default constructor: delegates to the __m512i constructor with an all-zero vector.
18 explicit BOTAN_FN_ISA_AVX512 SIMD_5x64() : SIMD_5x64(_mm512_setzero_si512()) {}
// Returns a vector with RC in lane 0 and zero in every other lane
// (zero-masked broadcast with mask 0b00000001).
20 static BOTAN_FN_ISA_AVX512 SIMD_5x64 rc(uint64_t RC) {
21 return SIMD_5x64(_mm512_maskz_set1_epi64(0b00000001, RC));
// Unaligned, zero-masked load: lanes 0..4 receive v[0..4], lanes 5..7 are zeroed.
24 static BOTAN_FN_ISA_AVX512 SIMD_5x64 load(
const uint64_t v[5]) {
25 return SIMD_5x64(_mm512_maskz_loadu_epi64(0b00011111, v));
// Lane shuffle: result lane k takes lane I<k> of this vector for k = 0..4.
// The table entries for lanes 5..7 are 0, so those result lanes copy lane 0.
// Indices are checked at compile time to be below 5.
28 template <
size_t I0,
size_t I1,
size_t I2,
size_t I3,
size_t I4>
29 inline BOTAN_FN_ISA_AVX512 SIMD_5x64 permute()
const {
30 static_assert(I0 < 5 && I1 < 5 && I2 < 5 && I3 < 5 && I4 < 5);
31 const __m512i tbl = _mm512_setr_epi64(I0, I1, I2, I3, I4, 0, 0, 0);
32 return SIMD_5x64(_mm512_permutexvar_epi64(tbl, m_v));
// In-place 5x5 transpose over lanes 0..4 of the five arguments: on return,
// lane k of i<j> holds what lane j of i<k> held on entry. Lanes 5..7 of the
// results are filler and should not be relied upon.
35 static BOTAN_FN_ISA_AVX512
void transpose5(
36 SIMD_5x64& i0, SIMD_5x64& i1, SIMD_5x64& i2, SIMD_5x64& i3, SIMD_5x64& i4) {
// unpacklo interleaves the even lanes (0, 2, 4, 6) of its two operands ...
39 const auto lo_01 = _mm512_unpacklo_epi64(i0.m_v, i1.m_v);
40 const auto lo_23 = _mm512_unpacklo_epi64(i2.m_v, i3.m_v);
// ... and unpackhi interleaves the odd lanes (1, 3, 5, 7).
42 const auto hi_01 = _mm512_unpackhi_epi64(i0.m_v, i1.m_v);
43 const auto hi_23 = _mm512_unpackhi_epi64(i2.m_v, i3.m_v);
// permutex2var indices 0..7 select from the first operand and 8..15 from the
// second. These two tables tuck i4[0], i4[2] (resp. i4[1], i4[3]) into lanes
// 6 and 7 of t0 (resp. t2). The -1 entries sit in lanes nobody reads later.
46 const auto i4_lo_idx = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 8, 10);
47 const auto i4_hi_idx = _mm512_setr_epi64(0, 1, 2, 3, -1, -1, 9, 11);
49 auto t0 = _mm512_permutex2var_epi64(lo_01, i4_lo_idx, i4.m_v);
50 auto t2 = _mm512_permutex2var_epi64(hi_01, i4_hi_idx, i4.m_v);
// Gather tables for the output rows: two lanes from t0/t2 (inputs 0 and 1),
// two lanes from lo_23/hi_23 (inputs 2 and 3), and for idx0/idx1 a fifth lane
// holding the stashed i4 element (lane 6 or 7 of t0/t2).
53 const auto idx0 = _mm512_setr_epi64(0, 1, 8, 9, 6, -1, -1, -1);
54 const auto idx1 = _mm512_setr_epi64(2, 3, 10, 11, 7, -1, -1, -1);
55 const auto idx4 = _mm512_setr_epi64(4, 5, 12, 13, -1, -1, -1, -1);
57 i0.m_v = _mm512_permutex2var_epi64(t0, idx0, lo_23);
58 i1.m_v = _mm512_permutex2var_epi64(t2, idx0, hi_23);
59 i2.m_v = _mm512_permutex2var_epi64(t0, idx1, lo_23);
60 i3.m_v = _mm512_permutex2var_epi64(t2, idx1, hi_23);
// Output row 4: lanes 0..3 come from the permute; lane 4 (blend mask bit 4) is
// kept from the original i4, whose diagonal element does not move.
61 i4.m_v = _mm512_mask_blend_epi64(0b00010000, _mm512_permutex2var_epi64(t0, idx4, lo_23), i4.m_v);
// Bitwise x ^ (~y & z) in a single ternary-logic instruction; 0b11010010 is
// the truth table of that function for operand order (x, y, z).
64 static BOTAN_FN_ISA_AVX512 SIMD_5x64 chi(
const SIMD_5x64& x,
const SIMD_5x64& y,
const SIMD_5x64& z) {
65 constexpr uint8_t xor_not_and = 0b11010010;
66 return SIMD_5x64(_mm512_ternarylogic_epi64(x.m_v, y.m_v, z.m_v, xor_not_and));
// Lane-wise XOR of two vectors.
69 friend BOTAN_FN_ISA_AVX512 SIMD_5x64
operator^(
const SIMD_5x64& x,
const SIMD_5x64& y) {
70 return SIMD_5x64(_mm512_xor_epi64(x.m_v, y.m_v));
// XOR of all five inputs using two three-input XORs
// (0b10010110 is the truth table of a ^ b ^ c).
73 static BOTAN_FN_ISA_AVX512 SIMD_5x64
74 xor5(
const SIMD_5x64& i0,
const SIMD_5x64& i1,
const SIMD_5x64& i2,
const SIMD_5x64& i3,
const SIMD_5x64& i4) {
75 constexpr uint8_t tern_xor = 0b10010110;
76 auto t = _mm512_ternarylogic_epi64(i0.m_v, i1.m_v, i2.m_v, tern_xor);
77 return SIMD_5x64(_mm512_ternarylogic_epi64(i3.m_v, i4.m_v, t, tern_xor));
// Rotates every 64-bit lane left by one bit.
80 BOTAN_FN_ISA_AVX512 SIMD_5x64 rol1()
const {
return SIMD_5x64(_mm512_rol_epi64(m_v, 1)); }
// Per-lane rotate left: lane k is rotated by R<k> bits for k = 0..4, and
// lanes 5..7 by 0 bits. Rotation counts are checked at compile time to be
// below 64.
82 template <
size_t R0,
size_t R1,
size_t R2,
size_t R3,
size_t R4>
83 BOTAN_FN_ISA_AVX512 SIMD_5x64 rolv()
const {
84 static_assert(R0 < 64 &&
R1 < 64 &&
R2 < 64 && R3 < 64 && R4 < 64);
85 const __m512i rot = _mm512_setr_epi64(R0,
R1,
R2, R3, R4, 0, 0, 0);
86 return SIMD_5x64(_mm512_rolv_epi64(m_v, rot));
// Masked unaligned store: writes lanes 0..4 to v[0..4] and nothing beyond.
89 BOTAN_FN_ISA_AVX512
void store(uint64_t v[5])
const { _mm512_mask_storeu_epi64(v, 0b00011111, m_v); }
// Wraps a raw __m512i. Used by every factory and operation above.
// NOTE(review): presumably private in the full file -- the access specifier is
// not visible in this excerpt.
92 explicit BOTAN_FN_ISA_AVX512 SIMD_5x64(__m512i v) : m_v(v) {}
// Computes one Keccak round from five SIMD_5x64 vectors A[0..4] and the round
// constant RC. The caller in this file loads A[i] from state words
// S[5*i .. 5*i+4], i.e. one vector per group of five consecutive words.
// NOTE(review): the excerpt ends at transpose5(); the statements that write
// T0..T4 back into A (and the closing brace) are not visible here -- confirm
// against the upstream file.
97inline void BOTAN_FN_ISA_AVX512 Keccak_Permutation_round_avx512(SIMD_5x64 A[5], uint64_t RC) {
// Lane x of C is the XOR of lane x across all five inputs.
98 const auto C = SIMD_5x64::xor5(A[0], A[1], A[2], A[3], A[4]);
// Lane x of D is C[(x + 4) % 5] ^ rotl(C[(x + 1) % 5], 1).
100 const auto D = C.permute<4, 0, 1, 2, 3>() ^ C.permute<1, 2, 3, 4, 0>().rol1();
// Each input is XORed with D, its lanes are reordered, and every lane is then
// rotated left by its own constant. After this step lane k of B0..B4 together
// holds the five words that feed output row k.
102 const auto B0 = (A[0] ^ D).permute<0, 3, 1, 4, 2>().rolv<0, 28, 1, 27, 62>();
103 const auto B1 = (A[1] ^ D).permute<1, 4, 2, 0, 3>().rolv<44, 20, 6, 36, 55>();
104 const auto B2 = (A[2] ^ D).permute<2, 0, 3, 1, 4>().rolv<43, 3, 25, 10, 39>();
105 const auto B3 = (A[3] ^ D).permute<3, 1, 4, 2, 0>().rolv<21, 45, 8, 15, 41>();
106 const auto B4 = (A[4] ^ D).permute<4, 2, 0, 3, 1>().rolv<14, 61, 18, 56, 2>();
// T<j> = B<j> ^ (~B<j+1> & B<j+2>) with indices taken mod 5. rc(RC) is non-zero
// only in lane 0, so the round constant is XORed into lane 0 of T0 alone.
108 auto T0 = SIMD_5x64::chi(B0, B1, B2) ^ SIMD_5x64::rc(RC);
109 auto T1 = SIMD_5x64::chi(B1, B2, B3);
110 auto T2 = SIMD_5x64::chi(B2, B3, B4);
111 auto T3 = SIMD_5x64::chi(B3, B4, B0);
112 auto T4 = SIMD_5x64::chi(B4, B0, B1);
// T<j> currently holds element j of every output row in its lanes; the
// transpose turns that into one output row per vector.
114 SIMD_5x64::transpose5(T0, T1, T2, T3, T4);
// Applies 24 rounds of Keccak_Permutation_round_avx512 to the 25-word state S:
// loads it as five 5-word vectors, runs the rounds, and stores the result back.
// NOTE(review): the declaration of S and several closing braces are not visible
// in this excerpt; S is presumably the object's 25-word uint64_t state --
// confirm against the upstream file.
125void BOTAN_FN_ISA_AVX512 Keccak_Permutation::permute_avx512() {
// One 64-bit round constant per round; RC[i] is passed to round i below.
126 static const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808A, 0x8000000080008000,
127 0x000000000000808B, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
128 0x000000000000008A, 0x0000000000000088, 0x0000000080008009, 0x000000008000000A,
129 0x000000008000808B, 0x800000000000008B, 0x8000000000008089, 0x8000000000008003,
130 0x8000000000008002, 0x8000000000000080, 0x000000000000800A, 0x800000008000000A,
131 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
// X[i] holds S[5*i .. 5*i+4] in lanes 0..4 (load() zeroes lanes 5..7).
135 std::array<SIMD_5x64, 5> X{
136 SIMD_5x64::load(&S[0]),
137 SIMD_5x64::load(&S[5]),
138 SIMD_5x64::load(&S[10]),
139 SIMD_5x64::load(&S[15]),
140 SIMD_5x64::load(&S[20]),
// Run all 24 rounds on the in-register copy of the state.
144 for(
size_t i = 0; i != 24; ++i) {
145 Keccak_Permutation_round_avx512(X.data(), RC[i]);
// Write each vector back to the five state words it was loaded from; the masked
// store touches only those five words.
148 for(
size_t i = 0; i != 5; ++i) {
149 X[i].store(&S[5 * i]);
OctetString operator^(const OctetString &k1, const OctetString &k2)
void R2(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)