Botan 3.11.0
Crypto and TLS for C++
keccak_perm_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2025 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/keccak_perm.h>
8
9#include <botan/internal/isa_extn.h>
10#include <immintrin.h>
11
12namespace Botan {
13
14namespace {
15
16class SIMD_5x64 final {
17 public:
18 explicit BOTAN_FN_ISA_AVX512 SIMD_5x64() : SIMD_5x64(_mm512_setzero_si512()) {}
19
20 static BOTAN_FN_ISA_AVX512 SIMD_5x64 rc(uint64_t RC) {
21 return SIMD_5x64(_mm512_maskz_set1_epi64(0b00000001, RC));
22 }
23
24 static BOTAN_FN_ISA_AVX512 SIMD_5x64 load(const uint64_t v[5]) {
25 return SIMD_5x64(_mm512_maskz_loadu_epi64(0b00011111, v));
26 }
27
28 template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4>
29 inline BOTAN_FN_ISA_AVX512 SIMD_5x64 permute() const {
30 static_assert(I0 < 5 && I1 < 5 && I2 < 5 && I3 < 5 && I4 < 5);
31 const __m512i tbl = _mm512_setr_epi64(I0, I1, I2, I3, I4, 0, 0, 0);
32 return SIMD_5x64(_mm512_permutexvar_epi64(tbl, m_v));
33 }
34
35 static BOTAN_FN_ISA_AVX512 void transpose5(
36 SIMD_5x64& i0, SIMD_5x64& i1, SIMD_5x64& i2, SIMD_5x64& i3, SIMD_5x64& i4) {
37 // 5x5 u64 transpose using 7 permutex2var, 4 unpack, 1 blend, 5 constants
38
39 const auto lo_01 = _mm512_unpacklo_epi64(i0.m_v, i1.m_v);
40 const auto lo_23 = _mm512_unpacklo_epi64(i2.m_v, i3.m_v);
41
42 const auto hi_01 = _mm512_unpackhi_epi64(i0.m_v, i1.m_v);
43 const auto hi_23 = _mm512_unpackhi_epi64(i2.m_v, i3.m_v);
44
45 // Insert the relevant words from i4 into the i0/i1 data
46 const auto i4_lo_idx = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 8, 10);
47 const auto i4_hi_idx = _mm512_setr_epi64(0, 1, 2, 3, -1, -1, 9, 11);
48
49 auto t0 = _mm512_permutex2var_epi64(lo_01, i4_lo_idx, i4.m_v);
50 auto t2 = _mm512_permutex2var_epi64(hi_01, i4_hi_idx, i4.m_v);
51
52 // Now merge the 0/1/4 and 2/3 vectors using permutes
53 const auto idx0 = _mm512_setr_epi64(0, 1, 8, 9, 6, -1, -1, -1);
54 const auto idx1 = _mm512_setr_epi64(2, 3, 10, 11, 7, -1, -1, -1);
55 const auto idx4 = _mm512_setr_epi64(4, 5, 12, 13, -1, -1, -1, -1);
56
57 i0.m_v = _mm512_permutex2var_epi64(t0, idx0, lo_23);
58 i1.m_v = _mm512_permutex2var_epi64(t2, idx0, hi_23);
59 i2.m_v = _mm512_permutex2var_epi64(t0, idx1, lo_23);
60 i3.m_v = _mm512_permutex2var_epi64(t2, idx1, hi_23);
61 i4.m_v = _mm512_mask_blend_epi64(0b00010000, _mm512_permutex2var_epi64(t0, idx4, lo_23), i4.m_v);
62 }
63
64 static BOTAN_FN_ISA_AVX512 SIMD_5x64 chi(const SIMD_5x64& x, const SIMD_5x64& y, const SIMD_5x64& z) {
65 constexpr uint8_t xor_not_and = 0b11010010; // (x ^ (~y & z))
66 return SIMD_5x64(_mm512_ternarylogic_epi64(x.m_v, y.m_v, z.m_v, xor_not_and));
67 }
68
69 friend BOTAN_FN_ISA_AVX512 SIMD_5x64 operator^(const SIMD_5x64& x, const SIMD_5x64& y) {
70 return SIMD_5x64(_mm512_xor_epi64(x.m_v, y.m_v));
71 }
72
73 static BOTAN_FN_ISA_AVX512 SIMD_5x64
74 xor5(const SIMD_5x64& i0, const SIMD_5x64& i1, const SIMD_5x64& i2, const SIMD_5x64& i3, const SIMD_5x64& i4) {
75 constexpr uint8_t tern_xor = 0b10010110;
76 auto t = _mm512_ternarylogic_epi64(i0.m_v, i1.m_v, i2.m_v, tern_xor);
77 return SIMD_5x64(_mm512_ternarylogic_epi64(i3.m_v, i4.m_v, t, tern_xor));
78 }
79
80 BOTAN_FN_ISA_AVX512 SIMD_5x64 rol1() const { return SIMD_5x64(_mm512_rol_epi64(m_v, 1)); }
81
82 template <size_t R0, size_t R1, size_t R2, size_t R3, size_t R4>
83 BOTAN_FN_ISA_AVX512 SIMD_5x64 rolv() const {
84 static_assert(R0 < 64 && R1 < 64 && R2 < 64 && R3 < 64 && R4 < 64);
85 const __m512i rot = _mm512_setr_epi64(R0, R1, R2, R3, R4, 0, 0, 0);
86 return SIMD_5x64(_mm512_rolv_epi64(m_v, rot));
87 }
88
89 BOTAN_FN_ISA_AVX512 void store(uint64_t v[5]) const { _mm512_mask_storeu_epi64(v, 0b00011111, m_v); }
90
91 private:
92 explicit BOTAN_FN_ISA_AVX512 SIMD_5x64(__m512i v) : m_v(v) {}
93
94 __m512i m_v;
95};
96
97inline void BOTAN_FN_ISA_AVX512 Keccak_Permutation_round_avx512(SIMD_5x64 A[5], uint64_t RC) {
98 const auto C = SIMD_5x64::xor5(A[0], A[1], A[2], A[3], A[4]);
99
100 const auto D = C.permute<4, 0, 1, 2, 3>() ^ C.permute<1, 2, 3, 4, 0>().rol1();
101
102 const auto B0 = (A[0] ^ D).permute<0, 3, 1, 4, 2>().rolv<0, 28, 1, 27, 62>();
103 const auto B1 = (A[1] ^ D).permute<1, 4, 2, 0, 3>().rolv<44, 20, 6, 36, 55>();
104 const auto B2 = (A[2] ^ D).permute<2, 0, 3, 1, 4>().rolv<43, 3, 25, 10, 39>();
105 const auto B3 = (A[3] ^ D).permute<3, 1, 4, 2, 0>().rolv<21, 45, 8, 15, 41>();
106 const auto B4 = (A[4] ^ D).permute<4, 2, 0, 3, 1>().rolv<14, 61, 18, 56, 2>();
107
108 auto T0 = SIMD_5x64::chi(B0, B1, B2) ^ SIMD_5x64::rc(RC);
109 auto T1 = SIMD_5x64::chi(B1, B2, B3);
110 auto T2 = SIMD_5x64::chi(B2, B3, B4);
111 auto T3 = SIMD_5x64::chi(B3, B4, B0);
112 auto T4 = SIMD_5x64::chi(B4, B0, B1);
113
114 SIMD_5x64::transpose5(T0, T1, T2, T3, T4);
115
116 A[0] = T0;
117 A[1] = T1;
118 A[2] = T2;
119 A[3] = T3;
120 A[4] = T4;
121}
122
123} // namespace
124
125void BOTAN_FN_ISA_AVX512 Keccak_Permutation::permute_avx512() {
126 static const uint64_t RC[24] = {0x0000000000000001, 0x0000000000008082, 0x800000000000808A, 0x8000000080008000,
127 0x000000000000808B, 0x0000000080000001, 0x8000000080008081, 0x8000000000008009,
128 0x000000000000008A, 0x0000000000000088, 0x0000000080008009, 0x000000008000000A,
129 0x000000008000808B, 0x800000000000008B, 0x8000000000008089, 0x8000000000008003,
130 0x8000000000008002, 0x8000000000000080, 0x000000000000800A, 0x800000008000000A,
131 0x8000000080008081, 0x8000000000008080, 0x0000000080000001, 0x8000000080008008};
132
133 auto& S = state();
134
135 std::array<SIMD_5x64, 5> X{
136 SIMD_5x64::load(&S[0]), // NOLINT(*container-data-pointer)
137 SIMD_5x64::load(&S[5]),
138 SIMD_5x64::load(&S[10]),
139 SIMD_5x64::load(&S[15]),
140 SIMD_5x64::load(&S[20]),
141 };
142
143 // NOLINTNEXTLINE(modernize-loop-convert)
144 for(size_t i = 0; i != 24; ++i) {
145 Keccak_Permutation_round_avx512(X.data(), RC[i]);
146 }
147
148 for(size_t i = 0; i != 5; ++i) {
149 X[i].store(&S[5 * i]);
150 }
151}
152
153} // namespace Botan
constexpr auto & state()
Definition sponge.h:55
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
void R2(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:43
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:21