Botan 3.11.0
Crypto and TLS for C&&
ghash_avx512_clmul.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/ghash.h>
8
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/polyval_fn.h>
11#include <botan/internal/simd_4x32.h>
12#include <botan/internal/target_info.h>
13#include <immintrin.h>
14
15namespace Botan {
16
17namespace {
18
19BOTAN_FORCE_INLINE __m512i BOTAN_FN_ISA_AVX512_CLMUL fold(__m512i H) {
20 return _mm512_xor_si512(H, _mm512_bsrli_epi128(H, 8));
21}
22
23BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_AVX512_CLMUL reduce_xor(__m512i z) {
24 auto y = _mm256_xor_si256(_mm512_castsi512_si256(z), _mm512_extracti64x4_epi64(z, 0x1));
25 auto x = _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extracti32x4_epi32(y, 0x1));
26 return SIMD_4x32(x);
27}
28
29BOTAN_FORCE_INLINE void BOTAN_FN_ISA_AVX512_CLMUL
30ghash_x4_accum(__m512i H, __m512i H_fold, __m512i M, __m512i& lo, __m512i& hi, __m512i& mid) {
31 lo = _mm512_xor_si512(lo, _mm512_clmulepi64_epi128(H, M, 0x00));
32 hi = _mm512_xor_si512(hi, _mm512_clmulepi64_epi128(H, M, 0x11));
33 mid = _mm512_xor_si512(mid, _mm512_clmulepi64_epi128(H_fold, fold(M), 0x00));
34}
35
36BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_AVX512_CLMUL ghash_reduce(__m512i lo, __m512i hi, __m512i mid) {
37 mid = _mm512_ternarylogic_epi64(lo, mid, hi, 0x96); // mid ^= lo ^ hi
38 hi = _mm512_xor_si512(hi, _mm512_bsrli_epi128(mid, 8));
39 lo = _mm512_xor_si512(lo, _mm512_bslli_epi128(mid, 8));
40 return polyval_reduce(reduce_xor(hi), reduce_xor(lo));
41}
42
43BOTAN_FORCE_INLINE __m512i BOTAN_FN_ISA_AVX512_CLMUL insert_a(__m512i M, const SIMD_4x32& a) {
44 return _mm512_xor_epi64(M, _mm512_inserti64x2(_mm512_setzero_si512(), a.raw(), 0));
45}
46
47} // namespace
48
49void BOTAN_FN_ISA_AVX512_CLMUL GHASH::ghash_precompute_avx512_clmul(const uint8_t H_bytes[16], uint64_t H_pow[16 * 2]) {
50 const SIMD_4x32 H1 = mulx_polyval(reverse_vector(SIMD_4x32::load_le(H_bytes)));
51
52 const SIMD_4x32 H2 = polyval_multiply(H1, H1);
53 const SIMD_4x32 H3 = polyval_multiply(H1, H2);
54 const SIMD_4x32 H4 = polyval_multiply(H2, H2);
55
56 const SIMD_4x32 H5 = polyval_multiply(H4, H1);
57 const SIMD_4x32 H6 = polyval_multiply(H4, H2);
58 const SIMD_4x32 H7 = polyval_multiply(H4, H3);
59 const SIMD_4x32 H8 = polyval_multiply(H4, H4);
60
61 const SIMD_4x32 H9 = polyval_multiply(H8, H1);
62 const SIMD_4x32 H10 = polyval_multiply(H8, H2);
63 const SIMD_4x32 H11 = polyval_multiply(H8, H3);
64 const SIMD_4x32 H12 = polyval_multiply(H8, H4);
65
66 const SIMD_4x32 H13 = polyval_multiply(H8, H5);
67 const SIMD_4x32 H14 = polyval_multiply(H8, H6);
68 const SIMD_4x32 H15 = polyval_multiply(H8, H7);
69 const SIMD_4x32 H16 = polyval_multiply(H8, H8);
70
71 // Store in reversed order in blocks of 4 so that the zmm load
72 // of H powers matches up with the message blocks
73 H4.store_le(H_pow);
74 H3.store_le(H_pow + 2);
75 H2.store_le(H_pow + 4);
76 H1.store_le(H_pow + 6);
77
78 H8.store_le(H_pow + 8);
79 H7.store_le(H_pow + 10);
80 H6.store_le(H_pow + 12);
81 H5.store_le(H_pow + 14);
82
83 H12.store_le(H_pow + 16);
84 H11.store_le(H_pow + 18);
85 H10.store_le(H_pow + 20);
86 H9.store_le(H_pow + 22);
87
88 H16.store_le(H_pow + 24);
89 H15.store_le(H_pow + 26);
90 H14.store_le(H_pow + 28);
91 H13.store_le(H_pow + 30);
92}
93
94void BOTAN_FN_ISA_AVX512_CLMUL GHASH::ghash_multiply_avx512_clmul(uint8_t x[16],
95 const uint64_t H_pow[16 * 2],
96 const uint8_t input[],
97 size_t blocks) {
98 SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
99
100 // Byte swap each lane
101 const auto BSWAP = _mm512_set_epi64(0x0001020304050607,
102 0x08090A0B0C0D0E0F,
103 0x0001020304050607,
104 0x08090A0B0C0D0E0F,
105 0x0001020304050607,
106 0x08090A0B0C0D0E0F,
107 0x0001020304050607,
108 0x08090A0B0C0D0E0F);
109
110 if(blocks >= 16) {
111 const auto H1 = _mm512_loadu_si512(H_pow); // [H4,H3,H2,H1]
112 const auto H2 = _mm512_loadu_si512(H_pow + 8); // [H8,H7,H6,H5]
113 const auto H3 = _mm512_loadu_si512(H_pow + 16); // [H12,H11,H10,H9]
114 const auto H4 = _mm512_loadu_si512(H_pow + 24); // [H16,H15,H14,H13]
115
116 // Precompute H folds (H ^ (H >> 64)) for Karatsuba - loop invariant
117 const auto H1_fold = fold(H1);
118 const auto H2_fold = fold(H2);
119 const auto H3_fold = fold(H3);
120 const auto H4_fold = fold(H4);
121
122 while(blocks >= 16) {
123 __m512i M1 = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
124 const auto M2 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 64), BSWAP);
125 const auto M3 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 128), BSWAP);
126 const auto M4 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 192), BSWAP);
127
128 M1 = insert_a(M1, a);
129
130 auto lo = _mm512_setzero_si512();
131 auto hi = _mm512_setzero_si512();
132 auto mid = _mm512_setzero_si512();
133
134 ghash_x4_accum(H4, H4_fold, M1, lo, hi, mid);
135 ghash_x4_accum(H3, H3_fold, M2, lo, hi, mid);
136 ghash_x4_accum(H2, H2_fold, M3, lo, hi, mid);
137 ghash_x4_accum(H1, H1_fold, M4, lo, hi, mid);
138
139 a = ghash_reduce(lo, hi, mid);
140
141 input += 16 * 16;
142 blocks -= 16;
143 }
144 }
145
146 if(blocks >= 8) {
147 const auto H1 = _mm512_loadu_si512(H_pow); // [H4,H3,H2,H1]
148 const auto H2 = _mm512_loadu_si512(H_pow + 8); // [H8,H7,H6,H5]
149
150 const auto H1_fold = fold(H1);
151 const auto H2_fold = fold(H2);
152
153 while(blocks >= 8) {
154 __m512i M1 = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
155 const __m512i M2 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 64), BSWAP);
156
157 M1 = insert_a(M1, a);
158
159 auto lo = _mm512_setzero_si512();
160 auto hi = _mm512_setzero_si512();
161 auto mid = _mm512_setzero_si512();
162
163 ghash_x4_accum(H2, H2_fold, M1, lo, hi, mid);
164 ghash_x4_accum(H1, H1_fold, M2, lo, hi, mid);
165
166 a = ghash_reduce(lo, hi, mid);
167
168 input += 8 * 16;
169 blocks -= 8;
170 }
171 }
172
173 if(blocks >= 4) {
174 const auto H1 = _mm512_loadu_si512(H_pow); // [H4,H3,H2,H1]
175 const auto H1_fold = fold(H1);
176
177 while(blocks >= 4) {
178 __m512i M = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
179 M = insert_a(M, a);
180
181 auto lo = _mm512_clmulepi64_epi128(H1, M, 0x00);
182 auto hi = _mm512_clmulepi64_epi128(H1, M, 0x11);
183 auto mid = _mm512_clmulepi64_epi128(H1_fold, fold(M), 0x00);
184
185 a = ghash_reduce(lo, hi, mid);
186
187 input += 4 * 16;
188 blocks -= 4;
189 }
190 }
191
192 if(blocks > 0) {
193 // H1 is at offset 6 in the reversed layout [H4,H3,H2,H1,...]
194 const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow + 6);
195
196 for(size_t i = 0; i != blocks; ++i) {
197 const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16 * i));
198 a ^= m;
199 a = polyval_multiply(H1, a);
200 }
201 }
202
203 a = reverse_vector(a);
204 a.store_le(x);
205}
206
207} // namespace Botan
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
Definition simd_4x32.h:162
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 mulx_polyval(const SIMD_4x32 &h)
Definition polyval_fn.h:92
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_CLMUL polyval_multiply(const SIMD_4x32 &H, const SIMD_4x32 &x)
Definition polyval_fn.h:128
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_CLMUL polyval_reduce(const SIMD_4x32 &hi, const SIMD_4x32 &lo)
Definition polyval_fn.h:107
BOTAN_FORCE_INLINE BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 reverse_vector(const SIMD_4x32 &in)
Definition polyval_fn.h:16