7#include <botan/internal/ghash.h>
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/polyval_fn.h>
11#include <botan/internal/simd_4x32.h>
12#include <botan/internal/target_info.h>
// NOTE(review): only the return statement of this helper is visible; the
// enclosing function header is outside this view. Judging by the call sites
// `fold(H1)` / `fold(M)` further down, this is presumably the body of fold().
//
// Within each 128-bit lane, H is XORed with a copy of itself shifted right by
// 8 bytes. The low 64 bits of every lane therefore become (low ^ high), while
// the high 64 bits are left unchanged (they are XORed with zero).
20 return _mm512_xor_si512(H, _mm512_bsrli_epi128(H, 8));
// NOTE(review): the enclosing function header and its return statement are not
// visible here; only these two statements are.
//
// Horizontal XOR of the four 128-bit lanes of z:
// first the upper 256 bits are XORed onto the lower 256 bits ...
24 auto y = _mm256_xor_si256(_mm512_castsi512_si256(z), _mm512_extracti64x4_epi64(z, 0x1));
// ... then the upper 128 bits of that result onto the lower 128 bits, so x
// ends up as lane0 ^ lane1 ^ lane2 ^ lane3 of z.
25 auto x = _mm_xor_si128(_mm256_castsi256_si128(y), _mm256_extracti32x4_epi32(y, 0x1));
// Accumulates the carry-less partial products of four 128-bit lanes into the
// running sums lo / hi / mid (all updated in place by XOR). No reduction is
// performed here; the caller combines and reduces the sums afterwards.
//
//   H       four 128-bit multiplier values, one per lane
//   H_fold  the callers pass fold(H) here, computed once outside the loop
//   M       four 128-bit message values, one per lane
//   lo      accumulator for the low-half  x low-half  products
//   hi      accumulator for the high-half x high-half products
//   mid     accumulator for the product of the folded operands
//
// NOTE(review): the return type / attribute line and the closing brace of this
// function are outside the visible lines.
30ghash_x4_accum(__m512i H, __m512i H_fold, __m512i M, __m512i& lo, __m512i& hi, __m512i& mid) {
// imm 0x00: multiply the low 64 bits of each lane of H and M.
31 lo = _mm512_xor_si512(lo, _mm512_clmulepi64_epi128(H, M, 0x00));
// imm 0x11: multiply the high 64 bits of each lane of H and M.
32 hi = _mm512_xor_si512(hi, _mm512_clmulepi64_epi128(H, M, 0x11));
// Low halves of the folded operands. If fold() yields (low ^ high) in the low
// half of each lane (presumably - see its definition), this is the third
// product of a Karatsuba-style 128x128 carry-less multiplication.
33 mid = _mm512_xor_si512(mid, _mm512_clmulepi64_epi128(H_fold, fold(M), 0x00));
// NOTE(review): function header and return statement are not visible. The
// variable names match the call `ghash_reduce(lo, hi, mid)` used below, so
// this is presumably the start of that helper.
//
// Truth table 0x96 is a three-input XOR: mid = lo ^ mid ^ hi.
37 mid = _mm512_ternarylogic_epi64(lo, mid, hi, 0x96);
// Distribute the combined middle term over the two halves of the result, per
// 128-bit lane: its upper 8 bytes are XORed into the low 8 bytes of hi ...
38 hi = _mm512_xor_si512(hi, _mm512_bsrli_epi128(mid, 8));
// ... and its lower 8 bytes are XORed into the high 8 bytes of lo.
39 lo = _mm512_xor_si512(lo, _mm512_bslli_epi128(mid, 8));
// NOTE(review): only the return statement is visible; based on the call
// `insert_a(M1, a)` below this is presumably the body of insert_a().
//
// Builds a 512-bit vector that is zero except for 128-bit lane 0, which holds
// a.raw(), and XORs it into M. Net effect: `a` is XORed into the first lane of
// M and the other three lanes pass through unchanged.
44 return _mm512_xor_epi64(M, _mm512_inserti64x2(_mm512_setzero_si512(), a.raw(), 0));
// Fills the H_pow table (16 entries of two uint64_t each) from the 16-byte
// key block H_bytes.
//
// NOTE(review): the code that derives H1..H16 from H_bytes, the store to
// H_pow + 0 and the closing brace are outside the visible lines. H<n> is
// presumably the n-th power of the hash key - confirm against the full source.
//
// Visible layout: entries are written in groups of four (8 uint64_t = one
// 512-bit vector), and inside each group the higher index comes first.
49void BOTAN_FN_ISA_AVX512_CLMUL GHASH::ghash_precompute_avx512_clmul(
const uint8_t H_bytes[16], uint64_t H_pow[16 * 2]) {
// Group 0 (H_pow[0..7]); the first entry of the group (offset 0, presumably
// H4) is stored on a line that is not shown.
74 H3.store_le(H_pow + 2);
75 H2.store_le(H_pow + 4);
76 H1.store_le(H_pow + 6);
// Group 1 (H_pow[8..15]): H8, H7, H6, H5.
78 H8.store_le(H_pow + 8);
79 H7.store_le(H_pow + 10);
80 H6.store_le(H_pow + 12);
81 H5.store_le(H_pow + 14);
// Group 2 (H_pow[16..23]): H12, H11, H10, H9.
83 H12.store_le(H_pow + 16);
84 H11.store_le(H_pow + 18);
85 H10.store_le(H_pow + 20);
86 H9.store_le(H_pow + 22);
// Group 3 (H_pow[24..31]): H16, H15, H14, H13.
88 H16.store_le(H_pow + 24);
89 H15.store_le(H_pow + 26);
90 H14.store_le(H_pow + 28);
91 H13.store_le(H_pow + 30);
// Bulk GHASH update using 512-bit carry-less multiplies.
//
//   x      16-byte buffer (its use is on lines not shown; presumably the
//          running hash state that is loaded into / stored from `a`)
//   H_pow  table of 16 x 128-bit values as laid out by
//          ghash_precompute_avx512_clmul
//   input  message bytes, consumed in 64-byte (4-block) vectors
//
// NOTE(review): this view is heavily truncated - the rest of the parameter
// list (including `blocks`), the declaration of `a`, the guards around the
// 8-block and 4-block sections, all `input` / `blocks` bookkeeping and the
// body of the final loop are not visible.
94void BOTAN_FN_ISA_AVX512_CLMUL GHASH::ghash_multiply_avx512_clmul(uint8_t x[16],
95 const uint64_t H_pow[16 * 2],
96 const uint8_t input[],
// Shuffle control for _mm512_shuffle_epi8; only its first argument is visible.
// The descending byte indices suggest a per-lane byte reversal (presumably).
101 const auto BSWAP = _mm512_set_epi64(0x0001020304050607,
// Each load below fetches one 512-bit group (8 uint64_t) of the table. Note
// that here H1..H4 name whole groups, not single table entries: H1 is group 0,
// H4 is group 3.
111 const auto H1 = _mm512_loadu_si512(H_pow);
112 const auto H2 = _mm512_loadu_si512(H_pow + 8);
113 const auto H3 = _mm512_loadu_si512(H_pow + 16);
114 const auto H4 = _mm512_loadu_si512(H_pow + 24);
// Folded copies are loop-invariant, so they are computed once up front.
117 const auto H1_fold = fold(H1);
118 const auto H2_fold = fold(H2);
119 const auto H3_fold = fold(H3);
120 const auto H4_fold = fold(H4);
// Main loop: 16 blocks (4 x 64 bytes) per iteration.
122 while(blocks >= 16) {
123 __m512i M1 = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
124 const auto M2 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 64), BSWAP);
125 const auto M3 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 128), BSWAP);
126 const auto M4 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 192), BSWAP);
// Mix the accumulator `a` into the first vector before multiplying.
128 M1 = insert_a(M1, a);
130 auto lo = _mm512_setzero_si512();
131 auto hi = _mm512_setzero_si512();
132 auto mid = _mm512_setzero_si512();
// The earliest input vector is paired with the last table group (H4) and the
// latest input vector with the first group (H1).
134 ghash_x4_accum(H4, H4_fold, M1, lo, hi, mid);
135 ghash_x4_accum(H3, H3_fold, M2, lo, hi, mid);
136 ghash_x4_accum(H2, H2_fold, M3, lo, hi, mid);
137 ghash_x4_accum(H1, H1_fold, M4, lo, hi, mid);
// One combined reduction for all 16 accumulated products.
139 a = ghash_reduce(lo, hi, mid);
// 8-block section (2 x 64 bytes). H1/H2 are declared again, so this must sit
// in its own scope; the enclosing condition is not visible.
147 const auto H1 = _mm512_loadu_si512(H_pow);
148 const auto H2 = _mm512_loadu_si512(H_pow + 8);
150 const auto H1_fold = fold(H1);
151 const auto H2_fold = fold(H2);
154 __m512i M1 = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
155 const __m512i M2 = _mm512_shuffle_epi8(_mm512_loadu_si512(input + 64), BSWAP);
157 M1 = insert_a(M1, a);
159 auto lo = _mm512_setzero_si512();
160 auto hi = _mm512_setzero_si512();
161 auto mid = _mm512_setzero_si512();
163 ghash_x4_accum(H2, H2_fold, M1, lo, hi, mid);
164 ghash_x4_accum(H1, H1_fold, M2, lo, hi, mid);
166 a = ghash_reduce(lo, hi, mid);
// 4-block section (one 64-byte vector), again in a scope of its own.
174 const auto H1 = _mm512_loadu_si512(H_pow);
175 const auto H1_fold = fold(H1);
// M is non-const, unlike the later vectors above - presumably `a` is mixed in
// on a line that is not shown (TODO confirm).
178 __m512i M = _mm512_shuffle_epi8(_mm512_loadu_si512(input), BSWAP);
// Same three products as ghash_x4_accum, written out directly because there
// is nothing to accumulate onto.
181 auto lo = _mm512_clmulepi64_epi128(H1, M, 0x00);
182 auto hi = _mm512_clmulepi64_epi128(H1, M, 0x11);
183 auto mid = _mm512_clmulepi64_epi128(H1_fold, fold(M), 0x00);
185 a = ghash_reduce(lo, hi, mid);
// Loop over `blocks` one at a time; its body is not visible here (presumably
// it handles the blocks left over after the vectorised sections).
196 for(
size_t i = 0; i != blocks; ++i) {
// NOTE(review): the lines below are bare signatures with no bodies and do not
// share the numbering of the code above; they look like excerpts of helper
// declarations from the included SIMD / POLYVAL headers. Nothing about their
// behaviour can be confirmed from this view.

// Static factory taking an untyped pointer and returning a SIMD_4x32; declared noexcept.
static SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 load_le(const void *in) noexcept
// Defined empty here, so the BOTAN_FORCE_INLINE prefix on the signatures below
// expands to nothing in this context.
#define BOTAN_FORCE_INLINE
// Takes one SIMD_4x32 by const reference and returns a SIMD_4x32.
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_SIMD_4X32 mulx_polyval(const SIMD_4x32 &h)
// Takes H and x, returns a SIMD_4x32; tagged with the CLMUL ISA attribute.
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_CLMUL polyval_multiply(const SIMD_4x32 &H, const SIMD_4x32 &x)
// Takes the high and low halves (hi, lo) of a value and returns a SIMD_4x32;
// presumably the polynomial reduction step - confirm against its definition.
BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FN_ISA_CLMUL polyval_reduce(const SIMD_4x32 &hi, const SIMD_4x32 &lo)
// Takes one SIMD_4x32 and returns a SIMD_4x32. Note the attribute macro comes
// before the return type here, unlike the three signatures above.
BOTAN_FORCE_INLINE BOTAN_FN_ISA_SIMD_4X32 SIMD_4x32 reverse_vector(const SIMD_4x32 &in)