Botan 3.7.1
Crypto and TLS for C&&
ghash_cpu.cpp
Go to the documentation of this file.
1/*
2* Hook for CLMUL/PMULL/VPMSUM
3* (C) 2013,2017,2019,2020 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/ghash.h>
9
10#include <botan/internal/simd_32.h>
11
12#if defined(BOTAN_SIMD_USE_SSE2)
13 #include <immintrin.h>
14 #include <wmmintrin.h>
15#endif
16
17namespace Botan {
18
19namespace {
20
21BOTAN_FUNC_ISA_INLINE(BOTAN_VPERM_ISA) SIMD_4x32 reverse_vector(const SIMD_4x32& in) {
22#if defined(BOTAN_SIMD_USE_SSE2)
23 const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
24 return SIMD_4x32(_mm_shuffle_epi8(in.raw(), BSWAP_MASK));
25#elif defined(BOTAN_SIMD_USE_NEON)
26 const uint8_t maskb[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
27 const uint8x16_t mask = vld1q_u8(maskb);
28 return SIMD_4x32(vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(in.raw()), mask)));
29#elif defined(BOTAN_SIMD_USE_ALTIVEC)
30 const __vector unsigned char mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
31 return SIMD_4x32(vec_perm(in.raw(), in.raw(), mask));
32#endif
33}
34
35template <int M>
36BOTAN_FORCE_INLINE SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) clmul(const SIMD_4x32& H, const SIMD_4x32& x) {
37 static_assert(M == 0x00 || M == 0x01 || M == 0x10 || M == 0x11, "Valid clmul mode");
38
39#if defined(BOTAN_SIMD_USE_SSE2)
40 return SIMD_4x32(_mm_clmulepi64_si128(x.raw(), H.raw(), M));
41#elif defined(BOTAN_SIMD_USE_NEON)
42 const uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u32(x.raw()), M & 0x01);
43 const uint64_t b = vgetq_lane_u64(vreinterpretq_u64_u32(H.raw()), (M & 0x10) >> 4);
44
45 #if defined(BOTAN_BUILD_COMPILER_IS_MSVC)
46 __n64 a1 = {a}, b1 = {b};
47 return SIMD_4x32(vmull_p64(a1, b1));
48 #else
49 return SIMD_4x32(reinterpret_cast<uint32x4_t>(vmull_p64(a, b)));
50 #endif
51
52#elif defined(BOTAN_SIMD_USE_ALTIVEC)
53 const SIMD_4x32 mask_lo = SIMD_4x32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF);
54
55 SIMD_4x32 i1 = x;
56 SIMD_4x32 i2 = H;
57
58 if(M == 0x11) {
59 i1 &= mask_lo;
60 i2 &= mask_lo;
61 } else if(M == 0x10) {
62 i1 = i1.shift_elems_left<2>();
63 } else if(M == 0x01) {
64 i2 = i2.shift_elems_left<2>();
65 } else if(M == 0x00) {
66 i1 = mask_lo.andc(i1);
67 i2 = mask_lo.andc(i2);
68 }
69
70 auto i1v = reinterpret_cast<__vector unsigned long long>(i1.raw());
71 auto i2v = reinterpret_cast<__vector unsigned long long>(i2.raw());
72
73 #if BOTAN_COMPILER_HAS_BUILTIN(__builtin_crypto_vpmsumd)
74 auto rv = __builtin_crypto_vpmsumd(i1v, i2v);
75 #else
76 auto rv = __builtin_altivec_crypto_vpmsumd(i1v, i2v);
77 #endif
78
79 return SIMD_4x32(reinterpret_cast<__vector unsigned int>(rv));
80#endif
81}
82
83inline SIMD_4x32 gcm_reduce(const SIMD_4x32& B0, const SIMD_4x32& B1) {
84 SIMD_4x32 X0 = B1.shr<31>();
85 SIMD_4x32 X1 = B1.shl<1>();
86 SIMD_4x32 X2 = B0.shr<31>();
87 SIMD_4x32 X3 = B0.shl<1>();
88
89 X3 |= X0.shift_elems_right<3>();
90 X3 |= X2.shift_elems_left<1>();
91 X1 |= X0.shift_elems_left<1>();
92
93 X0 = X1.shl<31>() ^ X1.shl<30>() ^ X1.shl<25>();
94
95 X1 ^= X0.shift_elems_left<3>();
96
97 X0 = X1 ^ X3 ^ X0.shift_elems_right<1>();
98 X0 ^= X1.shr<7>() ^ X1.shr<2>() ^ X1.shr<1>();
99 return X0;
100}
101
102inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply(const SIMD_4x32& H, const SIMD_4x32& x) {
103 SIMD_4x32 T0 = clmul<0x11>(H, x);
104 SIMD_4x32 T1 = clmul<0x10>(H, x);
105 SIMD_4x32 T2 = clmul<0x01>(H, x);
106 SIMD_4x32 T3 = clmul<0x00>(H, x);
107
108 T1 ^= T2;
109 T0 ^= T1.shift_elems_right<2>();
110 T3 ^= T1.shift_elems_left<2>();
111
112 return gcm_reduce(T0, T3);
113}
114
115inline SIMD_4x32 BOTAN_FUNC_ISA(BOTAN_CLMUL_ISA) gcm_multiply_x4(const SIMD_4x32& H1,
116 const SIMD_4x32& H2,
117 const SIMD_4x32& H3,
118 const SIMD_4x32& H4,
119 const SIMD_4x32& X1,
120 const SIMD_4x32& X2,
121 const SIMD_4x32& X3,
122 const SIMD_4x32& X4) {
123 /*
124 * Mutiply with delayed reduction, algorithm by Krzysztof Jankowski
125 * and Pierre Laurent of Intel
126 */
127
128 const SIMD_4x32 lo = (clmul<0x00>(H1, X1) ^ clmul<0x00>(H2, X2)) ^ (clmul<0x00>(H3, X3) ^ clmul<0x00>(H4, X4));
129
130 const SIMD_4x32 hi = (clmul<0x11>(H1, X1) ^ clmul<0x11>(H2, X2)) ^ (clmul<0x11>(H3, X3) ^ clmul<0x11>(H4, X4));
131
132 SIMD_4x32 T;
133
134 T ^= clmul<0x00>(H1 ^ H1.shift_elems_right<2>(), X1 ^ X1.shift_elems_right<2>());
135 T ^= clmul<0x00>(H2 ^ H2.shift_elems_right<2>(), X2 ^ X2.shift_elems_right<2>());
136 T ^= clmul<0x00>(H3 ^ H3.shift_elems_right<2>(), X3 ^ X3.shift_elems_right<2>());
137 T ^= clmul<0x00>(H4 ^ H4.shift_elems_right<2>(), X4 ^ X4.shift_elems_right<2>());
138 T ^= lo;
139 T ^= hi;
140
141 return gcm_reduce(hi ^ T.shift_elems_right<2>(), lo ^ T.shift_elems_left<2>());
142}
143
144} // namespace
145
146BOTAN_FUNC_ISA(BOTAN_VPERM_ISA) void GHASH::ghash_precompute_cpu(const uint8_t H_bytes[16], uint64_t H_pow[4 * 2]) {
147 const SIMD_4x32 H1 = reverse_vector(SIMD_4x32::load_le(H_bytes));
148 const SIMD_4x32 H2 = gcm_multiply(H1, H1);
149 const SIMD_4x32 H3 = gcm_multiply(H1, H2);
150 const SIMD_4x32 H4 = gcm_multiply(H2, H2);
151
152 H1.store_le(H_pow);
153 H2.store_le(H_pow + 2);
154 H3.store_le(H_pow + 4);
155 H4.store_le(H_pow + 6);
156}
157
158BOTAN_FUNC_ISA(BOTAN_VPERM_ISA)
159void GHASH::ghash_multiply_cpu(uint8_t x[16], const uint64_t H_pow[8], const uint8_t input[], size_t blocks) {
160 /*
161 * Algorithms 1 and 5 from Intel's CLMUL guide
162 */
163 const SIMD_4x32 H1 = SIMD_4x32::load_le(H_pow);
164
165 SIMD_4x32 a = reverse_vector(SIMD_4x32::load_le(x));
166
167 if(blocks >= 4) {
168 const SIMD_4x32 H2 = SIMD_4x32::load_le(H_pow + 2);
169 const SIMD_4x32 H3 = SIMD_4x32::load_le(H_pow + 4);
170 const SIMD_4x32 H4 = SIMD_4x32::load_le(H_pow + 6);
171
172 while(blocks >= 4) {
173 const SIMD_4x32 m0 = reverse_vector(SIMD_4x32::load_le(input));
174 const SIMD_4x32 m1 = reverse_vector(SIMD_4x32::load_le(input + 16 * 1));
175 const SIMD_4x32 m2 = reverse_vector(SIMD_4x32::load_le(input + 16 * 2));
176 const SIMD_4x32 m3 = reverse_vector(SIMD_4x32::load_le(input + 16 * 3));
177
178 a ^= m0;
179 a = gcm_multiply_x4(H1, H2, H3, H4, m3, m2, m1, a);
180
181 input += 4 * 16;
182 blocks -= 4;
183 }
184 }
185
186 for(size_t i = 0; i != blocks; ++i) {
187 const SIMD_4x32 m = reverse_vector(SIMD_4x32::load_le(input + 16 * i));
188
189 a ^= m;
190 a = gcm_multiply(H1, a);
191 }
192
193 a = reverse_vector(a);
194 a.store_le(x);
195}
196
197} // namespace Botan
static SIMD_4x32 load_le(const void *in) noexcept
Definition simd_32.h:159
#define BOTAN_FUNC_ISA(isa)
Definition compiler.h:42
#define BOTAN_FORCE_INLINE
Definition compiler.h:71
#define BOTAN_FUNC_ISA_INLINE(isa)
Definition compiler.h:48
FE_25519 T
Definition ge.cpp:34
const SIMD_8x32 & b