Botan  2.4.0
Crypto and TLS for C++11
clmul.cpp
Go to the documentation of this file.
1 /*
2 * CLMUL hook
3 * (C) 2013,2017 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/internal/clmul.h>
9 #include <immintrin.h>
10 #include <wmmintrin.h>
11 
12 namespace Botan {
13 
14 namespace {
15 
16 BOTAN_FUNC_ISA("sse2")
17 inline __m128i gcm_reduce(const __m128i& B0, const __m128i& B1)
18  {
19  __m128i T0, T1, T2, T3;
20 
21  T0 = _mm_srli_epi32(B1, 31);
22  T1 = _mm_slli_epi32(B1, 1);
23  T2 = _mm_srli_epi32(B0, 31);
24  T3 = _mm_slli_epi32(B0, 1);
25 
26  T3 = _mm_or_si128(T3, _mm_srli_si128(T0, 12));
27  T3 = _mm_or_si128(T3, _mm_slli_si128(T2, 4));
28  T1 = _mm_or_si128(T1, _mm_slli_si128(T0, 4));
29 
30  T0 = _mm_xor_si128(_mm_slli_epi32(T1, 31), _mm_slli_epi32(T1, 30));
31  T0 = _mm_xor_si128(T0, _mm_slli_epi32(T1, 25));
32 
33  T1 = _mm_xor_si128(T1, _mm_slli_si128(T0, 12));
34 
35  T0 = _mm_xor_si128(T3, _mm_srli_si128(T0, 4));
36  T0 = _mm_xor_si128(T0, T1);
37  T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 7));
38  T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 1));
39  T0 = _mm_xor_si128(T0, _mm_srli_epi32(T1, 2));
40  return T0;
41  }
42 
43 BOTAN_FUNC_ISA("pclmul,sse2")
44 inline __m128i gcm_multiply(const __m128i& H, const __m128i& x)
45  {
46  __m128i T0, T1, T2, T3;
47 
48  T0 = _mm_clmulepi64_si128(x, H, 0x11);
49  T1 = _mm_clmulepi64_si128(x, H, 0x10);
50  T2 = _mm_clmulepi64_si128(x, H, 0x01);
51  T3 = _mm_clmulepi64_si128(x, H, 0x00);
52 
53  T1 = _mm_xor_si128(T1, T2);
54  T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8));
55  T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8));
56 
57  return gcm_reduce(T0, T3);
58  }
59 
60 BOTAN_FUNC_ISA("pclmul,sse2")
61 inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2, const __m128i& H3, const __m128i& H4,
62  const __m128i& X1, const __m128i& X2, const __m128i& X3, const __m128i& X4)
63  {
64  /*
65  * Mutiply with delayed reduction, algorithm by Krzysztof Jankowski
66  * and Pierre Laurent of Intel
67  */
68 
69  const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
70  const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
71  const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
72  const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
73 
74  const __m128i lo = _mm_xor_si128(
75  _mm_xor_si128(H1_X1_lo, H2_X2_lo),
76  _mm_xor_si128(H3_X3_lo, H4_X4_lo));
77 
78  const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
79  const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
80  const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
81  const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
82 
83  const __m128i hi = _mm_xor_si128(
84  _mm_xor_si128(H1_X1_hi, H2_X2_hi),
85  _mm_xor_si128(H3_X3_hi, H4_X4_hi));
86 
87  __m128i T0 = _mm_xor_si128(lo, hi);
88  __m128i T1, T2, T3, T4;
89 
90  T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1);
91  T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1);
92  T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2);
93  T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2);
94  T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
95  T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
96 
97  T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3);
98  T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3);
99  T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4);
100  T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4);
101  T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
102  T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));
103 
104  T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi);
105  T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo);
106 
107  return gcm_reduce(T1, T2);
108  }
109 
110 }
111 
112 BOTAN_FUNC_ISA("ssse3")
113 void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
114  {
115  const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
116 
117  const __m128i H = _mm_shuffle_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(H_bytes)), BSWAP_MASK);
118  const __m128i H2 = gcm_multiply(H, H);
119  const __m128i H3 = gcm_multiply(H, H2);
120  const __m128i H4 = gcm_multiply(H, H3);
121 
122  __m128i* H_pow_mm = reinterpret_cast<__m128i*>(H_pow);
123 
124  _mm_storeu_si128(H_pow_mm+0, H);
125  _mm_storeu_si128(H_pow_mm+1, H2);
126  _mm_storeu_si128(H_pow_mm+2, H3);
127  _mm_storeu_si128(H_pow_mm+3, H4);
128  }
129 
130 BOTAN_FUNC_ISA("ssse3")
131 void gcm_multiply_clmul(uint8_t x[16],
132  const uint64_t H_pow[8],
133  const uint8_t input_bytes[], size_t blocks)
134  {
135  /*
136  * Algorithms 1 and 5 from Intel's CLMUL guide
137  */
138  const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
139 
140  const __m128i* input = reinterpret_cast<const __m128i*>(input_bytes);
141 
142  const __m128i* H_pow_mm = reinterpret_cast<const __m128i*>(H_pow);
143 
144  const __m128i H = _mm_loadu_si128(H_pow_mm);
145 
146  __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
147  a = _mm_shuffle_epi8(a, BSWAP_MASK);
148 
149  if(blocks >= 4)
150  {
151  const __m128i H2 = _mm_loadu_si128(H_pow_mm + 1);
152  const __m128i H3 = _mm_loadu_si128(H_pow_mm + 2);
153  const __m128i H4 = _mm_loadu_si128(H_pow_mm + 3);
154 
155  while(blocks >= 4)
156  {
157  const __m128i m0 = _mm_shuffle_epi8(_mm_loadu_si128(input + 0), BSWAP_MASK);
158  const __m128i m1 = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), BSWAP_MASK);
159  const __m128i m2 = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), BSWAP_MASK);
160  const __m128i m3 = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), BSWAP_MASK);
161 
162  a = _mm_xor_si128(a, m0);
163  a = gcm_multiply_x4(H, H2, H3, H4, m3, m2, m1, a);
164 
165  input += 4;
166  blocks -= 4;
167  }
168  }
169 
170  for(size_t i = 0; i != blocks; ++i)
171  {
172  const __m128i m = _mm_shuffle_epi8(_mm_loadu_si128(input + i), BSWAP_MASK);
173 
174  a = _mm_xor_si128(a, m);
175  a = gcm_multiply(H, a);
176  }
177 
178  a = _mm_shuffle_epi8(a, BSWAP_MASK);
179  _mm_storeu_si128(reinterpret_cast<__m128i*>(x), a);
180  }
181 
182 }
void gcm_clmul_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4 *2])
Definition: clmul.cpp:113
void gcm_multiply_clmul(uint8_t x[16], const uint64_t H_pow[8], const uint8_t input_bytes[], size_t blocks)
Definition: clmul.cpp:131
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
Definition: alg_id.cpp:13