Botan 2.4.0
Crypto and TLS for C++11
pmull.cpp
Go to the documentation of this file.
1 /*
2 * Contributed by Jeffrey Walton
3 *
4 * Further changes
5 * (C) 2017 Jack Lloyd
6 *
7 * Botan is released under the Simplified BSD License (see license.txt)
8 */
9 
10 #include <botan/internal/pmull.h>
11 #include <arm_neon.h>
12 
13 namespace Botan {
14 
15 /*
16 This follows the same pattern as the clmul implementation.
17 
18 See also https://conradoplg.cryptoland.net/files/2010/12/gcm14.pdf
19 */
20 
21 namespace {
22 
/*
* Reduce a 256-bit carry-less product modulo the GCM polynomial
* x^128 + x^7 + x^2 + x + 1. B0 holds the half that is folded away,
* B1 the half the result accumulates into (the caller passes the
* high and low 128-bit product halves respectively).
*
* GHASH works on bit-reflected values, so the product is first shifted
* left by one bit, then the shift-by-31/30/25 and 1/2/7 folding sequence
* (the standard reduction from the carry-less GCM papers; see the
* reference at the top of this file) is applied.
*/
BOTAN_FUNC_ISA("+simd")
inline uint64x2_t gcm_reduce(uint32x4_t B0, uint32x4_t B1)
   {
   const uint32x4_t zero = vdupq_n_u32(0);

   uint32x4_t T0, T1, T2, T3, T4, T5;

   // Shift the 256-bit value <B0:B1> left by one bit. The shifts act on
   // each 32-bit lane independently, so the bits shifted out of each
   // lane (T4, T5) must be carried into the next lane via vextq below.
   T4 = vshrq_n_u32(B0, 31);
   T0 = vshlq_n_u32(B0, 1);
   T5 = vshrq_n_u32(B1, 31);
   T3 = vshlq_n_u32(B1, 1);

   T2 = vextq_u32(T4, zero, 3); // carry out of B0's top lane, moves into B1
   T5 = vextq_u32(zero, T5, 3); // inter-lane carries within B1
   T4 = vextq_u32(zero, T4, 3); // inter-lane carries within B0
   T0 = vorrq_u32(T0, T4);
   T3 = vorrq_u32(T3, T5);
   T3 = vorrq_u32(T3, T2);

   // First folding step: multiply the low word of T0 by the reflected
   // reduction constant via shifts left by 31, 30 and 25
   T4 = vshlq_n_u32(T0, 31);
   T5 = vshlq_n_u32(T0, 30);
   T2 = vshlq_n_u32(T0, 25);

   T4 = veorq_u32(T4, T5);
   T4 = veorq_u32(T4, T2);
   T5 = vextq_u32(T4, zero, 1);
   T3 = veorq_u32(T3, T5);
   T4 = vextq_u32(zero, T4, 1);
   T0 = veorq_u32(T0, T4);
   T3 = veorq_u32(T3, T0);

   // Second folding step: shifts right by 1, 2 and 7 complete the
   // reduction, leaving the 128-bit result in T3
   T4 = vshrq_n_u32(T0, 1);
   T1 = vshrq_n_u32(T0, 2);
   T2 = vshrq_n_u32(T0, 7);
   T3 = veorq_u32(T3, T1);
   T3 = veorq_u32(T3, T2);
   T3 = veorq_u32(T3, T4);

   return vreinterpretq_u64_u32(T3);
   }
63 
/*
* Multiply two 128-bit values in GF(2^128) and reduce modulo the GCM
* polynomial. Uses four 64x64 -> 128 bit PMULL multiplies (schoolbook)
* followed by gcm_reduce.
*/
BOTAN_FUNC_ISA("+crypto")
inline uint64x2_t gcm_multiply(uint64x2_t H, uint64x2_t x)
   {
   const uint32x4_t Z = vdupq_n_u32(0);

   // Lane 0 holds the "high" 64 bits because the input was byteswapped
   const uint64_t a0 = vgetq_lane_u64(x, 0);
   const uint64_t a1 = vgetq_lane_u64(x, 1);
   const uint64_t b0 = vgetq_lane_u64(H, 0);
   const uint64_t b1 = vgetq_lane_u64(H, 1);

   // Schoolbook: hi = a0*b0, lo = a1*b1, plus two middle products
   uint32x4_t hi  = (uint32x4_t)vmull_p64(a0, b0);
   uint32x4_t mid = (uint32x4_t)vmull_p64(a1, b0);
   uint32x4_t m2  = (uint32x4_t)vmull_p64(a0, b1);
   uint32x4_t lo  = (uint32x4_t)vmull_p64(a1, b1);

   // Fold the combined middle product into the high and low halves
   mid = veorq_u32(mid, m2);
   hi  = veorq_u32(hi, vextq_u32(Z, mid, 2));
   lo  = veorq_u32(lo, vextq_u32(mid, Z, 2));

   return gcm_reduce(hi, lo);
   }
85 
/*
* Compute H1*X1 + H2*X2 + H3*X3 + H4*X4 in GF(2^128) with a single final
* reduction, exploiting the linearity of GHASH.
*
* Each of the four products uses Karatsuba: the low and high partial
* products are computed directly, and the middle terms come from one
* multiply of (X_hi ^ X_lo) * (H_hi ^ H_lo) each, for 12 PMULL
* instructions total instead of 16.
*/
BOTAN_FUNC_ISA("+crypto")
inline uint64x2_t gcm_multiply_x4(uint64x2_t H1, uint64x2_t H2, uint64x2_t H3, uint64x2_t H4,
                                  uint64x2_t X1, uint64x2_t X2, uint64x2_t X3, uint64x2_t X4)
   {
   // Lane 0 is the "high" 64 bits (inputs were byteswapped on load)
   const uint64_t H1_hi = vgetq_lane_u64(H1, 0);
   const uint64_t H1_lo = vgetq_lane_u64(H1, 1);
   const uint64_t H2_hi = vgetq_lane_u64(H2, 0);
   const uint64_t H2_lo = vgetq_lane_u64(H2, 1);
   const uint64_t H3_hi = vgetq_lane_u64(H3, 0);
   const uint64_t H3_lo = vgetq_lane_u64(H3, 1);
   const uint64_t H4_hi = vgetq_lane_u64(H4, 0);
   const uint64_t H4_lo = vgetq_lane_u64(H4, 1);

   const uint64_t X1_hi = vgetq_lane_u64(X1, 0);
   const uint64_t X1_lo = vgetq_lane_u64(X1, 1);
   const uint64_t X2_hi = vgetq_lane_u64(X2, 0);
   const uint64_t X2_lo = vgetq_lane_u64(X2, 1);
   const uint64_t X3_hi = vgetq_lane_u64(X3, 0);
   const uint64_t X3_lo = vgetq_lane_u64(X3, 1);
   const uint64_t X4_hi = vgetq_lane_u64(X4, 0);
   const uint64_t X4_lo = vgetq_lane_u64(X4, 1);

   // XOR-accumulated low partial products of the four block/key pairs
   const uint32x4_t H1_X1_lo = (uint32x4_t)vmull_p64(X1_lo, H1_lo);
   const uint32x4_t H2_X2_lo = (uint32x4_t)vmull_p64(X2_lo, H2_lo);
   const uint32x4_t H3_X3_lo = (uint32x4_t)vmull_p64(X3_lo, H3_lo);
   const uint32x4_t H4_X4_lo = (uint32x4_t)vmull_p64(X4_lo, H4_lo);

   const uint32x4_t lo = veorq_u32(
      veorq_u32(H1_X1_lo, H2_X2_lo),
      veorq_u32(H3_X3_lo, H4_X4_lo));

   // XOR-accumulated high partial products
   const uint32x4_t H1_X1_hi = (uint32x4_t)vmull_p64(X1_hi, H1_hi);
   const uint32x4_t H2_X2_hi = (uint32x4_t)vmull_p64(X2_hi, H2_hi);
   const uint32x4_t H3_X3_hi = (uint32x4_t)vmull_p64(X3_hi, H3_hi);
   const uint32x4_t H4_X4_hi = (uint32x4_t)vmull_p64(X4_hi, H4_hi);

   const uint32x4_t hi = veorq_u32(
      veorq_u32(H1_X1_hi, H2_X2_hi),
      veorq_u32(H3_X3_hi, H4_X4_hi));

   // Karatsuba middle terms: T0 = lo ^ hi ^ sum of (X_hi^X_lo)*(H_hi^H_lo)
   uint32x4_t T0 = veorq_u32(lo, hi);

   T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X1_hi ^ X1_lo, H1_hi ^ H1_lo));
   T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X2_hi ^ X2_lo, H2_hi ^ H2_lo));
   T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X3_hi ^ X3_lo, H3_hi ^ H3_lo));
   T0 = veorq_u32(T0, (uint32x4_t)vmull_p64(X4_hi ^ X4_lo, H4_hi ^ H4_lo));

   // Split the middle terms across the 256-bit product and reduce
   const uint32x4_t zero = vdupq_n_u32(0);
   uint32x4_t B0 = veorq_u32(vextq_u32(zero, T0, 2), hi);
   uint32x4_t B1 = veorq_u32(vextq_u32(T0, zero, 2), lo);
   return gcm_reduce(B0, B1);
   }
138 
/*
* Reverse all 16 bytes of v, converting between the big-endian byte
* order GHASH uses and the little-endian lane order of the NEON
* registers.
*/
BOTAN_FUNC_ISA("+simd")
inline uint8x16_t bswap_vec(uint8x16_t v)
   {
   // vrev64q_u8 reverses the bytes within each 64-bit half; rotating the
   // result by 8 bytes with vextq_u8 then swaps the two halves, yielding a
   // full 16-byte reversal. Compared to loading a 16-entry index table and
   // using vqtbl1q_u8, this avoids the constant load and does not rely on
   // the AArch64-only TBL.16B form.
   const uint8x16_t rev_halves = vrev64q_u8(v);
   return vextq_u8(rev_halves, rev_halves, 8);
   }
146 
147 }
148 
/*
* Precompute the GHASH key powers H, H^2, H^3, H^4 from the raw key
* bytes and store them into H_pow (two uint64_t words per power).
*/
BOTAN_FUNC_ISA("+simd")
void gcm_pmull_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4*2])
   {
   const uint64x2_t H1 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(H_bytes)));

   uint64x2_t Hp = H1;
   vst1q_u64(H_pow, Hp);

   // Successively multiply by H to produce H^2, H^3, H^4
   for(size_t i = 1; i != 4; ++i)
      {
      Hp = gcm_multiply(H1, Hp);
      vst1q_u64(H_pow + 2*i, Hp);
      }
   }
162 
/*
* GHASH update: absorb `blocks` 16-byte blocks of input into the
* accumulator x (in place), using the key powers precomputed by
* gcm_pmull_precompute in H64.
*/
BOTAN_FUNC_ISA("+simd")
void gcm_multiply_pmull(uint8_t x[16],
                        const uint64_t H64[8],
                        const uint8_t input[], size_t blocks)
   {
   const uint64x2_t H = vld1q_u64(H64);
   uint64x2_t ghash = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(x)));

   if(blocks >= 4)
      {
      // 4-way path: use H^2..H^4 to fold four blocks per reduction
      const uint64x2_t H2 = vld1q_u64(H64 + 2);
      const uint64x2_t H3 = vld1q_u64(H64 + 4);
      const uint64x2_t H4 = vld1q_u64(H64 + 6);

      while(blocks >= 4)
         {
         const uint64x2_t b0 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input)));
         const uint64x2_t b1 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 16)));
         const uint64x2_t b2 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 32)));
         const uint64x2_t b3 = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input + 48)));

         // The oldest term (accumulator ^ b0) pairs with H^4, the
         // newest block b3 with H^1
         ghash = veorq_u64(ghash, b0);
         ghash = gcm_multiply_x4(H, H2, H3, H4, b3, b2, b1, ghash);

         input += 64;
         blocks -= 4;
         }
      }

   // Remaining blocks one at a time
   while(blocks > 0)
      {
      const uint64x2_t b = vreinterpretq_u64_u8(bswap_vec(vld1q_u8(input)));
      ghash = veorq_u64(ghash, b);
      ghash = gcm_multiply(H, ghash);
      input += 16;
      blocks--;
      }

   vst1q_u8(x, bswap_vec(vreinterpretq_u8_u64(ghash)));
   }
201 
202 }
void gcm_multiply_pmull(uint8_t x[16], const uint64_t H64[8], const uint8_t input[], size_t blocks)
Definition: pmull.cpp:164
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
Definition: alg_id.cpp:13
void gcm_pmull_precompute(const uint8_t H_bytes[16], uint64_t H_pow[4 *2])
Definition: pmull.cpp:150