Botan  2.8.0
Crypto and TLS for C++11
chacha_sse2.cpp
Go to the documentation of this file.
1 /*
2 * SSE2 ChaCha
3 * (C) 2016 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/chacha.h>
9 #include <emmintrin.h>
10 
11 namespace Botan {
12 
13 //static
14 BOTAN_FUNC_ISA("sse2")
15 void ChaCha::chacha_sse2_x4(uint8_t output[64*4], uint32_t input[16], size_t rounds)
16  {
17  BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds");
18 
19  const __m128i* input_mm = reinterpret_cast<const __m128i*>(input);
20  __m128i* output_mm = reinterpret_cast<__m128i*>(output);
21 
22  __m128i input0 = _mm_loadu_si128(input_mm);
23  __m128i input1 = _mm_loadu_si128(input_mm + 1);
24  __m128i input2 = _mm_loadu_si128(input_mm + 2);
25  __m128i input3 = _mm_loadu_si128(input_mm + 3);
26 
27  // TODO: try transposing, which would avoid the permutations each round
28 
29 #define mm_rotl(r, n) \
30  _mm_or_si128(_mm_slli_epi32(r, n), _mm_srli_epi32(r, 32-n))
31 
32  __m128i r0_0 = input0;
33  __m128i r0_1 = input1;
34  __m128i r0_2 = input2;
35  __m128i r0_3 = input3;
36 
37  __m128i r1_0 = input0;
38  __m128i r1_1 = input1;
39  __m128i r1_2 = input2;
40  __m128i r1_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 1));
41 
42  __m128i r2_0 = input0;
43  __m128i r2_1 = input1;
44  __m128i r2_2 = input2;
45  __m128i r2_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 2));
46 
47  __m128i r3_0 = input0;
48  __m128i r3_1 = input1;
49  __m128i r3_2 = input2;
50  __m128i r3_3 = _mm_add_epi64(r0_3, _mm_set_epi32(0, 0, 0, 3));
51 
52  for(size_t r = 0; r != rounds / 2; ++r)
53  {
54  r0_0 = _mm_add_epi32(r0_0, r0_1);
55  r1_0 = _mm_add_epi32(r1_0, r1_1);
56  r2_0 = _mm_add_epi32(r2_0, r2_1);
57  r3_0 = _mm_add_epi32(r3_0, r3_1);
58 
59  r0_3 = _mm_xor_si128(r0_3, r0_0);
60  r1_3 = _mm_xor_si128(r1_3, r1_0);
61  r2_3 = _mm_xor_si128(r2_3, r2_0);
62  r3_3 = _mm_xor_si128(r3_3, r3_0);
63 
64  r0_3 = mm_rotl(r0_3, 16);
65  r1_3 = mm_rotl(r1_3, 16);
66  r2_3 = mm_rotl(r2_3, 16);
67  r3_3 = mm_rotl(r3_3, 16);
68 
69  r0_2 = _mm_add_epi32(r0_2, r0_3);
70  r1_2 = _mm_add_epi32(r1_2, r1_3);
71  r2_2 = _mm_add_epi32(r2_2, r2_3);
72  r3_2 = _mm_add_epi32(r3_2, r3_3);
73 
74  r0_1 = _mm_xor_si128(r0_1, r0_2);
75  r1_1 = _mm_xor_si128(r1_1, r1_2);
76  r2_1 = _mm_xor_si128(r2_1, r2_2);
77  r3_1 = _mm_xor_si128(r3_1, r3_2);
78 
79  r0_1 = mm_rotl(r0_1, 12);
80  r1_1 = mm_rotl(r1_1, 12);
81  r2_1 = mm_rotl(r2_1, 12);
82  r3_1 = mm_rotl(r3_1, 12);
83 
84  r0_0 = _mm_add_epi32(r0_0, r0_1);
85  r1_0 = _mm_add_epi32(r1_0, r1_1);
86  r2_0 = _mm_add_epi32(r2_0, r2_1);
87  r3_0 = _mm_add_epi32(r3_0, r3_1);
88 
89  r0_3 = _mm_xor_si128(r0_3, r0_0);
90  r1_3 = _mm_xor_si128(r1_3, r1_0);
91  r2_3 = _mm_xor_si128(r2_3, r2_0);
92  r3_3 = _mm_xor_si128(r3_3, r3_0);
93 
94  r0_3 = mm_rotl(r0_3, 8);
95  r1_3 = mm_rotl(r1_3, 8);
96  r2_3 = mm_rotl(r2_3, 8);
97  r3_3 = mm_rotl(r3_3, 8);
98 
99  r0_2 = _mm_add_epi32(r0_2, r0_3);
100  r1_2 = _mm_add_epi32(r1_2, r1_3);
101  r2_2 = _mm_add_epi32(r2_2, r2_3);
102  r3_2 = _mm_add_epi32(r3_2, r3_3);
103 
104  r0_1 = _mm_xor_si128(r0_1, r0_2);
105  r1_1 = _mm_xor_si128(r1_1, r1_2);
106  r2_1 = _mm_xor_si128(r2_1, r2_2);
107  r3_1 = _mm_xor_si128(r3_1, r3_2);
108 
109  r0_1 = mm_rotl(r0_1, 7);
110  r1_1 = mm_rotl(r1_1, 7);
111  r2_1 = mm_rotl(r2_1, 7);
112  r3_1 = mm_rotl(r3_1, 7);
113 
114  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(0, 3, 2, 1));
115  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
116  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(2, 1, 0, 3));
117 
118  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(0, 3, 2, 1));
119  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
120  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(2, 1, 0, 3));
121 
122  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(0, 3, 2, 1));
123  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
124  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(2, 1, 0, 3));
125 
126  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(0, 3, 2, 1));
127  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
128  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(2, 1, 0, 3));
129 
130  r0_0 = _mm_add_epi32(r0_0, r0_1);
131  r1_0 = _mm_add_epi32(r1_0, r1_1);
132  r2_0 = _mm_add_epi32(r2_0, r2_1);
133  r3_0 = _mm_add_epi32(r3_0, r3_1);
134 
135  r0_3 = _mm_xor_si128(r0_3, r0_0);
136  r1_3 = _mm_xor_si128(r1_3, r1_0);
137  r2_3 = _mm_xor_si128(r2_3, r2_0);
138  r3_3 = _mm_xor_si128(r3_3, r3_0);
139 
140  r0_3 = mm_rotl(r0_3, 16);
141  r1_3 = mm_rotl(r1_3, 16);
142  r2_3 = mm_rotl(r2_3, 16);
143  r3_3 = mm_rotl(r3_3, 16);
144 
145  r0_2 = _mm_add_epi32(r0_2, r0_3);
146  r1_2 = _mm_add_epi32(r1_2, r1_3);
147  r2_2 = _mm_add_epi32(r2_2, r2_3);
148  r3_2 = _mm_add_epi32(r3_2, r3_3);
149 
150  r0_1 = _mm_xor_si128(r0_1, r0_2);
151  r1_1 = _mm_xor_si128(r1_1, r1_2);
152  r2_1 = _mm_xor_si128(r2_1, r2_2);
153  r3_1 = _mm_xor_si128(r3_1, r3_2);
154 
155  r0_1 = mm_rotl(r0_1, 12);
156  r1_1 = mm_rotl(r1_1, 12);
157  r2_1 = mm_rotl(r2_1, 12);
158  r3_1 = mm_rotl(r3_1, 12);
159 
160  r0_0 = _mm_add_epi32(r0_0, r0_1);
161  r1_0 = _mm_add_epi32(r1_0, r1_1);
162  r2_0 = _mm_add_epi32(r2_0, r2_1);
163  r3_0 = _mm_add_epi32(r3_0, r3_1);
164 
165  r0_3 = _mm_xor_si128(r0_3, r0_0);
166  r1_3 = _mm_xor_si128(r1_3, r1_0);
167  r2_3 = _mm_xor_si128(r2_3, r2_0);
168  r3_3 = _mm_xor_si128(r3_3, r3_0);
169 
170  r0_3 = mm_rotl(r0_3, 8);
171  r1_3 = mm_rotl(r1_3, 8);
172  r2_3 = mm_rotl(r2_3, 8);
173  r3_3 = mm_rotl(r3_3, 8);
174 
175  r0_2 = _mm_add_epi32(r0_2, r0_3);
176  r1_2 = _mm_add_epi32(r1_2, r1_3);
177  r2_2 = _mm_add_epi32(r2_2, r2_3);
178  r3_2 = _mm_add_epi32(r3_2, r3_3);
179 
180  r0_1 = _mm_xor_si128(r0_1, r0_2);
181  r1_1 = _mm_xor_si128(r1_1, r1_2);
182  r2_1 = _mm_xor_si128(r2_1, r2_2);
183  r3_1 = _mm_xor_si128(r3_1, r3_2);
184 
185  r0_1 = mm_rotl(r0_1, 7);
186  r1_1 = mm_rotl(r1_1, 7);
187  r2_1 = mm_rotl(r2_1, 7);
188  r3_1 = mm_rotl(r3_1, 7);
189 
190  r0_1 = _mm_shuffle_epi32(r0_1, _MM_SHUFFLE(2, 1, 0, 3));
191  r0_2 = _mm_shuffle_epi32(r0_2, _MM_SHUFFLE(1, 0, 3, 2));
192  r0_3 = _mm_shuffle_epi32(r0_3, _MM_SHUFFLE(0, 3, 2, 1));
193 
194  r1_1 = _mm_shuffle_epi32(r1_1, _MM_SHUFFLE(2, 1, 0, 3));
195  r1_2 = _mm_shuffle_epi32(r1_2, _MM_SHUFFLE(1, 0, 3, 2));
196  r1_3 = _mm_shuffle_epi32(r1_3, _MM_SHUFFLE(0, 3, 2, 1));
197 
198  r2_1 = _mm_shuffle_epi32(r2_1, _MM_SHUFFLE(2, 1, 0, 3));
199  r2_2 = _mm_shuffle_epi32(r2_2, _MM_SHUFFLE(1, 0, 3, 2));
200  r2_3 = _mm_shuffle_epi32(r2_3, _MM_SHUFFLE(0, 3, 2, 1));
201 
202  r3_1 = _mm_shuffle_epi32(r3_1, _MM_SHUFFLE(2, 1, 0, 3));
203  r3_2 = _mm_shuffle_epi32(r3_2, _MM_SHUFFLE(1, 0, 3, 2));
204  r3_3 = _mm_shuffle_epi32(r3_3, _MM_SHUFFLE(0, 3, 2, 1));
205  }
206 
207  r0_0 = _mm_add_epi32(r0_0, input0);
208  r0_1 = _mm_add_epi32(r0_1, input1);
209  r0_2 = _mm_add_epi32(r0_2, input2);
210  r0_3 = _mm_add_epi32(r0_3, input3);
211 
212  r1_0 = _mm_add_epi32(r1_0, input0);
213  r1_1 = _mm_add_epi32(r1_1, input1);
214  r1_2 = _mm_add_epi32(r1_2, input2);
215  r1_3 = _mm_add_epi32(r1_3, input3);
216  r1_3 = _mm_add_epi64(r1_3, _mm_set_epi32(0, 0, 0, 1));
217 
218  r2_0 = _mm_add_epi32(r2_0, input0);
219  r2_1 = _mm_add_epi32(r2_1, input1);
220  r2_2 = _mm_add_epi32(r2_2, input2);
221  r2_3 = _mm_add_epi32(r2_3, input3);
222  r2_3 = _mm_add_epi64(r2_3, _mm_set_epi32(0, 0, 0, 2));
223 
224  r3_0 = _mm_add_epi32(r3_0, input0);
225  r3_1 = _mm_add_epi32(r3_1, input1);
226  r3_2 = _mm_add_epi32(r3_2, input2);
227  r3_3 = _mm_add_epi32(r3_3, input3);
228  r3_3 = _mm_add_epi64(r3_3, _mm_set_epi32(0, 0, 0, 3));
229 
230  _mm_storeu_si128(output_mm + 0, r0_0);
231  _mm_storeu_si128(output_mm + 1, r0_1);
232  _mm_storeu_si128(output_mm + 2, r0_2);
233  _mm_storeu_si128(output_mm + 3, r0_3);
234 
235  _mm_storeu_si128(output_mm + 4, r1_0);
236  _mm_storeu_si128(output_mm + 5, r1_1);
237  _mm_storeu_si128(output_mm + 6, r1_2);
238  _mm_storeu_si128(output_mm + 7, r1_3);
239 
240  _mm_storeu_si128(output_mm + 8, r2_0);
241  _mm_storeu_si128(output_mm + 9, r2_1);
242  _mm_storeu_si128(output_mm + 10, r2_2);
243  _mm_storeu_si128(output_mm + 11, r2_3);
244 
245  _mm_storeu_si128(output_mm + 12, r3_0);
246  _mm_storeu_si128(output_mm + 13, r3_1);
247  _mm_storeu_si128(output_mm + 14, r3_2);
248  _mm_storeu_si128(output_mm + 15, r3_3);
249 
250 #undef mm_rotl
251 
252  input[12] += 4;
253  if(input[12] < 4)
254  input[13]++;
255  }
256 
257 }
#define mm_rotl(r, n)
#define BOTAN_ASSERT(expr, assertion_made)
Definition: assert.h:55
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
Definition: alg_id.cpp:13