Botan  2.4.0
Crypto and TLS for C++11
threefish_avx2.cpp
Go to the documentation of this file.
1 /*
2 * Threefish-512 using AVX2
3 * (C) 2013,2016 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/threefish.h>
9 #include <immintrin.h>
10 
11 namespace Botan {
12 
13 namespace {
14 
15 BOTAN_FUNC_ISA("avx2")
16 inline void interleave_epi64(__m256i& X0, __m256i& X1)
17  {
18  // interleave X0 and X1 qwords
19  // (X0,X1,X2,X3),(X4,X5,X6,X7) -> (X0,X2,X4,X6),(X1,X3,X5,X7)
20 
21  const __m256i T0 = _mm256_unpacklo_epi64(X0, X1);
22  const __m256i T1 = _mm256_unpackhi_epi64(X0, X1);
23 
24  X0 = _mm256_permute4x64_epi64(T0, _MM_SHUFFLE(3,1,2,0));
25  X1 = _mm256_permute4x64_epi64(T1, _MM_SHUFFLE(3,1,2,0));
26  }
27 
28 BOTAN_FUNC_ISA("avx2")
29 inline void deinterleave_epi64(__m256i& X0, __m256i& X1)
30  {
31  const __m256i T0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(3,1,2,0));
32  const __m256i T1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(3,1,2,0));
33 
34  X0 = _mm256_unpacklo_epi64(T0, T1);
35  X1 = _mm256_unpackhi_epi64(T0, T1);
36  }
37 
38 BOTAN_FUNC_ISA("avx2")
39 inline void rotate_keys(__m256i& R0, __m256i& R1, __m256i R2)
40  {
41  /*
42  Behold. The key schedule progresses like so. The values
43  loop back to the originals after the rounds are complete
44  so we don't need to reload for starting the next block.
45 
46  R0 R1 R2
47  K1,K2,K3 (7,5,3,1),(8,6,4,2),(0,7,5,3)
48  K3,K4,K5 (0,7,5,3),(1,8,6,4),(2,0,7,5)
49  K5,K6,K7 (2,0,7,5),(3,1,8,6),(4,2,0,7)
50 
51  K7,K8,K0 (4,2,0,7),(5,3,1,8),(6,4,2,0)
52  K0,K1,K2 (6,4,2,0),(7,5,3,1),(8,6,4,2)
53  K2,K3,K4 (8,6,4,2),(0,7,5,3),(1,8,6,4)
54 
55  K4,K5,K6 (1,8,6,4),(2,0,7,5),(3,1,8,6)
56  K6,K7,K8 (3,1,8,6),(4,2,0,7),(5,3,1,8)
57  K8,K0,K1 (5,3,1,8),(6,4,2,0),(7,5,3,1)
58 
59  To compute the values for the next round:
60  X0 is X2 from the last round
61  X1 becomes (X0[4],X1[1:3])
62  X2 becomes (X1[4],X2[1:3])
63 
64  Uses 3 permutes and 2 blends, is there a faster way?
65  */
66  __m256i T0 = _mm256_permute4x64_epi64(R0, _MM_SHUFFLE(0,0,0,0));
67  __m256i T1 = _mm256_permute4x64_epi64(R1, _MM_SHUFFLE(0,3,2,1));
68  __m256i T2 = _mm256_permute4x64_epi64(R2, _MM_SHUFFLE(0,3,2,1));
69 
70  R0 = _mm256_blend_epi32(T1, T0, 0xC0);
71  R1 = _mm256_blend_epi32(T2, T1, 0xC0);
72  }
73 
74 
75 }
76 
77 BOTAN_FUNC_ISA("avx2")
78 void Threefish_512::avx2_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
79  {
80  const uint64_t* K = &get_K()[0];
81  const uint64_t* T_64 = &get_T()[0];
82 
83  const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
84  const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
85  const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
86  const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
87  const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
88  const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
89  const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
90  const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
91 
92 #define THREEFISH_ROUND(X0, X1, SHL) \
93  do { \
94  const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
95  X0 = _mm256_add_epi64(X0, X1); \
96  X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
97  X1 = _mm256_xor_si256(X1, X0); \
98  X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
99  X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
100  } while(0)
101 
102 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \
103  do { \
104  const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \
105  X0 = _mm256_add_epi64(X0, X1); \
106  X2 = _mm256_add_epi64(X2, X3); \
107  X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
108  X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
109  X1 = _mm256_xor_si256(X1, X0); \
110  X3 = _mm256_xor_si256(X3, X2); \
111  X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \
112  X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \
113  X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
114  X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
115  } while(0)
116 
117 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
118  do { \
119  const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
120  const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
121  X0 = _mm256_add_epi64(X0, K0); \
122  X1 = _mm256_add_epi64(X1, K1); \
123  X1 = _mm256_add_epi64(X1, _mm256_set_epi64x(R,0,0,0)); \
124  X0 = _mm256_add_epi64(X0, T0); \
125  X1 = _mm256_add_epi64(X1, T1); \
126  } while(0)
127 
128 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
129  do { \
130  const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
131  __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
132  X0 = _mm256_add_epi64(X0, K0); \
133  X2 = _mm256_add_epi64(X2, K0); \
134  X1 = _mm256_add_epi64(X1, K1); \
135  X3 = _mm256_add_epi64(X3, K1); \
136  T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
137  X0 = _mm256_add_epi64(X0, T0); \
138  X2 = _mm256_add_epi64(X2, T0); \
139  X1 = _mm256_add_epi64(X1, T1); \
140  X3 = _mm256_add_epi64(X3, T1); \
141  } while(0)
142 
143 #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2) \
144  do { \
145  rotate_keys(K1, K2, K0); \
146  THREEFISH_ROUND(X0, X1, ROTATE_1); \
147  THREEFISH_ROUND(X0, X1, ROTATE_2); \
148  THREEFISH_ROUND(X0, X1, ROTATE_3); \
149  THREEFISH_ROUND(X0, X1, ROTATE_4); \
150  THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0, T1); \
151  \
152  THREEFISH_ROUND(X0, X1, ROTATE_5); \
153  THREEFISH_ROUND(X0, X1, ROTATE_6); \
154  THREEFISH_ROUND(X0, X1, ROTATE_7); \
155  THREEFISH_ROUND(X0, X1, ROTATE_8); \
156  THREEFISH_INJECT_KEY(X0, X1, R+1, K1, K2, T2, T0); \
157  } while(0)
158 
159 #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2) \
160  do { \
161  rotate_keys(K1, K2, K0); \
162  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
163  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
164  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
165  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
166  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0, T1); \
167  \
168  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
169  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
170  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
171  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
172  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K1, K2, T2, T0); \
173  } while(0)
174 
175  __m256i K0 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
176  __m256i K1 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
177  __m256i K2 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
178 
179  const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
180  __m256i* out_mm = reinterpret_cast<__m256i*>(out);
181 
182  while(blocks >= 2)
183  {
184  __m256i X0 = _mm256_loadu_si256(in_mm++);
185  __m256i X1 = _mm256_loadu_si256(in_mm++);
186  __m256i X2 = _mm256_loadu_si256(in_mm++);
187  __m256i X3 = _mm256_loadu_si256(in_mm++);
188 
189  const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
190 
191  interleave_epi64(X0, X1);
192  interleave_epi64(X2, X3);
193 
194  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K1, K2, 2, 3);
195 
196  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 1, K2,K0,K1, 1, 2, 3);
197  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 3, K1,K2,K0, 2, 3, 1);
198  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 5, K0,K1,K2, 3, 1, 2);
199  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 7, K2,K0,K1, 1, 2, 3);
200  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 9, K1,K2,K0, 2, 3, 1);
201  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 11, K0,K1,K2, 3, 1, 2);
202  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 13, K2,K0,K1, 1, 2, 3);
203  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 15, K1,K2,K0, 2, 3, 1);
204  THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, 17, K0,K1,K2, 3, 1, 2);
205 
206  deinterleave_epi64(X0, X1);
207  deinterleave_epi64(X2, X3);
208 
209  _mm256_storeu_si256(out_mm++, X0);
210  _mm256_storeu_si256(out_mm++, X1);
211  _mm256_storeu_si256(out_mm++, X2);
212  _mm256_storeu_si256(out_mm++, X3);
213 
214  blocks -= 2;
215  }
216 
217  for(size_t i = 0; i != blocks; ++i)
218  {
219  __m256i X0 = _mm256_loadu_si256(in_mm++);
220  __m256i X1 = _mm256_loadu_si256(in_mm++);
221 
222  const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
223 
224  interleave_epi64(X0, X1);
225 
226  THREEFISH_INJECT_KEY(X0, X1, 0, K1, K2, 2, 3);
227 
228  THREEFISH_ENC_8_ROUNDS(X0, X1, 1, K2,K0,K1, 1, 2, 3);
229  THREEFISH_ENC_8_ROUNDS(X0, X1, 3, K1,K2,K0, 2, 3, 1);
230  THREEFISH_ENC_8_ROUNDS(X0, X1, 5, K0,K1,K2, 3, 1, 2);
231  THREEFISH_ENC_8_ROUNDS(X0, X1, 7, K2,K0,K1, 1, 2, 3);
232  THREEFISH_ENC_8_ROUNDS(X0, X1, 9, K1,K2,K0, 2, 3, 1);
233  THREEFISH_ENC_8_ROUNDS(X0, X1, 11, K0,K1,K2, 3, 1, 2);
234  THREEFISH_ENC_8_ROUNDS(X0, X1, 13, K2,K0,K1, 1, 2, 3);
235  THREEFISH_ENC_8_ROUNDS(X0, X1, 15, K1,K2,K0, 2, 3, 1);
236  THREEFISH_ENC_8_ROUNDS(X0, X1, 17, K0,K1,K2, 3, 1, 2);
237 
238  deinterleave_epi64(X0, X1);
239 
240  _mm256_storeu_si256(out_mm++, X0);
241  _mm256_storeu_si256(out_mm++, X1);
242  }
243 
244 #undef THREEFISH_ENC_8_ROUNDS
245 #undef THREEFISH_ROUND
246 #undef THREEFISH_INJECT_KEY
247 #undef THREEFISH_DEC_2_8_ROUNDS
248 #undef THREEFISH_ROUND_2
249 #undef THREEFISH_INJECT_KEY_2
250  }
251 
252 BOTAN_FUNC_ISA("avx2")
253 void Threefish_512::avx2_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
254  {
255  const uint64_t* K = &get_K()[0];
256  const uint64_t* T_64 = &get_T()[0];
257 
258  const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
259  const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
260  const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
261  const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
262  const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
263  const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
264  const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
265  const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);
266 
267 #define THREEFISH_ROUND(X0, X1, SHR) \
268  do { \
269  const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
270  X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
271  X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
272  X1 = _mm256_xor_si256(X1, X0); \
273  X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
274  X0 = _mm256_sub_epi64(X0, X1); \
275  } while(0)
276 
277 #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHR) \
278  do { \
279  const __m256i SHL = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHR); \
280  X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(2, 1, 0, 3)); \
281  X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(2, 1, 0, 3)); \
282  X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \
283  X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \
284  X1 = _mm256_xor_si256(X1, X0); \
285  X3 = _mm256_xor_si256(X3, X2); \
286  X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
287  X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
288  X0 = _mm256_sub_epi64(X0, X1); \
289  X2 = _mm256_sub_epi64(X2, X3); \
290  } while(0)
291 
292 #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \
293  do { \
294  const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
295  const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
296  X0 = _mm256_sub_epi64(X0, K0); \
297  X1 = _mm256_sub_epi64(X1, K1); \
298  X1 = _mm256_sub_epi64(X1, _mm256_set_epi64x(R, 0, 0, 0)); \
299  X0 = _mm256_sub_epi64(X0, T0); \
300  X1 = _mm256_sub_epi64(X1, T1); \
301  } while(0)
302 
303 #define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \
304  do { \
305  THREEFISH_INJECT_KEY(X0, X1, R+1, K2, K3, T2, T0); \
306  THREEFISH_ROUND(X0, X1, ROTATE_8); \
307  THREEFISH_ROUND(X0, X1, ROTATE_7); \
308  THREEFISH_ROUND(X0, X1, ROTATE_6); \
309  THREEFISH_ROUND(X0, X1, ROTATE_5); \
310  \
311  THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \
312  THREEFISH_ROUND(X0, X1, ROTATE_4); \
313  THREEFISH_ROUND(X0, X1, ROTATE_3); \
314  THREEFISH_ROUND(X0, X1, ROTATE_2); \
315  THREEFISH_ROUND(X0, X1, ROTATE_1); \
316  } while(0)
317 
318 #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \
319  do { \
320  const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
321  __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
322  X0 = _mm256_sub_epi64(X0, K0); \
323  X2 = _mm256_sub_epi64(X2, K0); \
324  X1 = _mm256_sub_epi64(X1, K1); \
325  X3 = _mm256_sub_epi64(X3, K1); \
326  T1 = _mm256_add_epi64(T1, _mm256_set_epi64x(R,0,0,0)); \
327  X0 = _mm256_sub_epi64(X0, T0); \
328  X2 = _mm256_sub_epi64(X2, T0); \
329  X1 = _mm256_sub_epi64(X1, T1); \
330  X3 = _mm256_sub_epi64(X3, T1); \
331  } while(0)
332 
333 #define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
334  do { \
335  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R+1, K2, K3, T2, T0); \
336  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \
337  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \
338  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \
339  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \
340  \
341  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \
342  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \
343  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \
344  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \
345  THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \
346  } while(0)
347 
348  /*
349  v1.0 key schedule: 9 ymm registers (only need 2 or 3)
350  (0,1,2,3),(4,5,6,7) [8]
351  then mutating with vpermq
352  */
353  const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
354  const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
355  const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
356  const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
357  const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
358  const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
359  const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
360  const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
361  const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);
362 
363  const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
364  __m256i* out_mm = reinterpret_cast<__m256i*>(out);
365 
366  while(blocks >= 2)
367  {
368  __m256i X0 = _mm256_loadu_si256(in_mm++);
369  __m256i X1 = _mm256_loadu_si256(in_mm++);
370  __m256i X2 = _mm256_loadu_si256(in_mm++);
371  __m256i X3 = _mm256_loadu_si256(in_mm++);
372 
373  const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
374 
375  interleave_epi64(X0, X1);
376  interleave_epi64(X2, X3);
377 
378  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 17, K8,K0,K1, 3, 1, 2);
379  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 15, K6,K7,K8, 2, 3, 1);
380  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 13, K4,K5,K6, 1, 2, 3);
381  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 11, K2,K3,K4, 3, 1, 2);
382  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 9, K0,K1,K2, 2, 3, 1);
383  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 7, K7,K8,K0, 1, 2, 3);
384  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 5, K5,K6,K7, 3, 1, 2);
385  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 3, K3,K4,K5, 2, 3, 1);
386  THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, 1, K1,K2,K3, 1, 2, 3);
387 
388  THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, 0, K0, K1, 2, 3);
389 
390  deinterleave_epi64(X0, X1);
391  deinterleave_epi64(X2, X3);
392 
393  _mm256_storeu_si256(out_mm++, X0);
394  _mm256_storeu_si256(out_mm++, X1);
395  _mm256_storeu_si256(out_mm++, X2);
396  _mm256_storeu_si256(out_mm++, X3);
397 
398  blocks -= 2;
399  }
400 
401  for(size_t i = 0; i != blocks; ++i)
402  {
403  __m256i X0 = _mm256_loadu_si256(in_mm++);
404  __m256i X1 = _mm256_loadu_si256(in_mm++);
405 
406  const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
407 
408  interleave_epi64(X0, X1);
409 
410  THREEFISH_DEC_8_ROUNDS(X0, X1, 17, K8,K0,K1, 3, 1, 2);
411  THREEFISH_DEC_8_ROUNDS(X0, X1, 15, K6,K7,K8, 2, 3, 1);
412  THREEFISH_DEC_8_ROUNDS(X0, X1, 13, K4,K5,K6, 1, 2, 3);
413  THREEFISH_DEC_8_ROUNDS(X0, X1, 11, K2,K3,K4, 3, 1, 2);
414  THREEFISH_DEC_8_ROUNDS(X0, X1, 9, K0,K1,K2, 2, 3, 1);
415  THREEFISH_DEC_8_ROUNDS(X0, X1, 7, K7,K8,K0, 1, 2, 3);
416  THREEFISH_DEC_8_ROUNDS(X0, X1, 5, K5,K6,K7, 3, 1, 2);
417  THREEFISH_DEC_8_ROUNDS(X0, X1, 3, K3,K4,K5, 2, 3, 1);
418  THREEFISH_DEC_8_ROUNDS(X0, X1, 1, K1,K2,K3, 1, 2, 3);
419 
420  THREEFISH_INJECT_KEY(X0, X1, 0, K0, K1, 2, 3);
421 
422  deinterleave_epi64(X0, X1);
423 
424  _mm256_storeu_si256(out_mm++, X0);
425  _mm256_storeu_si256(out_mm++, X1);
426  }
427 
428 #undef THREEFISH_DEC_8_ROUNDS
429 #undef THREEFISH_ROUND
430 #undef THREEFISH_INJECT_KEY
431 #undef THREEFISH_DEC_2_8_ROUNDS
432 #undef THREEFISH_ROUND_2
433 #undef THREEFISH_INJECT_KEY_2
434  }
435 
436 }
#define THREEFISH_DEC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2)
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)
#define THREEFISH_DEC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0, K1, K2, T0, T1, T2)
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0, K1, K2, T0, T1, T2)
Definition: alg_id.cpp:13
fe T
Definition: ge.cpp:37
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)