Botan  2.7.0
Crypto and TLS for C++11
idea_sse2.cpp
Go to the documentation of this file.
1 /*
2 * IDEA in SSE2
3 * (C) 2009 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #include <botan/idea.h>
9 #include <botan/internal/ct_utils.h>
10 #include <emmintrin.h>
11 
12 namespace Botan {
13 
14 namespace {
15 
16 BOTAN_FUNC_ISA("sse2")
17 inline __m128i mul(__m128i X, uint16_t K_16)
18  {
19  const __m128i zeros = _mm_set1_epi16(0);
20  const __m128i ones = _mm_set1_epi16(1);
21 
22  const __m128i K = _mm_set1_epi16(K_16);
23 
24  const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
25  const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
26 
27  const __m128i mul_lo = _mm_mullo_epi16(X, K);
28  const __m128i mul_hi = _mm_mulhi_epu16(X, K);
29 
30  __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
31 
32  // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
33  const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
34  const __m128i cmp = _mm_min_epu8(
35  _mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
36 
37  T = _mm_add_epi16(T, cmp);
38 
39  /* Selection: if X[i] is zero then assign 1-K
40  if K is zero then assign 1-X[i]
41 
42  Could if() off value of K_16 for the second, but this gives a
43  constant time implementation which is a nice bonus.
44  */
45 
46  T = _mm_or_si128(
47  _mm_andnot_si128(X_is_zero, T),
48  _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
49 
50  T = _mm_or_si128(
51  _mm_andnot_si128(K_is_zero, T),
52  _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
53 
54  return T;
55  }
56 
57 /*
58 * 4x8 matrix transpose
59 *
60 * FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
61 * transpose_out doesn't need it. Something with the shuffle? Removing
62 * that extra unpack could easily save 3-4 cycles per block, and would
63 * also help a lot with register pressure on 32-bit x86
64 */
65 BOTAN_FUNC_ISA("sse2")
66 void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
67  {
68  __m128i T0 = _mm_unpackhi_epi32(B0, B1);
69  __m128i T1 = _mm_unpacklo_epi32(B0, B1);
70  __m128i T2 = _mm_unpackhi_epi32(B2, B3);
71  __m128i T3 = _mm_unpacklo_epi32(B2, B3);
72 
73  __m128i T4 = _mm_unpacklo_epi32(T0, T1);
74  __m128i T5 = _mm_unpackhi_epi32(T0, T1);
75  __m128i T6 = _mm_unpacklo_epi32(T2, T3);
76  __m128i T7 = _mm_unpackhi_epi32(T2, T3);
77 
78  T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
79  T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
80  T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
81  T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
82 
83  T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
84  T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
85  T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
86  T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
87 
88  T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
89  T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
90  T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
91  T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
92 
93  B0 = _mm_unpacklo_epi64(T0, T2);
94  B1 = _mm_unpackhi_epi64(T0, T2);
95  B2 = _mm_unpacklo_epi64(T1, T3);
96  B3 = _mm_unpackhi_epi64(T1, T3);
97  }
98 
99 /*
100 * 4x8 matrix transpose (reverse)
101 */
102 BOTAN_FUNC_ISA("sse2")
103 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
104  {
105  __m128i T0 = _mm_unpacklo_epi64(B0, B1);
106  __m128i T1 = _mm_unpacklo_epi64(B2, B3);
107  __m128i T2 = _mm_unpackhi_epi64(B0, B1);
108  __m128i T3 = _mm_unpackhi_epi64(B2, B3);
109 
110  T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
111  T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
112  T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
113  T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
114 
115  T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
116  T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
117  T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
118  T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
119 
120  T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
121  T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
122  T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
123  T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
124 
125  B0 = _mm_unpacklo_epi32(T0, T1);
126  B1 = _mm_unpackhi_epi32(T0, T1);
127  B2 = _mm_unpacklo_epi32(T2, T3);
128  B3 = _mm_unpackhi_epi32(T2, T3);
129  }
130 
131 }
132 
133 /*
134 * 8 wide IDEA encryption/decryption in SSE2
135 */
136 BOTAN_FUNC_ISA("sse2")
137 void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) const
138  {
139  CT::poison(in, 64);
140  CT::poison(out, 64);
141  CT::poison(EK, 52);
142 
143  const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
144 
145  __m128i B0 = _mm_loadu_si128(in_mm + 0);
146  __m128i B1 = _mm_loadu_si128(in_mm + 1);
147  __m128i B2 = _mm_loadu_si128(in_mm + 2);
148  __m128i B3 = _mm_loadu_si128(in_mm + 3);
149 
150  transpose_in(B0, B1, B2, B3);
151 
152  // byte swap
153  B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
154  B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
155  B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
156  B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
157 
158  for(size_t i = 0; i != 8; ++i)
159  {
160  B0 = mul(B0, EK[6*i+0]);
161  B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6*i+1]));
162  B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6*i+2]));
163  B3 = mul(B3, EK[6*i+3]);
164 
165  __m128i T0 = B2;
166  B2 = _mm_xor_si128(B2, B0);
167  B2 = mul(B2, EK[6*i+4]);
168 
169  __m128i T1 = B1;
170 
171  B1 = _mm_xor_si128(B1, B3);
172  B1 = _mm_add_epi16(B1, B2);
173  B1 = mul(B1, EK[6*i+5]);
174 
175  B2 = _mm_add_epi16(B2, B1);
176 
177  B0 = _mm_xor_si128(B0, B1);
178  B1 = _mm_xor_si128(B1, T0);
179  B3 = _mm_xor_si128(B3, B2);
180  B2 = _mm_xor_si128(B2, T1);
181  }
182 
183  B0 = mul(B0, EK[48]);
184  B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
185  B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
186  B3 = mul(B3, EK[51]);
187 
188  // byte swap
189  B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
190  B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
191  B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
192  B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
193 
194  transpose_out(B0, B2, B1, B3);
195 
196  __m128i* out_mm = reinterpret_cast<__m128i*>(out);
197 
198  _mm_storeu_si128(out_mm + 0, B0);
199  _mm_storeu_si128(out_mm + 1, B2);
200  _mm_storeu_si128(out_mm + 2, B1);
201  _mm_storeu_si128(out_mm + 3, B3);
202 
203  CT::unpoison(in, 64);
204  CT::unpoison(out, 64);
205  CT::unpoison(EK, 52);
206  }
207 
208 }
fe X
Definition: ge.cpp:27
void poison(const T *p, size_t n)
Definition: ct_utils.h:46
#define BOTAN_FUNC_ISA(isa)
Definition: compiler.h:75
Definition: alg_id.cpp:13
fe T
Definition: ge.cpp:37
void unpoison(const T *p, size_t n)
Definition: ct_utils.h:57