Botan 3.8.1
Crypto and TLS for C++
idea_sse2.cpp
Go to the documentation of this file.
1/*
2* IDEA in SSE2
3* (C) 2009 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/idea.h>
9
10#include <botan/internal/ct_utils.h>
11#include <botan/internal/isa_extn.h>
12#include <emmintrin.h>
13
14namespace Botan {
15
16namespace {
17
18BOTAN_FN_ISA_SSE2 inline __m128i mul(__m128i X, uint16_t K_16) {
19 const __m128i zeros = _mm_set1_epi16(0);
20 const __m128i ones = _mm_set1_epi16(1);
21
22 const __m128i K = _mm_set1_epi16(K_16);
23
24 const __m128i X_is_zero = _mm_cmpeq_epi16(X, zeros);
25 const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
26
27 const __m128i mul_lo = _mm_mullo_epi16(X, K);
28 const __m128i mul_hi = _mm_mulhi_epu16(X, K);
29
30 __m128i T = _mm_sub_epi16(mul_lo, mul_hi);
31
32 // Unsigned compare; cmp = 1 if mul_lo < mul_hi else 0
33 const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
34 const __m128i cmp = _mm_min_epu8(_mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
35
36 T = _mm_add_epi16(T, cmp);
37
38 /* Selection: if X[i] is zero then assign 1-K
39 if K is zero then assign 1-X[i]
40
41 Could if() off value of K_16 for the second, but this gives a
42 constant time implementation which is a nice bonus.
43 */
44
45 T = _mm_or_si128(_mm_andnot_si128(X_is_zero, T), _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
46
47 T = _mm_or_si128(_mm_andnot_si128(K_is_zero, T), _mm_and_si128(_mm_sub_epi16(ones, X), K_is_zero));
48
49 return T;
50}
51
52/*
53* 4x8 matrix transpose
54*
55* FIXME: why do I need the extra set of unpack_epi32 here? Inverse in
56* transpose_out doesn't need it. Something with the shuffle? Removing
57* that extra unpack could easily save 3-4 cycles per block, and would
58* also help a lot with register pressure on 32-bit x86
59*/
60BOTAN_FN_ISA_SSE2 void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
61 __m128i T0 = _mm_unpackhi_epi32(B0, B1);
62 __m128i T1 = _mm_unpacklo_epi32(B0, B1);
63 __m128i T2 = _mm_unpackhi_epi32(B2, B3);
64 __m128i T3 = _mm_unpacklo_epi32(B2, B3);
65
66 __m128i T4 = _mm_unpacklo_epi32(T0, T1);
67 __m128i T5 = _mm_unpackhi_epi32(T0, T1);
68 __m128i T6 = _mm_unpacklo_epi32(T2, T3);
69 __m128i T7 = _mm_unpackhi_epi32(T2, T3);
70
71 T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
72 T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
73 T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
74 T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
75
76 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
77 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
78 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
79 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
80
81 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
82 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
83 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
84 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
85
86 B0 = _mm_unpacklo_epi64(T0, T2);
87 B1 = _mm_unpackhi_epi64(T0, T2);
88 B2 = _mm_unpacklo_epi64(T1, T3);
89 B3 = _mm_unpackhi_epi64(T1, T3);
90}
91
92/*
93* 4x8 matrix transpose (reverse)
94*/
95BOTAN_FN_ISA_SSE2 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
96 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
97 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
98 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
99 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
100
101 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
102 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
103 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
104 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
105
106 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
107 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
108 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
109 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
110
111 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
112 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
113 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
114 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
115
116 B0 = _mm_unpacklo_epi32(T0, T1);
117 B1 = _mm_unpackhi_epi32(T0, T1);
118 B2 = _mm_unpacklo_epi32(T2, T3);
119 B3 = _mm_unpackhi_epi32(T2, T3);
120}
121
122} // namespace
123
124/*
125* 8 wide IDEA encryption/decryption in SSE2
126*/
127BOTAN_FN_ISA_SSE2 void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) {
128 CT::poison(in, 64);
129 CT::poison(out, 64);
130 CT::poison(EK, 52);
131
132 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
133
134 __m128i B0 = _mm_loadu_si128(in_mm + 0);
135 __m128i B1 = _mm_loadu_si128(in_mm + 1);
136 __m128i B2 = _mm_loadu_si128(in_mm + 2);
137 __m128i B3 = _mm_loadu_si128(in_mm + 3);
138
139 transpose_in(B0, B1, B2, B3);
140
141 // byte swap
142 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
143 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
144 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
145 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
146
147 for(size_t i = 0; i != 8; ++i) {
148 B0 = mul(B0, EK[6 * i + 0]);
149 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6 * i + 1]));
150 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6 * i + 2]));
151 B3 = mul(B3, EK[6 * i + 3]);
152
153 __m128i T0 = B2;
154 B2 = _mm_xor_si128(B2, B0);
155 B2 = mul(B2, EK[6 * i + 4]);
156
157 __m128i T1 = B1;
158
159 B1 = _mm_xor_si128(B1, B3);
160 B1 = _mm_add_epi16(B1, B2);
161 B1 = mul(B1, EK[6 * i + 5]);
162
163 B2 = _mm_add_epi16(B2, B1);
164
165 B0 = _mm_xor_si128(B0, B1);
166 B1 = _mm_xor_si128(B1, T0);
167 B3 = _mm_xor_si128(B3, B2);
168 B2 = _mm_xor_si128(B2, T1);
169 }
170
171 B0 = mul(B0, EK[48]);
172 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
173 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
174 B3 = mul(B3, EK[51]);
175
176 // byte swap
177 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
178 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
179 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
180 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
181
182 transpose_out(B0, B2, B1, B3);
183
184 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
185
186 _mm_storeu_si128(out_mm + 0, B0);
187 _mm_storeu_si128(out_mm + 1, B2);
188 _mm_storeu_si128(out_mm + 2, B1);
189 _mm_storeu_si128(out_mm + 3, B3);
190
191 CT::unpoison(in, 64);
192 CT::unpoison(out, 64);
193 CT::unpoison(EK, 52);
194}
195
196} // namespace Botan
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:65
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:54