Botan 3.11.1
Crypto and TLS for C++
idea_sse2.cpp
Go to the documentation of this file.
1/*
2* IDEA in SSE2
3* (C) 2009 Jack Lloyd
4*
5* Botan is released under the Simplified BSD License (see license.txt)
6*/
7
8#include <botan/internal/idea.h>
9
10#include <botan/internal/ct_utils.h>
11#include <botan/internal/isa_extn.h>
12#include <emmintrin.h>
13
14namespace Botan {
15
16// NOLINTBEGIN(portability-simd-intrinsics) TODO add various helper fns
17
18namespace {
19
20BOTAN_FN_ISA_SSE2 inline __m128i mul(__m128i X, uint16_t K_16) {
21 const __m128i zeros = _mm_set1_epi16(0);
22 const __m128i ones = _mm_set1_epi16(1);
23 const __m128i K = _mm_set1_epi16(K_16);
24
25 // If X == 0 or K == 0 then P == X * K == 0
26 const __m128i P_is_zero = _mm_or_si128(_mm_cmpeq_epi16(X, zeros), _mm_cmpeq_epi16(K, zeros));
27
28 // Return value if P == 0: 1 - X - K
29 const __m128i R0 = _mm_sub_epi16(_mm_sub_epi16(ones, X), K);
30
31 const __m128i mul_lo = _mm_mullo_epi16(X, K);
32 const __m128i mul_hi = _mm_mulhi_epu16(X, K);
33
34 __m128i R1 = _mm_sub_epi16(mul_lo, mul_hi);
35
36 // SSE doesn't have unsigned comparisons so emulate with a signed compare by flipping the sign bit
37 const __m128i sign_bit = _mm_set1_epi16(static_cast<int16_t>(0x8000));
38 const __m128i borrow = _mm_cmpgt_epi16(_mm_xor_si128(mul_hi, sign_bit), _mm_xor_si128(mul_lo, sign_bit));
39
40 // R1 = mul_lo - mul_hi + (mul_hi > mul_lo ? 1 : 0)
41 R1 = _mm_sub_epi16(R1, borrow);
42
43 // Return either R1 or R0 (1-X-K) depending on if P == 0 or not
44 return _mm_or_si128(_mm_andnot_si128(P_is_zero, R1), _mm_and_si128(P_is_zero, R0));
45}
46
47/*
48* 4x8 matrix transpose
49*/
50BOTAN_FN_ISA_SSE2 void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
51 B0 = _mm_shuffle_epi32(B0, _MM_SHUFFLE(3, 1, 2, 0));
52 B1 = _mm_shuffle_epi32(B1, _MM_SHUFFLE(3, 1, 2, 0));
53 B2 = _mm_shuffle_epi32(B2, _MM_SHUFFLE(3, 1, 2, 0));
54 B3 = _mm_shuffle_epi32(B3, _MM_SHUFFLE(3, 1, 2, 0));
55
56 B0 = _mm_shufflelo_epi16(B0, _MM_SHUFFLE(3, 1, 2, 0));
57 B1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(3, 1, 2, 0));
58 B2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(3, 1, 2, 0));
59 B3 = _mm_shufflelo_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0));
60
61 B0 = _mm_shufflehi_epi16(B0, _MM_SHUFFLE(3, 1, 2, 0));
62 B1 = _mm_shufflehi_epi16(B1, _MM_SHUFFLE(3, 1, 2, 0));
63 B2 = _mm_shufflehi_epi16(B2, _MM_SHUFFLE(3, 1, 2, 0));
64 B3 = _mm_shufflehi_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0));
65
66 const __m128i T0 = _mm_unpacklo_epi32(B0, B1);
67 const __m128i T1 = _mm_unpackhi_epi32(B0, B1);
68 const __m128i T2 = _mm_unpacklo_epi32(B2, B3);
69 const __m128i T3 = _mm_unpackhi_epi32(B2, B3);
70
71 B0 = _mm_unpacklo_epi64(T0, T2);
72 B1 = _mm_unpackhi_epi64(T0, T2);
73 B2 = _mm_unpacklo_epi64(T1, T3);
74 B3 = _mm_unpackhi_epi64(T1, T3);
75}
76
77/*
78* 4x8 matrix transpose (reverse)
79*/
80BOTAN_FN_ISA_SSE2 void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
81 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
82 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
83 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
84 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
85
86 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
87 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
88 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
89 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
90
91 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
92 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
93 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
94 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
95
96 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
97 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
98 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
99 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
100
101 B0 = _mm_unpacklo_epi32(T0, T1);
102 B1 = _mm_unpackhi_epi32(T0, T1);
103 B2 = _mm_unpacklo_epi32(T2, T3);
104 B3 = _mm_unpackhi_epi32(T2, T3);
105}
106
107} // namespace
108
109/*
110* 8 wide IDEA encryption/decryption in SSE2
111*/
112BOTAN_FN_ISA_SSE2 void IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) {
113 CT::poison(in, 64);
114 CT::poison(out, 64);
115 CT::poison(EK, 52);
116
117 const __m128i* in_mm = reinterpret_cast<const __m128i*>(in);
118
119 __m128i B0 = _mm_loadu_si128(in_mm + 0);
120 __m128i B1 = _mm_loadu_si128(in_mm + 1);
121 __m128i B2 = _mm_loadu_si128(in_mm + 2);
122 __m128i B3 = _mm_loadu_si128(in_mm + 3);
123
124 transpose_in(B0, B1, B2, B3);
125
126 // byte swap
127 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
128 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
129 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
130 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
131
132 for(size_t i = 0; i != 8; ++i) {
133 B0 = mul(B0, EK[6 * i + 0]);
134 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6 * i + 1]));
135 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6 * i + 2]));
136 B3 = mul(B3, EK[6 * i + 3]);
137
138 const __m128i T0 = B2;
139 B2 = _mm_xor_si128(B2, B0);
140 B2 = mul(B2, EK[6 * i + 4]);
141
142 const __m128i T1 = B1;
143
144 B1 = _mm_xor_si128(B1, B3);
145 B1 = _mm_add_epi16(B1, B2);
146 B1 = mul(B1, EK[6 * i + 5]);
147
148 B2 = _mm_add_epi16(B2, B1);
149
150 B0 = _mm_xor_si128(B0, B1);
151 B1 = _mm_xor_si128(B1, T0);
152 B3 = _mm_xor_si128(B3, B2);
153 B2 = _mm_xor_si128(B2, T1);
154 }
155
156 B0 = mul(B0, EK[48]);
157 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
158 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
159 B3 = mul(B3, EK[51]);
160
161 // byte swap
162 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
163 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
164 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
165 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
166
167 transpose_out(B0, B2, B1, B3);
168
169 __m128i* out_mm = reinterpret_cast<__m128i*>(out);
170
171 _mm_storeu_si128(out_mm + 0, B0);
172 _mm_storeu_si128(out_mm + 1, B2);
173 _mm_storeu_si128(out_mm + 2, B1);
174 _mm_storeu_si128(out_mm + 3, B3);
175
176 CT::unpoison(in, 64);
177 CT::unpoison(out, 64);
178 CT::unpoison(EK, 52);
179}
180
181// NOLINTEND(portability-simd-intrinsics)
182
183} // namespace Botan
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:67
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:56
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:21