8#include <botan/internal/idea.h>
10#include <botan/internal/ct_utils.h>
11#include <botan/internal/isa_extn.h>
20BOTAN_FN_ISA_SSE2
inline __m128i mul(__m128i X, uint16_t K_16) {
21 const __m128i zeros = _mm_set1_epi16(0);
22 const __m128i ones = _mm_set1_epi16(1);
23 const __m128i K = _mm_set1_epi16(K_16);
26 const __m128i P_is_zero = _mm_or_si128(_mm_cmpeq_epi16(X, zeros), _mm_cmpeq_epi16(K, zeros));
29 const __m128i R0 = _mm_sub_epi16(_mm_sub_epi16(ones, X), K);
31 const __m128i mul_lo = _mm_mullo_epi16(X, K);
32 const __m128i mul_hi = _mm_mulhi_epu16(X, K);
34 __m128i
R1 = _mm_sub_epi16(mul_lo, mul_hi);
37 const __m128i sign_bit = _mm_set1_epi16(
static_cast<int16_t
>(0x8000));
38 const __m128i borrow = _mm_cmpgt_epi16(_mm_xor_si128(mul_hi, sign_bit), _mm_xor_si128(mul_lo, sign_bit));
41 R1 = _mm_sub_epi16(
R1, borrow);
44 return _mm_or_si128(_mm_andnot_si128(P_is_zero,
R1), _mm_and_si128(P_is_zero, R0));
50BOTAN_FN_ISA_SSE2
void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
51 B0 = _mm_shuffle_epi32(B0, _MM_SHUFFLE(3, 1, 2, 0));
52 B1 = _mm_shuffle_epi32(B1, _MM_SHUFFLE(3, 1, 2, 0));
53 B2 = _mm_shuffle_epi32(B2, _MM_SHUFFLE(3, 1, 2, 0));
54 B3 = _mm_shuffle_epi32(B3, _MM_SHUFFLE(3, 1, 2, 0));
56 B0 = _mm_shufflelo_epi16(B0, _MM_SHUFFLE(3, 1, 2, 0));
57 B1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(3, 1, 2, 0));
58 B2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(3, 1, 2, 0));
59 B3 = _mm_shufflelo_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0));
61 B0 = _mm_shufflehi_epi16(B0, _MM_SHUFFLE(3, 1, 2, 0));
62 B1 = _mm_shufflehi_epi16(B1, _MM_SHUFFLE(3, 1, 2, 0));
63 B2 = _mm_shufflehi_epi16(B2, _MM_SHUFFLE(3, 1, 2, 0));
64 B3 = _mm_shufflehi_epi16(B3, _MM_SHUFFLE(3, 1, 2, 0));
66 const __m128i T0 = _mm_unpacklo_epi32(B0, B1);
67 const __m128i T1 = _mm_unpackhi_epi32(B0, B1);
68 const __m128i T2 = _mm_unpacklo_epi32(B2, B3);
69 const __m128i T3 = _mm_unpackhi_epi32(B2, B3);
71 B0 = _mm_unpacklo_epi64(T0, T2);
72 B1 = _mm_unpackhi_epi64(T0, T2);
73 B2 = _mm_unpacklo_epi64(T1, T3);
74 B3 = _mm_unpackhi_epi64(T1, T3);
80BOTAN_FN_ISA_SSE2
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
81 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
82 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
83 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
84 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
86 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
87 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
88 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
89 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
91 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
92 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
93 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
94 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
96 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
97 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
98 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
99 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
101 B0 = _mm_unpacklo_epi32(T0, T1);
102 B1 = _mm_unpackhi_epi32(T0, T1);
103 B2 = _mm_unpacklo_epi32(T2, T3);
104 B3 = _mm_unpackhi_epi32(T2, T3);
112BOTAN_FN_ISA_SSE2
void IDEA::sse2_idea_op_8(
const uint8_t in[64], uint8_t out[64],
const uint16_t EK[52]) {
117 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
119 __m128i B0 = _mm_loadu_si128(in_mm + 0);
120 __m128i B1 = _mm_loadu_si128(in_mm + 1);
121 __m128i B2 = _mm_loadu_si128(in_mm + 2);
122 __m128i B3 = _mm_loadu_si128(in_mm + 3);
124 transpose_in(B0, B1, B2, B3);
127 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
128 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
129 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
130 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
132 for(
size_t i = 0; i != 8; ++i) {
133 B0 = mul(B0, EK[6 * i + 0]);
134 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6 * i + 1]));
135 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6 * i + 2]));
136 B3 = mul(B3, EK[6 * i + 3]);
138 const __m128i T0 = B2;
139 B2 = _mm_xor_si128(B2, B0);
140 B2 = mul(B2, EK[6 * i + 4]);
142 const __m128i T1 = B1;
144 B1 = _mm_xor_si128(B1, B3);
145 B1 = _mm_add_epi16(B1, B2);
146 B1 = mul(B1, EK[6 * i + 5]);
148 B2 = _mm_add_epi16(B2, B1);
150 B0 = _mm_xor_si128(B0, B1);
151 B1 = _mm_xor_si128(B1, T0);
152 B3 = _mm_xor_si128(B3, B2);
153 B2 = _mm_xor_si128(B2, T1);
156 B0 = mul(B0, EK[48]);
157 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
158 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
159 B3 = mul(B3, EK[51]);
162 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
163 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
164 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
165 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
167 transpose_out(B0, B2, B1, B3);
169 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
171 _mm_storeu_si128(out_mm + 0, B0);
172 _mm_storeu_si128(out_mm + 1, B2);
173 _mm_storeu_si128(out_mm + 2, B1);
174 _mm_storeu_si128(out_mm + 3, B3);
constexpr void unpoison(const T *p, size_t n)
constexpr void poison(const T *p, size_t n)
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)