8#include <botan/internal/idea.h>
10#include <botan/internal/ct_utils.h>
// Lane-wise 16-bit modular multiply of vector X by a single key word K_16.
// NOTE(review): the function header is not visible in this view; this is
// presumably the body of the mul(X, K_16) helper called from
// sse2_idea_op_8 -- confirm.
//
// For non-zero lanes the 32-bit product is X*K = mul_hi*2^16 + mul_lo, and
// 2^16 == -1 (mod 2^16+1), so the product reduces to mul_lo - mul_hi, plus 1
// whenever that 16-bit subtraction wrapped. A zero lane is then patched up
// by the two mask selects at the end, which matches multiplication modulo
// 2^16+1 with 0 standing for 2^16.

// Per-lane constants 0 and 1 (eight 16-bit lanes each).
18 const __m128i zeros = _mm_set1_epi16(0);
19 const __m128i ones = _mm_set1_epi16(1);
// Broadcast the scalar key word into every 16-bit lane.
21 const __m128i
K = _mm_set1_epi16(K_16);
// All-ones lane masks marking where X (resp. K) is zero; used for the
// branch-free selects below.
23 const __m128i X_is_zero = _mm_cmpeq_epi16(
X, zeros);
24 const __m128i K_is_zero = _mm_cmpeq_epi16(K, zeros);
// Low and high 16 bits of the unsigned 16x16 -> 32 bit product per lane.
26 const __m128i mul_lo = _mm_mullo_epi16(
X, K);
27 const __m128i mul_hi = _mm_mulhi_epu16(
X, K);
// Wrapping 16-bit difference lo - hi.
29 __m128i
T = _mm_sub_epi16(mul_lo, mul_hi);
// Unsigned compare built from SSE2 primitives: the saturating difference
// hi - lo is non-zero exactly when mul_lo < mul_hi. OR-ing it with itself
// shifted right by 8 makes the low byte non-zero iff any bit of the lane is
// set; the byte-wise min against 0x0001 then clamps the low byte to 0/1 and
// clears the high byte, so cmp is 1 where mul_lo < mul_hi and 0 elsewhere.
32 const __m128i subs = _mm_subs_epu16(mul_hi, mul_lo);
33 const __m128i cmp = _mm_min_epu8(_mm_or_si128(subs, _mm_srli_epi16(subs, 8)), ones);
// Add the +1 correction in lanes where the subtraction wrapped.
35 T = _mm_add_epi16(
T, cmp);
// Mask select (no data-dependent branch): where X is zero, replace the
// result with 1 - K; other lanes keep T.
44 T = _mm_or_si128(_mm_andnot_si128(X_is_zero,
T), _mm_and_si128(_mm_sub_epi16(ones, K), X_is_zero));
// Same select for the key: where K is zero, replace the result with 1 - X.
// K is a broadcast, so this mask is either all lanes or none.
46 T = _mm_or_si128(_mm_andnot_si128(K_is_zero,
T), _mm_and_si128(_mm_sub_epi16(ones,
X), K_is_zero));
// NOTE(review): no return statement or closing brace is visible in this
// view; T holds the computed result at this point.
// Rearranges the four 128-bit registers B0..B3 in place using only
// unpack and shuffle steps (no arithmetic); results are written back
// through the reference parameters.
// NOTE(review): presumably a 4x8 transpose of 16-bit words, so that after
// the call each register holds the same word position from eight 64-bit
// blocks -- confirm against the caller's data layout.
59BOTAN_FUNC_ISA(
"sse2") void transpose_in(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
// First pass: interleave 32-bit elements of (B0,B1) and of (B2,B3).
// Note the high halves go to T0/T2 and the low halves to T1/T3.
60 __m128i T0 = _mm_unpackhi_epi32(B0, B1);
61 __m128i T1 = _mm_unpacklo_epi32(B0, B1);
62 __m128i T2 = _mm_unpackhi_epi32(B2, B3);
63 __m128i T3 = _mm_unpacklo_epi32(B2, B3);
// Second pass: interleave 32-bit elements of the first-pass results.
65 __m128i T4 = _mm_unpacklo_epi32(T0, T1);
66 __m128i T5 = _mm_unpackhi_epi32(T0, T1);
67 __m128i T6 = _mm_unpacklo_epi32(T2, T3);
68 __m128i T7 = _mm_unpackhi_epi32(T2, T3);
// Permute the upper four 16-bit words of each register; with
// _MM_SHUFFLE(1, 3, 0, 2) destination words 0..3 take source words
// 2, 0, 3, 1 respectively. The lower four words are copied unchanged.
70 T0 = _mm_shufflehi_epi16(T4, _MM_SHUFFLE(1, 3, 0, 2));
71 T1 = _mm_shufflehi_epi16(T5, _MM_SHUFFLE(1, 3, 0, 2));
72 T2 = _mm_shufflehi_epi16(T6, _MM_SHUFFLE(1, 3, 0, 2));
73 T3 = _mm_shufflehi_epi16(T7, _MM_SHUFFLE(1, 3, 0, 2));
// Apply the same word permutation to the lower four 16-bit words.
75 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(1, 3, 0, 2));
76 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(1, 3, 0, 2));
77 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(1, 3, 0, 2));
78 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(1, 3, 0, 2));
// Swap the two middle 32-bit elements of each register
// (_MM_SHUFFLE(3, 1, 2, 0) selects source elements 0, 2, 1, 3).
80 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
81 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
82 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
83 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Final step: pair up 64-bit halves. B0/B1 come from T0 and T2,
// B2/B3 from T1 and T3.
85 B0 = _mm_unpacklo_epi64(T0, T2);
86 B1 = _mm_unpackhi_epi64(T0, T2);
87 B2 = _mm_unpacklo_epi64(T1, T3);
88 B3 = _mm_unpackhi_epi64(T1, T3);
// NOTE(review): the closing brace of this function is not visible in this view.
// Rearranges the four 128-bit registers B0..B3 in place using unpack and
// shuffle steps only; results are written back through the reference
// parameters.
// NOTE(review): presumably the inverse of transpose_in, turning the
// word-sliced layout back into consecutive blocks -- confirm. It uses one
// fewer unpack pass than transpose_in.
94BOTAN_FUNC_ISA(
"sse2") void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3) {
// Pair up 64-bit halves: low halves of (B0,B1) and (B2,B3) into T0/T1,
// high halves into T2/T3.
95 __m128i T0 = _mm_unpacklo_epi64(B0, B1);
96 __m128i T1 = _mm_unpacklo_epi64(B2, B3);
97 __m128i T2 = _mm_unpackhi_epi64(B0, B1);
98 __m128i T3 = _mm_unpackhi_epi64(B2, B3);
// Swap the two middle 32-bit elements of each register
// (_MM_SHUFFLE(3, 1, 2, 0) selects source elements 0, 2, 1, 3).
100 T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
101 T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
102 T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
103 T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Swap the two middle 16-bit words within the upper four words.
105 T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
106 T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
107 T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
108 T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Same swap within the lower four words.
110 T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
111 T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
112 T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
113 T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
// Interleave 32-bit elements to produce the four output registers.
115 B0 = _mm_unpacklo_epi32(T0, T1);
116 B1 = _mm_unpackhi_epi32(T0, T1);
117 B2 = _mm_unpacklo_epi32(T2, T3);
118 B3 = _mm_unpackhi_epi32(T2, T3);
// NOTE(review): the closing brace of this function is not visible in this view.
// Processes 64 bytes from `in` into 64 bytes at `out` with SSE2, using the
// 52-entry 16-bit key schedule EK: eight rounds of six key words each
// (EK[0..47]) followed by a final step with EK[48..51].
// NOTE(review): 64 bytes is presumably eight 64-bit IDEA blocks handled in
// parallel, one per 16-bit lane -- confirm. Whether this encrypts or decrypts
// is determined by the EK the caller passes; nothing here distinguishes them.
126BOTAN_FUNC_ISA(
"sse2") void
IDEA::sse2_idea_op_8(const uint8_t in[64], uint8_t out[64], const uint16_t EK[52]) {
// View the input as four 128-bit chunks; the unaligned-load intrinsic is
// used below, so `in` needs no particular alignment.
131 const __m128i* in_mm =
reinterpret_cast<const __m128i*
>(in);
133 __m128i B0 = _mm_loadu_si128(in_mm + 0);
134 __m128i B1 = _mm_loadu_si128(in_mm + 1);
135 __m128i B2 = _mm_loadu_si128(in_mm + 2);
136 __m128i B3 = _mm_loadu_si128(in_mm + 3);
// Regroup the loaded words across the four registers (see transpose_in).
138 transpose_in(B0, B1, B2, B3);
// Swap the two bytes of every 16-bit lane (shift-left-8 OR shift-right-8).
// NOTE(review): presumably converts big-endian input words to the
// little-endian lane order the arithmetic below expects -- confirm.
141 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
142 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
143 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
144 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
// Eight rounds; round i consumes key words EK[6*i + 0 .. 6*i + 5].
146 for(
size_t i = 0; i != 8; ++i) {
// Key mixing: modular multiply on B0 and B3, wrapping 16-bit add on B1 and B2.
147 B0 = mul(B0, EK[6 * i + 0]);
148 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[6 * i + 1]));
149 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[6 * i + 2]));
150 B3 = mul(B3, EK[6 * i + 3]);
// Mixing stage, first half: B2 ^= B0, then multiply by the fifth round key.
153 B2 = _mm_xor_si128(B2, B0);
154 B2 = mul(B2, EK[6 * i + 4]);
// Mixing stage, second half: B1 ^= B3, add the first half, multiply by the
// sixth round key.
158 B1 = _mm_xor_si128(B1, B3);
159 B1 = _mm_add_epi16(B1, B2);
160 B1 = mul(B1, EK[6 * i + 5]);
162 B2 = _mm_add_epi16(B2, B1);
// Fold the mixing results back into the four words.
// NOTE(review): T0 and T1 are used here but no declaration of either is
// visible in this view; presumably they hold copies of B2 and B1 saved
// before the mixing stage overwrote them -- confirm against the full file.
164 B0 = _mm_xor_si128(B0, B1);
165 B1 = _mm_xor_si128(B1, T0);
166 B3 = _mm_xor_si128(B3, B2);
167 B2 = _mm_xor_si128(B2, T1);
// NOTE(review): the loop's closing brace is not visible in this view; the
// statements below use fixed key indices and presumably run once, after
// the loop.
// Final key mixing with EK[48..51]. Note the crossed indices: B1 takes
// EK[50] and B2 takes EK[49].
170 B0 = mul(B0, EK[48]);
171 B1 = _mm_add_epi16(B1, _mm_set1_epi16(EK[50]));
172 B2 = _mm_add_epi16(B2, _mm_set1_epi16(EK[49]));
173 B3 = mul(B3, EK[51]);
// Swap the bytes of every 16-bit lane again, mirroring the swap on input.
176 B0 = _mm_or_si128(_mm_slli_epi16(B0, 8), _mm_srli_epi16(B0, 8));
177 B1 = _mm_or_si128(_mm_slli_epi16(B1, 8), _mm_srli_epi16(B1, 8));
178 B2 = _mm_or_si128(_mm_slli_epi16(B2, 8), _mm_srli_epi16(B2, 8));
179 B3 = _mm_or_si128(_mm_slli_epi16(B3, 8), _mm_srli_epi16(B3, 8));
// Regroup for output. B2 and B1 are deliberately passed in swapped
// positions, consistent with the crossed key indices above.
181 transpose_out(B0, B2, B1, B3);
// Store the four registers with unaligned stores; because of the swapped
// argument order above, the register order written is B0, B2, B1, B3.
183 __m128i* out_mm =
reinterpret_cast<__m128i*
>(out);
185 _mm_storeu_si128(out_mm + 0, B0);
186 _mm_storeu_si128(out_mm + 1, B2);
187 _mm_storeu_si128(out_mm + 2, B1);
188 _mm_storeu_si128(out_mm + 3, B3);
// NOTE(review): the closing brace of this function is not visible in this view.
// Function-annotation macro. As defined on this line it has an empty
// replacement list, so BOTAN_FUNC_ISA(isa) expands to nothing and the isa
// argument is discarded.
// NOTE(review): this definition appears after the macro's uses in this view;
// presumably the real definition comes from a project header and may expand
// to a compiler-specific target attribute -- confirm.
#define BOTAN_FUNC_ISA(isa)
constexpr void unpoison(const T *p, size_t n)
constexpr void poison(const T *p, size_t n)