Botan 3.11.1
Crypto and TLS for C++
idea_avx2.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/idea.h>
8
9#include <botan/internal/ct_utils.h>
10#include <botan/internal/isa_extn.h>
11#include <immintrin.h>
12
13namespace Botan {
14
15namespace {
16
17// NOLINTBEGIN(portability-simd-intrinsics)
18
19/*
20* SIMD type of 16 16-bit elements
21*/
22class SIMD_16x16 final {
      // Value wrapper around a single __m256i that is operated on as sixteen
      // 16-bit elements (all arithmetic below uses the *_epi16 intrinsics).
      // Every member is tagged BOTAN_FN_ISA_AVX2 (macro from isa_extn.h).
23 public:
24 using native_type = __m256i;
25
26 SIMD_16x16(const SIMD_16x16&) = default;
27 SIMD_16x16& operator=(const SIMD_16x16&) = default;
28 SIMD_16x16(SIMD_16x16&&) = default;
29 SIMD_16x16& operator=(SIMD_16x16&&) = default;
30 ~SIMD_16x16() = default;
31
      // Wrap an existing raw register; explicit so a __m256i never converts silently.
32 BOTAN_FN_ISA_AVX2 explicit SIMD_16x16(native_type x) : m_simd(x) {}
33
      // Load 32 bytes from `in` with an unaligned load, without any byte reordering.
34 static SIMD_16x16 BOTAN_FN_ISA_AVX2 load_le(const uint8_t in[]) {
35 return SIMD_16x16(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(in)));
36 }
37
      // Store all 32 bytes to `out` with an unaligned store, without any byte reordering.
38 void BOTAN_FN_ISA_AVX2 store_le(uint8_t out[]) const {
39 _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), m_simd);
40 }
41
      // Same as load_le, followed by a byte swap inside every 16-bit element.
42 static SIMD_16x16 BOTAN_FN_ISA_AVX2 load_be(const uint8_t in[]) { return load_le(in).bswap(); }
43
      // Byte swap every 16-bit element, then store as store_le does.
44 void BOTAN_FN_ISA_AVX2 store_be(uint8_t out[]) const { bswap().store_le(out); }
45
      // Returns a copy with the two bytes of each 16-bit element exchanged.
      // The 16-entry index pattern is written twice because _mm256_shuffle_epi8
      // shuffles bytes independently within each 128-bit half of the register.
46 SIMD_16x16 BOTAN_FN_ISA_AVX2 bswap() const {
47 // clang-format off
48 const auto bswap_tbl = _mm256_set_epi8(
49 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
50 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
51 // clang-format on
52 return SIMD_16x16(_mm256_shuffle_epi8(m_simd, bswap_tbl));
53 }
54
      // Element-wise 16-bit subtraction (wraps modulo 2^16).
55 SIMD_16x16 BOTAN_FN_ISA_AVX2 operator-(const SIMD_16x16& o) const {
56 return SIMD_16x16(_mm256_sub_epi16(m_simd, o.m_simd));
57 }
58
      // Bitwise XOR of the full 256-bit registers.
59 SIMD_16x16 BOTAN_FN_ISA_AVX2 operator^(const SIMD_16x16& o) const {
60 return SIMD_16x16(_mm256_xor_si256(m_simd, o.m_simd));
61 }
62
      // Element-wise 16-bit addition in place (wraps modulo 2^16).
63 void BOTAN_FN_ISA_AVX2 operator+=(const SIMD_16x16& o) { m_simd = _mm256_add_epi16(m_simd, o.m_simd); }
64
      // Adds the same scalar `v` to every 16-bit element in place.
65 void BOTAN_FN_ISA_AVX2 operator+=(uint16_t v) { m_simd = _mm256_add_epi16(m_simd, _mm256_set1_epi16(v)); }
66
      // Bitwise XOR in place.
67 void BOTAN_FN_ISA_AVX2 operator^=(const SIMD_16x16& o) { m_simd = _mm256_xor_si256(m_simd, o.m_simd); }
68
      // Multiplies every 16-bit element of X by the scalar K_16 modulo 65537 and
      // returns the 16-bit results. Both candidate results (R0 for a zero operand,
      // R1 otherwise) are always computed and the final value is picked with a
      // mask blend, so there is no data-dependent branch in this function.
      // NOTE(review): the zero-operand result 1 - X - K matches the usual IDEA
      // convention that a zero word stands for 2^16; that convention is not
      // spelled out anywhere in this file.
69 static inline BOTAN_FN_ISA_AVX2 SIMD_16x16 mul_mod_65537(SIMD_16x16 X, uint16_t K_16) {
70 const auto zeros = SIMD_16x16::splat(0);
71 const auto ones = SIMD_16x16::splat(1);
72 const auto K = SIMD_16x16::splat(K_16);
73
74 // If X == 0 or K == 0 then P == X * K == 0
      // (all-ones mask in every element where either operand is zero)
75 const auto P_is_zero = SIMD_16x16(
76 _mm256_or_si256(_mm256_cmpeq_epi16(X.raw(), zeros.raw()), _mm256_cmpeq_epi16(K.raw(), zeros.raw())));
77
78 // Return value if P == 0: 1 - X - K
79 const auto R0 = ones - X - K;
80
      // Low and high 16-bit halves of the unsigned 16x16 -> 32-bit product.
81 const auto mul_lo = SIMD_16x16(_mm256_mullo_epi16(X.raw(), K.raw()));
82 const auto mul_hi = SIMD_16x16(_mm256_mulhi_epu16(X.raw(), K.raw()));
83
84 // AVX2 doesn't have unsigned comparisons so emulate with a signed compare by flipping the sign bit
85 const auto sign_bit = SIMD_16x16::splat(0x8000);
86 const auto borrow = SIMD_16x16(_mm256_cmpgt_epi16((mul_hi ^ sign_bit).raw(), (mul_lo ^ sign_bit).raw()));
87
88 // R1 = mul_lo - mul_hi + (mul_hi > mul_lo ? 1 : 0)
      // `borrow` is an all-ones mask (i.e. -1 as a 16-bit value) where
      // mul_hi > mul_lo, so subtracting it is what adds the 1.
89 const auto R1 = mul_lo - mul_hi - borrow;
90
      // Take R0 in the elements flagged by P_is_zero, R1 everywhere else.
91 return SIMD_16x16(_mm256_blendv_epi8(R1.raw(), R0.raw(), P_is_zero.raw()));
92 }
93
94 /*
95 * 4x16 matrix transpose
96 */
      // In-place on the four arguments. Each input register is read as four
      // consecutive groups of four 16-bit words; on return Bk (k = 0..3) holds
      // word k of all sixteen groups. All shuffles/unpacks used here act within
      // each 128-bit half, so the order of the groups inside a result register
      // is a fixed permutation rather than the linear input order;
      // transpose_out applies the inverse rearrangement.
97 static void BOTAN_FN_ISA_AVX2 transpose_in(SIMD_16x16& B0, SIMD_16x16& B1, SIMD_16x16& B2, SIMD_16x16& B3) {
      // Step 1: within each 128-bit half, interleave the words of the two
      // groups held there (dword reorder, then word reorder of each 64-bit half).
98 auto B0r = _mm256_shuffle_epi32(B0.raw(), _MM_SHUFFLE(3, 1, 2, 0));
99 auto B1r = _mm256_shuffle_epi32(B1.raw(), _MM_SHUFFLE(3, 1, 2, 0));
100 auto B2r = _mm256_shuffle_epi32(B2.raw(), _MM_SHUFFLE(3, 1, 2, 0));
101 auto B3r = _mm256_shuffle_epi32(B3.raw(), _MM_SHUFFLE(3, 1, 2, 0));
102
103 B0r = _mm256_shufflelo_epi16(B0r, _MM_SHUFFLE(3, 1, 2, 0));
104 B1r = _mm256_shufflelo_epi16(B1r, _MM_SHUFFLE(3, 1, 2, 0));
105 B2r = _mm256_shufflelo_epi16(B2r, _MM_SHUFFLE(3, 1, 2, 0));
106 B3r = _mm256_shufflelo_epi16(B3r, _MM_SHUFFLE(3, 1, 2, 0));
107
108 B0r = _mm256_shufflehi_epi16(B0r, _MM_SHUFFLE(3, 1, 2, 0));
109 B1r = _mm256_shufflehi_epi16(B1r, _MM_SHUFFLE(3, 1, 2, 0));
110 B2r = _mm256_shufflehi_epi16(B2r, _MM_SHUFFLE(3, 1, 2, 0));
111 B3r = _mm256_shufflehi_epi16(B3r, _MM_SHUFFLE(3, 1, 2, 0));
112
      // Step 2: merge register pairs at 32-bit, then 64-bit granularity so that
      // equal word positions from all four inputs end up in the same register.
113 const auto T0 = _mm256_unpacklo_epi32(B0r, B1r);
114 const auto T1 = _mm256_unpackhi_epi32(B0r, B1r);
115 const auto T2 = _mm256_unpacklo_epi32(B2r, B3r);
116 const auto T3 = _mm256_unpackhi_epi32(B2r, B3r);
117
118 B0 = SIMD_16x16(_mm256_unpacklo_epi64(T0, T2));
119 B1 = SIMD_16x16(_mm256_unpackhi_epi64(T0, T2));
120 B2 = SIMD_16x16(_mm256_unpacklo_epi64(T1, T3));
121 B3 = SIMD_16x16(_mm256_unpackhi_epi64(T1, T3));
122 }
123
124 /*
125 * 4x16 matrix transpose (inverse)
126 */
      // In-place inverse of transpose_in when called with the arguments in the
      // same order: takes four registers that each hold one word position of
      // all groups and rebuilds registers of four consecutive four-word groups.
127 static void BOTAN_FN_ISA_AVX2 transpose_out(SIMD_16x16& B0, SIMD_16x16& B1, SIMD_16x16& B2, SIMD_16x16& B3) {
      // Step 1: pair up word positions (0 with 1, 2 with 3) at 64-bit granularity.
128 auto T0 = _mm256_unpacklo_epi64(B0.raw(), B1.raw());
129 auto T1 = _mm256_unpacklo_epi64(B2.raw(), B3.raw());
130 auto T2 = _mm256_unpackhi_epi64(B0.raw(), B1.raw());
131 auto T3 = _mm256_unpackhi_epi64(B2.raw(), B3.raw());
132
      // Step 2: reorder within each 128-bit half so the words belonging to one
      // group become adjacent.
133 T0 = _mm256_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
134 T1 = _mm256_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
135 T2 = _mm256_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
136 T3 = _mm256_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
137
138 T0 = _mm256_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
139 T1 = _mm256_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
140 T2 = _mm256_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
141 T3 = _mm256_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
142
143 T0 = _mm256_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
144 T1 = _mm256_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
145 T2 = _mm256_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
146 T3 = _mm256_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
147
      // Step 3: join the word-pair halves (0,1) and (2,3) into complete groups.
148 B0 = SIMD_16x16(_mm256_unpacklo_epi32(T0, T1));
149 B1 = SIMD_16x16(_mm256_unpackhi_epi32(T0, T1));
150 B2 = SIMD_16x16(_mm256_unpacklo_epi32(T2, T3));
151 B3 = SIMD_16x16(_mm256_unpackhi_epi32(T2, T3));
152 }
153
      // Access to the underlying register for use with raw intrinsics.
154 native_type BOTAN_FN_ISA_AVX2 raw() const { return m_simd; }
155
156 private:
      // Broadcast `v` into all sixteen 16-bit elements.
157 static SIMD_16x16 BOTAN_FN_ISA_AVX2 splat(uint16_t v) { return SIMD_16x16(_mm256_set1_epi16(v)); }
158
159 native_type m_simd;
160};
161
162// NOLINTEND(portability-simd-intrinsics)
163
164} // namespace
165
// Applies the IDEA round structure to 128 bytes of input at once: the input is
// read as sixteen groups of four big-endian 16-bit words, run through eight
// rounds that use six subkeys each (EK[0..47]) plus a final transform that uses
// EK[48..51], and the 128-byte result is written to `out` as big-endian words.
//
// in  - 128 bytes of input
// out - receives 128 bytes of output
// EK  - 52 16-bit subkeys
//
// NOTE(review): whether this encrypts or decrypts presumably depends on the
// key schedule the caller passes in EK; that is not visible from this function.
166BOTAN_FN_ISA_AVX2 void IDEA::avx2_idea_op_16(const uint8_t in[128], uint8_t out[128], const uint16_t EK[52]) {
      // NOTE(review): CT::poison / CT::unpoison come from ct_utils.h; presumably
      // they mark these buffers as secret for constant-time checking tools.
      // Confirm against that header.
167 CT::poison(in, 128);
168 CT::poison(out, 128);
169 CT::poison(EK, 52);
170
      // Four 32-byte big-endian loads cover the whole 128-byte input.
171 auto B0 = SIMD_16x16::load_be(in + 0);
172 auto B1 = SIMD_16x16::load_be(in + 32);
173 auto B2 = SIMD_16x16::load_be(in + 64);
174 auto B3 = SIMD_16x16::load_be(in + 96);
175
      // After this, Bk holds word k of every four-word group, so each round
      // step below processes all sixteen groups in one vector operation.
176 SIMD_16x16::transpose_in(B0, B1, B2, B3);
177
      // Eight rounds; round i consumes subkeys EK[6*i] .. EK[6*i + 5].
178 for(size_t i = 0; i != 8; ++i) {
      // Key mixing: words 0 and 3 are multiplied mod 65537, words 1 and 2 are
      // added mod 2^16.
179 B0 = SIMD_16x16::mul_mod_65537(B0, EK[6 * i + 0]);
180 B1 += EK[6 * i + 1];
181 B2 += EK[6 * i + 2];
182 B3 = SIMD_16x16::mul_mod_65537(B3, EK[6 * i + 3]);
183
      // T0 / T1 keep the key-mixed values of B2 / B1; they are XORed back in at
      // the end of the round, which also exchanges the roles of B1 and B2.
184 const auto T0 = B2;
185 B2 ^= B0;
186 B2 = SIMD_16x16::mul_mod_65537(B2, EK[6 * i + 4]);
187
188 const auto T1 = B1;
189
190 B1 ^= B3;
191 B1 += B2;
192 B1 = SIMD_16x16::mul_mod_65537(B1, EK[6 * i + 5]);
193
194 B2 += B1;
195
196 B0 ^= B1;
197 B1 ^= T0;
198 B3 ^= B2;
199 B2 ^= T1;
200 }
201
      // Final transform. B1 takes EK[50] and B2 takes EK[49]; this goes together
      // with B2 / B1 being passed and stored in swapped positions below, so B2
      // becomes the second output word and B1 the third.
202 B0 = SIMD_16x16::mul_mod_65537(B0, EK[48]);
203 B1 += EK[50];
204 B2 += EK[49];
205 B3 = SIMD_16x16::mul_mod_65537(B3, EK[51]);
206
      // Note the argument order (B0, B2, B1, B3): see the comment above.
207 SIMD_16x16::transpose_out(B0, B2, B1, B3);
208
      // Four 32-byte big-endian stores, in the same order as the transpose_out
      // arguments.
209 B0.store_be(out + 0);
210 B2.store_be(out + 32);
211 B1.store_be(out + 64);
212 B3.store_be(out + 96);
213
214 CT::unpoison(in, 128);
215 CT::unpoison(out, 128);
216 CT::unpoison(EK, 52);
217}
218
219} // namespace Botan
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:67
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:56
OctetString operator^(const OctetString &k1, const OctetString &k2)
Definition symkey.cpp:109
BigInt operator-(const BigInt &x, const BigInt &y)
Definition bigint.h:1111
constexpr auto store_le(ParamTs &&... params)
Definition loadstor.h:736
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
Definition mem_ops.h:445
void R1(uint32_t A, uint32_t &B, uint32_t C, uint32_t &D, uint32_t E, uint32_t &F, uint32_t G, uint32_t &H, uint32_t TJ, uint32_t Wi, uint32_t Wj)
Definition sm3_fn.h:21
std::vector< T, Alloc > & operator+=(std::vector< T, Alloc > &out, const std::vector< T, Alloc2 > &in)
Definition secmem.h:90
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:495
constexpr auto store_be(ParamTs &&... params)
Definition loadstor.h:745
constexpr auto load_be(ParamTs &&... params)
Definition loadstor.h:504