7#include <botan/internal/idea.h>
9#include <botan/internal/ct_utils.h>
10#include <botan/internal/isa_extn.h>
/**
* Thin value wrapper around one 256-bit AVX2 register (__m256i). The member
* functions in this class operate on it with the *_epi16 intrinsics, i.e. as
* sixteen independent 16-bit lanes.
*/
22class SIMD_16x16 final {
// Underlying register type held by the wrapper.
24 using native_type = __m256i;
// Copying, moving and destruction are all compiler-generated.
26 SIMD_16x16(
const SIMD_16x16&) =
default;
27 SIMD_16x16& operator=(
const SIMD_16x16&) =
default;
28 SIMD_16x16(SIMD_16x16&&) =
default;
29 SIMD_16x16& operator=(SIMD_16x16&&) =
default;
30 ~SIMD_16x16() =
default;
32 BOTAN_FN_ISA_AVX2
explicit SIMD_16x16(native_type x) : m_simd(x) {}
34 static SIMD_16x16 BOTAN_FN_ISA_AVX2
load_le(
const uint8_t in[]) {
35 return SIMD_16x16(_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(in)));
38 void BOTAN_FN_ISA_AVX2
store_le(uint8_t out[])
const {
39 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(out), m_simd);
42 static SIMD_16x16 BOTAN_FN_ISA_AVX2
load_be(
const uint8_t in[]) {
return load_le(in).bswap(); }
44 void BOTAN_FN_ISA_AVX2
store_be(uint8_t out[])
const { bswap().store_le(out); }
46 SIMD_16x16 BOTAN_FN_ISA_AVX2 bswap()
const {
48 const auto bswap_tbl = _mm256_set_epi8(
49 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
50 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
52 return SIMD_16x16(_mm256_shuffle_epi8(m_simd, bswap_tbl));
55 SIMD_16x16 BOTAN_FN_ISA_AVX2
operator-(
const SIMD_16x16& o)
const {
56 return SIMD_16x16(_mm256_sub_epi16(m_simd, o.m_simd));
59 SIMD_16x16 BOTAN_FN_ISA_AVX2
operator^(
const SIMD_16x16& o)
const {
60 return SIMD_16x16(_mm256_xor_si256(m_simd, o.m_simd));
63 void BOTAN_FN_ISA_AVX2
operator+=(
const SIMD_16x16& o) { m_simd = _mm256_add_epi16(m_simd, o.m_simd); }
65 void BOTAN_FN_ISA_AVX2
operator+=(uint16_t v) { m_simd = _mm256_add_epi16(m_simd, _mm256_set1_epi16(v)); }
67 void BOTAN_FN_ISA_AVX2
operator^=(
const SIMD_16x16& o) { m_simd = _mm256_xor_si256(m_simd, o.m_simd); }
/**
* Multiply every 16-bit lane of X by the scalar K_16, reducing the result as
* the name indicates (modulo 65537), without branching on the data.
*
* Lanes where X is zero are replaced by 1 - K and, if K_16 is zero, every
* lane is replaced by 1 - X (both modulo 2^16).
* NOTE(review): presumably a zero operand stands for 2^16, as in the IDEA
* cipher's multiplication - confirm against the scalar implementation.
* NOTE(review): no return statement is visible in this view of the function;
* the result is presumably T.
*/
69 static inline BOTAN_FN_ISA_AVX2 SIMD_16x16 mul_mod_65537(SIMD_16x16 X, uint16_t K_16) {
// Broadcast the constants 0 and 1 and the key word to all sixteen lanes.
70 const auto zeros = SIMD_16x16::splat(0);
71 const auto ones = SIMD_16x16::splat(1);
72 const auto K = SIMD_16x16::splat(K_16);
// Per-lane masks: all bits set where the operand equals zero.
75 const auto X_is_zero = SIMD_16x16(_mm256_cmpeq_epi16(X.raw(), zeros.raw()));
76 const auto K_is_zero = SIMD_16x16(_mm256_cmpeq_epi16(K.raw(), zeros.raw()));
// Low (ml) and high (mh) 16-bit halves of the unsigned 32-bit product X * K.
78 const auto ml = SIMD_16x16(_mm256_mullo_epi16(X.raw(), K.raw()));
79 const auto mh = SIMD_16x16(_mm256_mulhi_epu16(X.raw(), K.raw()));
// cmpgt is a signed compare; flipping the top bit of both sides makes it
// order the lanes as unsigned values. borrow is all-ones (-1) where mh > ml.
82 const auto bias = SIMD_16x16::splat(0x8000);
83 const auto borrow = (mh ^ bias).cmpgt(ml ^ bias);
// Subtracting the -1 mask adds one exactly in the lanes where mh > ml.
86 auto T = ml - mh - borrow;
// Override the zero-operand lanes; the K_is_zero selection is applied last
// and therefore wins in lanes where both operands are zero.
89 T = T.select_u16(ones - K, X_is_zero);
90 T = T.select_u16(ones - X, K_is_zero);
98 static void BOTAN_FN_ISA_AVX2 transpose_in(SIMD_16x16& B0, SIMD_16x16& B1, SIMD_16x16& B2, SIMD_16x16& B3) {
99 auto B0r = _mm256_shuffle_epi32(B0.raw(), _MM_SHUFFLE(3, 1, 2, 0));
100 auto B1r = _mm256_shuffle_epi32(B1.raw(), _MM_SHUFFLE(3, 1, 2, 0));
101 auto B2r = _mm256_shuffle_epi32(B2.raw(), _MM_SHUFFLE(3, 1, 2, 0));
102 auto B3r = _mm256_shuffle_epi32(B3.raw(), _MM_SHUFFLE(3, 1, 2, 0));
104 B0r = _mm256_shufflelo_epi16(B0r, _MM_SHUFFLE(3, 1, 2, 0));
105 B1r = _mm256_shufflelo_epi16(B1r, _MM_SHUFFLE(3, 1, 2, 0));
106 B2r = _mm256_shufflelo_epi16(B2r, _MM_SHUFFLE(3, 1, 2, 0));
107 B3r = _mm256_shufflelo_epi16(B3r, _MM_SHUFFLE(3, 1, 2, 0));
109 B0r = _mm256_shufflehi_epi16(B0r, _MM_SHUFFLE(3, 1, 2, 0));
110 B1r = _mm256_shufflehi_epi16(B1r, _MM_SHUFFLE(3, 1, 2, 0));
111 B2r = _mm256_shufflehi_epi16(B2r, _MM_SHUFFLE(3, 1, 2, 0));
112 B3r = _mm256_shufflehi_epi16(B3r, _MM_SHUFFLE(3, 1, 2, 0));
114 const auto T0 = _mm256_unpacklo_epi32(B0r, B1r);
115 const auto T1 = _mm256_unpackhi_epi32(B0r, B1r);
116 const auto T2 = _mm256_unpacklo_epi32(B2r, B3r);
117 const auto T3 = _mm256_unpackhi_epi32(B2r, B3r);
119 B0 = SIMD_16x16(_mm256_unpacklo_epi64(T0, T2));
120 B1 = SIMD_16x16(_mm256_unpackhi_epi64(T0, T2));
121 B2 = SIMD_16x16(_mm256_unpacklo_epi64(T1, T3));
122 B3 = SIMD_16x16(_mm256_unpackhi_epi64(T1, T3));
128 static void BOTAN_FN_ISA_AVX2 transpose_out(SIMD_16x16& B0, SIMD_16x16& B1, SIMD_16x16& B2, SIMD_16x16& B3) {
129 auto T0 = _mm256_unpacklo_epi64(B0.raw(), B1.raw());
130 auto T1 = _mm256_unpacklo_epi64(B2.raw(), B3.raw());
131 auto T2 = _mm256_unpackhi_epi64(B0.raw(), B1.raw());
132 auto T3 = _mm256_unpackhi_epi64(B2.raw(), B3.raw());
134 T0 = _mm256_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
135 T1 = _mm256_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
136 T2 = _mm256_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
137 T3 = _mm256_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));
139 T0 = _mm256_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
140 T1 = _mm256_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
141 T2 = _mm256_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
142 T3 = _mm256_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
144 T0 = _mm256_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
145 T1 = _mm256_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
146 T2 = _mm256_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
147 T3 = _mm256_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));
149 B0 = SIMD_16x16(_mm256_unpacklo_epi32(T0, T1));
150 B1 = SIMD_16x16(_mm256_unpackhi_epi32(T0, T1));
151 B2 = SIMD_16x16(_mm256_unpacklo_epi32(T2, T3));
152 B3 = SIMD_16x16(_mm256_unpackhi_epi32(T2, T3));
155 native_type BOTAN_FN_ISA_AVX2 raw()
const {
return m_simd; }
158 static SIMD_16x16 BOTAN_FN_ISA_AVX2 splat(uint16_t v) {
return SIMD_16x16(_mm256_set1_epi16(v)); }
160 SIMD_16x16 BOTAN_FN_ISA_AVX2 cmpgt(
const SIMD_16x16& o)
const {
161 return SIMD_16x16(_mm256_cmpgt_epi16(m_simd, o.m_simd));
164 SIMD_16x16 BOTAN_FN_ISA_AVX2 select_u16(
const SIMD_16x16& other,
const SIMD_16x16& mask)
const {
165 return SIMD_16x16(_mm256_blendv_epi8(m_simd, other.m_simd, mask.m_simd));
/**
* AVX2 implementation of the IDEA operation over 128 bytes at a time
* (presumably sixteen 8-byte blocks - confirm against the caller).
*
* @param in  128 input bytes, read as big-endian 16-bit words
* @param out 128 output bytes, written as big-endian 16-bit words
* @param EK  the 52 16-bit subkeys to apply
*
* NOTE(review): the numbers embedded in this listing skip between the visible
* statements of the round loop and of the output step, so some statements
* (and the closing braces) appear to be missing from this view.
*/
175BOTAN_FN_ISA_AVX2
void IDEA::avx2_idea_op_16(
const uint8_t in[128], uint8_t out[128],
const uint16_t EK[52]) {
// Load four 32-byte vectors, byte-swapping each 16-bit word on the way in.
180 auto B0 = SIMD_16x16::load_be(in + 0);
181 auto B1 = SIMD_16x16::load_be(in + 32);
182 auto B2 = SIMD_16x16::load_be(in + 64);
183 auto B3 = SIMD_16x16::load_be(in + 96);
// Rearrange the loaded words across B0..B3 before the rounds.
185 SIMD_16x16::transpose_in(B0, B1, B2, B3);
// Eight rounds; round i reads subkeys starting at EK[6 * i].
187 for(
size_t i = 0; i != 8; ++i) {
188 B0 = SIMD_16x16::mul_mod_65537(B0, EK[6 * i + 0]);
191 B3 = SIMD_16x16::mul_mod_65537(B3, EK[6 * i + 3]);
195 B2 = SIMD_16x16::mul_mod_65537(B2, EK[6 * i + 4]);
201 B1 = SIMD_16x16::mul_mod_65537(B1, EK[6 * i + 5]);
// Output step using subkeys from the final group EK[48..51].
211 B0 = SIMD_16x16::mul_mod_65537(B0, EK[48]);
214 B3 = SIMD_16x16::mul_mod_65537(B3, EK[51]);
// B1 and B2 are passed in swapped positions here, and the stores below
// follow the same B0, B2, B1, B3 order.
216 SIMD_16x16::transpose_out(B0, B2, B1, B3);
218 B0.store_be(out + 0);
219 B2.store_be(out + 32);
220 B1.store_be(out + 64);
221 B3.store_be(out + 96);
constexpr void unpoison(const T *p, size_t n)
constexpr void poison(const T *p, size_t n)
OctetString operator^(const OctetString &k1, const OctetString &k2)
BigInt operator-(const BigInt &x, const BigInt &y)
constexpr auto store_le(ParamTs &&... params)
std::vector< uint8_t, Alloc > & operator^=(std::vector< uint8_t, Alloc > &out, const std::vector< uint8_t, Alloc2 > &in)
std::vector< T, Alloc > & operator+=(std::vector< T, Alloc > &out, const std::vector< T, Alloc2 > &in)
constexpr auto load_le(ParamTs &&... params)
constexpr auto store_be(ParamTs &&... params)
constexpr auto load_be(ParamTs &&... params)