Botan 3.11.1
Crypto and TLS for C++
twofish_avx512.cpp
Go to the documentation of this file.
1/*
2* (C) 2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#include <botan/internal/twofish.h>
8
9#include <botan/internal/isa_extn.h>
10#include <botan/internal/simd_avx512.h>
11#include <immintrin.h>
12
13namespace Botan {
14
15namespace {
16
17namespace Twofish_AVX512 {
18
19// NOLINTBEGIN(portability-simd-intrinsics)
20
21template <size_t N>
22BOTAN_FN_ISA_AVX512_GFNI BOTAN_FORCE_INLINE __m512i lookup_sbox(const SIMD_16x32 W, const uint8_t* QS) {
23 static_assert(N < 4);
24
25 // Parallel sbox lookup using permutations + blend
26
27 const auto q0 = _mm512_loadu_si512(QS);
28 const auto q1 = _mm512_loadu_si512(QS + 64);
29 const auto q2 = _mm512_loadu_si512(QS + 128);
30 const auto q3 = _mm512_loadu_si512(QS + 192);
31
32 const auto bytemask = _mm512_set1_epi32(0xFF);
33 const auto idx = _mm512_and_si512(_mm512_srli_epi32(W.raw(), N * 8), bytemask);
34
35 // Select on both Q[0-128] and Q[128-256] using the low 7 bits
36 const __m512i lo = _mm512_permutex2var_epi8(q0, idx, q1);
37 const __m512i hi = _mm512_permutex2var_epi8(q2, idx, q3);
38
39 // Then select between those results using the top bit
40 return _mm512_mask_blend_epi8(_mm512_movepi8_mask(idx), lo, hi);
41}
42
43BOTAN_FN_ISA_AVX512_GFNI
44BOTAN_FORCE_INLINE SIMD_16x32 apply_mds(__m512i q, __m512i mds_gfni) {
45 // clang-format off
46 alignas(64) constexpr uint8_t MDS_PRE_SHUFFLE[64] = {
47 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28,
48 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28,
49 32, 36, 40, 44, 48, 52, 56, 60, 32, 36, 40, 44, 48, 52, 56, 60,
50 32, 36, 40, 44, 48, 52, 56, 60, 32, 36, 40, 44, 48, 52, 56, 60,
51 };
52
53 alignas(64) constexpr uint8_t MDS_POST_SHUFFLE[64] = {
54 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
55 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31,
56 32, 40, 48, 56, 33, 41, 49, 57, 34, 42, 50, 58, 35, 43, 51, 59,
57 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63,
58 };
59 // clang-format on
60
61 const __m512i pre = _mm512_permutexvar_epi8(_mm512_load_si512(MDS_PRE_SHUFFLE), q);
62 const __m512i transformed = _mm512_gf2p8affine_epi64_epi8(pre, mds_gfni, 0);
63 return SIMD_16x32(_mm512_permutexvar_epi8(_mm512_load_si512(MDS_POST_SHUFFLE), transformed));
64}
65
66BOTAN_FN_ISA_AVX512_GFNI
67BOTAN_FORCE_INLINE SIMD_16x32 g_func(SIMD_16x32 W, const uint8_t* QS) {
68 constexpr uint64_t GFNI_ID = 0x0102040810204080;
69 constexpr uint64_t GFNI_5B = 0x050B162953A24182;
70 constexpr uint64_t GFNI_EF = 0x070F1F3972E3C183;
71
72 const __m512i MDS0 = _mm512_set_epi64(GFNI_EF, GFNI_EF, GFNI_5B, GFNI_ID, GFNI_EF, GFNI_EF, GFNI_5B, GFNI_ID);
73 const __m512i MDS1 = _mm512_set_epi64(GFNI_ID, GFNI_5B, GFNI_EF, GFNI_EF, GFNI_ID, GFNI_5B, GFNI_EF, GFNI_EF);
74 const __m512i MDS2 = _mm512_set_epi64(GFNI_EF, GFNI_ID, GFNI_EF, GFNI_5B, GFNI_EF, GFNI_ID, GFNI_EF, GFNI_5B);
75 const __m512i MDS3 = _mm512_set_epi64(GFNI_5B, GFNI_EF, GFNI_ID, GFNI_5B, GFNI_5B, GFNI_EF, GFNI_ID, GFNI_5B);
76
77 const auto r0 = apply_mds(lookup_sbox<0>(W, QS), MDS0);
78 const auto r1 = apply_mds(lookup_sbox<1>(W, QS + 256), MDS1);
79 const auto r2 = apply_mds(lookup_sbox<2>(W, QS + 512), MDS2);
80 const auto r3 = apply_mds(lookup_sbox<3>(W, QS + 768), MDS3);
81
82 return (r0 ^ r1 ^ r2 ^ r3);
83}
84
85// NOLINTEND(portability-simd-intrinsics)
86
87BOTAN_FN_ISA_AVX512_GFNI
88BOTAN_FORCE_INLINE void twofish_encrypt_round(
89 SIMD_16x32 A, SIMD_16x32 B, SIMD_16x32& C, SIMD_16x32& D, uint32_t rk1, uint32_t rk2, const uint8_t* QS) {
90 SIMD_16x32 X = g_func(A, QS);
91 SIMD_16x32 Y = g_func(B.rotl<8>(), QS);
92
93 X += Y;
94 Y += X;
95
96 X += SIMD_16x32::splat(rk1);
97 Y += SIMD_16x32::splat(rk2);
98
99 C = (C ^ X).rotr<1>();
100 D = D.rotl<1>() ^ Y;
101}
102
103BOTAN_FN_ISA_AVX512_GFNI
104BOTAN_FORCE_INLINE void twofish_decrypt_round(
105 SIMD_16x32 A, SIMD_16x32 B, SIMD_16x32& C, SIMD_16x32& D, uint32_t rk1, uint32_t rk2, const uint8_t* QS) {
106 SIMD_16x32 X = g_func(A, QS);
107 SIMD_16x32 Y = g_func(B.rotl<8>(), QS);
108
109 X += Y;
110 Y += X;
111
112 X += SIMD_16x32::splat(rk1);
113 Y += SIMD_16x32::splat(rk2);
114
115 C = C.rotl<1>() ^ X;
116 D = (D ^ Y).rotr<1>();
117}
118
119} // namespace Twofish_AVX512
120
121} // namespace
122
123void BOTAN_FN_ISA_AVX512_GFNI Twofish::avx512_encrypt_16(const uint8_t in[16 * 16], uint8_t out[16 * 16]) const {
124 using namespace Twofish_AVX512;
125
126 SIMD_16x32 B0 = SIMD_16x32::load_le(in);
127 SIMD_16x32 B1 = SIMD_16x32::load_le(in + 64);
128 SIMD_16x32 B2 = SIMD_16x32::load_le(in + 128);
129 SIMD_16x32 B3 = SIMD_16x32::load_le(in + 192);
130
131 SIMD_16x32::transpose(B0, B1, B2, B3);
132
133 B0 ^= SIMD_16x32::splat(m_RK[0]);
134 B1 ^= SIMD_16x32::splat(m_RK[1]);
135 B2 ^= SIMD_16x32::splat(m_RK[2]);
136 B3 ^= SIMD_16x32::splat(m_RK[3]);
137
138 const uint8_t* QS = m_QS.data();
139
140 for(size_t k = 8; k != 40; k += 4) {
141 twofish_encrypt_round(B0, B1, B2, B3, m_RK[k], m_RK[k + 1], QS);
142 twofish_encrypt_round(B2, B3, B0, B1, m_RK[k + 2], m_RK[k + 3], QS);
143 }
144
145 B2 ^= SIMD_16x32::splat(m_RK[4]);
146 B3 ^= SIMD_16x32::splat(m_RK[5]);
147 B0 ^= SIMD_16x32::splat(m_RK[6]);
148 B1 ^= SIMD_16x32::splat(m_RK[7]);
149
150 SIMD_16x32::transpose(B2, B3, B0, B1);
151
152 B2.store_le(out);
153 B3.store_le(out + 64);
154 B0.store_le(out + 128);
155 B1.store_le(out + 192);
156
158}
159
160void BOTAN_FN_ISA_AVX512_GFNI Twofish::avx512_decrypt_16(const uint8_t in[16 * 16], uint8_t out[16 * 16]) const {
161 using namespace Twofish_AVX512;
162
163 SIMD_16x32 B0 = SIMD_16x32::load_le(in);
164 SIMD_16x32 B1 = SIMD_16x32::load_le(in + 64);
165 SIMD_16x32 B2 = SIMD_16x32::load_le(in + 128);
166 SIMD_16x32 B3 = SIMD_16x32::load_le(in + 192);
167
168 SIMD_16x32::transpose(B0, B1, B2, B3);
169
170 B0 ^= SIMD_16x32::splat(m_RK[4]);
171 B1 ^= SIMD_16x32::splat(m_RK[5]);
172 B2 ^= SIMD_16x32::splat(m_RK[6]);
173 B3 ^= SIMD_16x32::splat(m_RK[7]);
174
175 const uint8_t* QS = m_QS.data();
176
177 for(size_t k = 40; k != 8; k -= 4) {
178 twofish_decrypt_round(B0, B1, B2, B3, m_RK[k - 2], m_RK[k - 1], QS);
179 twofish_decrypt_round(B2, B3, B0, B1, m_RK[k - 4], m_RK[k - 3], QS);
180 }
181
182 B2 ^= SIMD_16x32::splat(m_RK[0]);
183 B3 ^= SIMD_16x32::splat(m_RK[1]);
184 B0 ^= SIMD_16x32::splat(m_RK[2]);
185 B1 ^= SIMD_16x32::splat(m_RK[3]);
186
187 SIMD_16x32::transpose(B2, B3, B0, B1);
188
189 B2.store_le(out);
190 B3.store_le(out + 64);
191 B0.store_le(out + 128);
192 B1.store_le(out + 192);
193
195}
196
197} // namespace Botan
static BOTAN_FN_ISA_AVX512 void transpose(SIMD_16x32 &B0, SIMD_16x32 &B1, SIMD_16x32 &B2, SIMD_16x32 &B3)
BOTAN_FN_ISA_AVX512 SIMD_16x32 rotl() const
Definition simd_avx512.h:77
static BOTAN_FN_ISA_AVX512 SIMD_16x32 load_le(const uint8_t *in)
Definition simd_avx512.h:63
static BOTAN_FN_ISA_AVX512 void zero_registers()
__m512i BOTAN_FN_ISA_AVX512 raw() const
static BOTAN_FN_ISA_AVX512 SIMD_16x32 splat(uint32_t B)
Definition simd_avx512.h:60
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
BOTAN_FORCE_INLINE constexpr T rotr(T input)
Definition rotate.h:35