// Builds a 256-entry byte table from four 16-entry tables (t0..t3).
// Declared consteval, so it can only be evaluated at compile time.
// For each index x the two nibbles of x go through two rounds of
// "mix, then substitute" and are packed back into one byte.
// NOTE(review): the name suggests this is Twofish's q permutation; the
// end of the function (loop close / return) is not visible in this excerpt.
22consteval std::array<uint8_t, 256> twofish_q_perm(std::array<uint8_t, 16> t0,
23 std::array<uint8_t, 16> t1,
24 std::array<uint8_t, 16> t2,
25 std::array<uint8_t, 16> t3)
noexcept {
26 std::array<uint8_t, 256> Q = {};
27 for(
size_t x = 0; x != 256; ++x) {
// Split x into its high nibble (a0) and low nibble (b0).
28 const uint8_t a0 =
static_cast<uint8_t
>((x >> 4) & 0x0F);
29 const uint8_t b0 =
static_cast<uint8_t
>(x & 0x0F);
// First mix: a1 is the XOR of both nibbles; b1 XORs a0 with b0 rotated
// right by one bit within 4 bits and with (8 * a0) reduced to 4 bits.
31 const uint8_t a1 = a0 ^ b0;
32 const uint8_t b1 = a0 ^ ((b0 >> 1) | ((b0 & 1) << 3)) ^ ((8 * a0) & 0x0F);
// First substitution through t0 / t1.
34 const uint8_t a2 = t0[a1];
35 const uint8_t b2 = t1[b1];
// Second mix, same shape as the first. The rotate expression is a true
// 4-bit rotate only if the table entries are below 16 (as in the tables
// passed for Q0/Q1) - TODO confirm for any other caller.
37 const uint8_t a3 = a2 ^ b2;
38 const uint8_t b3 = a2 ^ ((b2 >> 1) | ((b2 & 1) << 3)) ^ ((8 * a2) & 0x0F);
// Second substitution through t2 / t3.
40 const uint8_t a4 = t2[a3];
41 const uint8_t b4 = t3[b3];
// Pack the result: b4 becomes the high nibble, a4 the low nibble.
43 Q[x] =
static_cast<uint8_t
>((b4 << 4) | a4);
// Q0: 256-byte lookup table, aligned to 256 bytes, produced at compile
// time by twofish_q_perm from the four 16-entry tables listed below
// (passed in order as t0, t1, t2, t3).
49alignas(256)
constexpr auto Q0 = twofish_q_perm(
50 {8, 1, 7, 13, 6, 15, 3, 2, 0, 11, 5, 9, 14, 12, 10, 4},
51 {14, 12, 11, 8, 1, 2, 3, 5, 15, 4, 10, 6, 7, 0, 9, 13},
52 {11, 10, 5, 14, 6, 13, 9, 0, 12, 8, 15, 3, 2, 4, 7, 1},
53 {13, 7, 15, 4, 1, 2, 6, 14, 9, 11, 3, 0, 8, 5, 12, 10});
// Q1: second 256-byte lookup table, aligned to 256 bytes, produced at
// compile time by twofish_q_perm from its own set of four 16-entry tables
// (passed in order as t0, t1, t2, t3).
55alignas(256)
constexpr auto Q1 = twofish_q_perm(
56 {2, 8, 11, 13, 15, 7, 6, 14, 3, 1, 9, 4, 0, 10, 12, 5},
57 {1, 14, 2, 11, 4, 12, 3, 7, 6, 13, 10, 5, 15, 9, 0, 8},
58 {4, 12, 7, 5, 1, 6, 9, 10, 0, 14, 13, 8, 2, 11, 3, 15},
59 {11, 9, 5, 1, 12, 3, 13, 14, 6, 4, 7, 15, 2, 0, 8, 10});
// Byte-to-byte helper called by mds0..mds3 to derive q_div_x from q.
// NOTE(review): the body is not visible in this excerpt; presumably this
// divides q by x in the field used for the MDS matrix - TODO confirm.
84inline uint8_t mds_div_x(uint8_t q) {
// Derives three bytes from q via mds_div_x:
//   q_div_x = mds_div_x(q)
//   q5b     = q ^ mds_div_x(q_div_x)
//   qef     = q5b ^ q_div_x
// NOTE(review): the names suggest q multiplied by 0x5B and 0xEF; the
// statement that packs these into the uint32_t result is not visible here.
88inline uint32_t mds0(uint8_t q) {
89 const uint8_t q_div_x = mds_div_x(q);
90 const uint8_t q5b = q ^ mds_div_x(q_div_x);
91 const uint8_t qef = q5b ^ q_div_x;
// Computes the same three intermediate bytes as its sibling mdsN helpers:
// q_div_x (one mds_div_x step), q5b (q XOR two mds_div_x steps) and
// qef (q5b XOR q_div_x).
// NOTE(review): how they are arranged into the uint32_t return value is
// not visible in this excerpt - presumably a different byte order per
// helper; confirm against the full file.
95inline uint32_t mds1(uint8_t q) {
96 const uint8_t q_div_x = mds_div_x(q);
97 const uint8_t q5b = q ^ mds_div_x(q_div_x);
98 const uint8_t qef = q5b ^ q_div_x;
// Intermediate bytes for this helper, all derived from q:
//   q_div_x - mds_div_x applied once
//   q5b     - q XOR mds_div_x applied twice
//   qef     - q5b XOR q_div_x
// NOTE(review): the return statement is outside this excerpt, so the
// layout of the uint32_t result cannot be documented from here.
102inline uint32_t mds2(uint8_t q) {
103 const uint8_t q_div_x = mds_div_x(q);
104 const uint8_t q5b = q ^ mds_div_x(q_div_x);
105 const uint8_t qef = q5b ^ q_div_x;
// Prepares q_div_x, q5b and qef from q using mds_div_x and XOR, exactly
// as the visible lines of mds0..mds2 do.
// NOTE(review): the part that combines them into the returned uint32_t
// is missing from this excerpt - verify the byte placement in the full file.
109inline uint32_t mds3(uint8_t q) {
110 const uint8_t q_div_x = mds_div_x(q);
111 const uint8_t q5b = q ^ mds_div_x(q_div_x);
112 const uint8_t qef = q5b ^ q_div_x;
// Shift-and-add style multiply of rs by k over 8 iterations, with rs
// handled as four independent byte lanes inside one uint32_t.
// NOTE(review): presumably GF(2^8) multiplication for the key schedule's
// RS code with reduction constant 0x4D - confirm. The declarations of
// r and k_lo, and the return, are not visible in this excerpt.
117inline uint32_t gf_mul_rs32(uint32_t rs, uint8_t k) {
// Bit 0 of every byte lane.
118 constexpr uint32_t lo_bit = 0x01010101;
// Low 7 bits of every byte lane, so a left shift cannot spill into the next lane.
119 constexpr uint32_t mask = 0x7F7F7F7F;
// Constant XORed into a lane whose top bit was set before doubling.
120 constexpr uint32_t poly = 0x4D;
123 for(
size_t i = 0; i != 8; ++i) {
// Accumulate rs into r when k_lo is set. NOTE(review): k_lo is defined
// outside this excerpt; if_set_return presumably yields rs or 0 - confirm.
125 r ^= k_lo.if_set_return(rs);
// Double every lane: shift the low 7 bits left by one, then XOR poly
// into each lane whose top bit was set.
126 rs = ((rs & mask) << 1) ^ (((rs >> 7) & lo_bit) * poly);
// Two-lane encryption path: lane 0 uses A0..D0, lane 1 uses A1..D1.
// NOTE(review): the enclosing function header and the steps between
// load, loop and store are not visible in this excerpt.
// Load eight values from `in` into the two lanes.
181 load_le(in, A0, B0, C0, D0, A1, B1, C1, D1);
// k runs 8, 12, ..., 36 (8 iterations); each iteration uses four
// consecutive m_RK entries and applies TF_E twice per lane.
192 for(
size_t k = 8; k != 40; k += 4) {
// First TF_E with m_RK[k], m_RK[k + 1]; both lanes share the same keys.
193 TF_E(A0, B0, C0, D0, m_RK[k + 0], m_RK[k + 1], m_SB);
194 TF_E(A1, B1, C1, D1, m_RK[k + 0], m_RK[k + 1], m_SB);
// Second TF_E with the (A,B) and (C,D) pairs swapped, using m_RK[k + 2], m_RK[k + 3].
196 TF_E(C0, D0, A0, B0, m_RK[k + 2], m_RK[k + 3], m_SB);
197 TF_E(C1, D1, A1, B1, m_RK[k + 2], m_RK[k + 3], m_SB);
// Store each lane in C, D, A, B order (the halves are swapped relative to the load).
209 store_le(out, C0, D0, A0, B0, C1, D1, A1, B1);
// Single-lane encryption loop over A, B, C, D: k runs 8, 12, ..., 36
// (8 iterations), with two TF_E applications per iteration.
// NOTE(review): surrounding load/store code is not visible in this excerpt.
228 for(
size_t k = 8; k != 40; k += 4) {
// First TF_E uses m_RK[k], m_RK[k + 1]; the second swaps the (A,B) and
// (C,D) pairs and uses m_RK[k + 2], m_RK[k + 3].
229 TF_E(A, B, C, D, m_RK[k], m_RK[k + 1], m_SB);
230 TF_E(C, D, A, B, m_RK[k + 2], m_RK[k + 3], m_SB);
// Two-lane decryption path: lane 0 uses A0..D0, lane 1 uses A1..D1.
// NOTE(review): the enclosing function header and the steps between
// load, loop and store are not visible in this excerpt.
// Load eight values from `in` into the two lanes.
257 load_le(in, A0, B0, C0, D0, A1, B1, C1, D1);
// k runs 40, 36, ..., 12 (8 iterations), so m_RK entries 39 down to 8
// are consumed in descending order.
268 for(
size_t k = 40; k != 8; k -= 4) {
// First TF_D with m_RK[k - 2], m_RK[k - 1]; both lanes share the same keys.
269 TF_D(A0, B0, C0, D0, m_RK[k - 2], m_RK[k - 1], m_SB);
270 TF_D(A1, B1, C1, D1, m_RK[k - 2], m_RK[k - 1], m_SB);
// Second TF_D with the (A,B) and (C,D) pairs swapped, using m_RK[k - 4], m_RK[k - 3].
272 TF_D(C0, D0, A0, B0, m_RK[k - 4], m_RK[k - 3], m_SB);
273 TF_D(C1, D1, A1, B1, m_RK[k - 4], m_RK[k - 3], m_SB);
// Store each lane in C, D, A, B order (the halves are swapped relative to the load).
285 store_le(out, C0, D0, A0, B0, C1, D1, A1, B1);
// Single-lane decryption loop over A, B, C, D: k runs 40, 36, ..., 12
// (8 iterations), with two TF_D applications per iteration.
// NOTE(review): surrounding load/store code and the loop's closing brace
// are not visible in this excerpt.
304 for(
size_t k = 40; k != 8; k -= 4) {
// First TF_D uses m_RK[k - 2], m_RK[k - 1]; the second swaps the (A,B)
// and (C,D) pairs and uses m_RK[k - 4], m_RK[k - 3].
305 TF_D(A, B, C, D, m_RK[k - 2], m_RK[k - 1], m_SB);
306 TF_D(C, D, A, B, m_RK[k - 4], m_RK[k - 3], m_SB);