7#include <botan/internal/sm4.h>
9#include <botan/internal/isa_extn.h>
/*
* Byte-index table for a 16-byte table lookup: entry i names the source byte
* that ends up at position i. Bytes 12..15 move to 0..3, 8..11 to 4..7 and so
* on, i.e. the four 32-bit words are reversed while the bytes inside each word
* keep their order.
*/
alignas(16) static const uint8_t qswap_tbl[16] = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};

/*
* Byte-index table that reverses all 16 bytes (entry i is 15 - i). This is the
* composition of the word reversal in qswap_tbl with a byte reversal inside
* each 32-bit word.
*/
alignas(16) static const uint8_t bswap_tbl[16] = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
20inline uint32x4_t qswap_32(uint32x4_t B) {
21 return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(qswap_tbl)));
24inline uint32x4_t bswap_32(uint32x4_t B) {
25 return vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(B)));
32inline uint32x4_t bqswap_32(uint32x4_t B) {
33 return vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(B), vld1q_u8(bswap_tbl)));
36inline void BOTAN_FN_ISA_SM4 SM4_E(uint32x4_t& B0, uint32x4_t& B1, uint32x4_t& B2, uint32x4_t& B3, uint32x4_t K) {
37 B0 = vsm4eq_u32(B0, K);
38 B1 = vsm4eq_u32(B1, K);
39 B2 = vsm4eq_u32(B2, K);
40 B3 = vsm4eq_u32(B3, K);
45void BOTAN_FN_ISA_SM4 SM4::sm4_armv8_encrypt(
const uint8_t input8[], uint8_t output8[],
size_t blocks)
const {
46 const uint32x4_t K0 = vld1q_u32(&m_RK[0]);
47 const uint32x4_t
K1 = vld1q_u32(&m_RK[4]);
48 const uint32x4_t
K2 = vld1q_u32(&m_RK[8]);
49 const uint32x4_t
K3 = vld1q_u32(&m_RK[12]);
50 const uint32x4_t
K4 = vld1q_u32(&m_RK[16]);
51 const uint32x4_t K5 = vld1q_u32(&m_RK[20]);
52 const uint32x4_t K6 = vld1q_u32(&m_RK[24]);
53 const uint32x4_t K7 = vld1q_u32(&m_RK[28]);
55 const uint32_t* input32 =
reinterpret_cast<const uint32_t*
>(
reinterpret_cast<const void*
>(input8));
56 uint32_t* output32 =
reinterpret_cast<uint32_t*
>(
reinterpret_cast<void*
>(output8));
59 uint32x4_t B0 = bswap_32(vld1q_u32(input32));
60 uint32x4_t B1 = bswap_32(vld1q_u32(input32 + 4));
61 uint32x4_t B2 = bswap_32(vld1q_u32(input32 + 8));
62 uint32x4_t B3 = bswap_32(vld1q_u32(input32 + 12));
64 SM4_E(B0, B1, B2, B3, K0);
65 SM4_E(B0, B1, B2, B3, K1);
66 SM4_E(B0, B1, B2, B3, K2);
67 SM4_E(B0, B1, B2, B3, K3);
68 SM4_E(B0, B1, B2, B3, K4);
69 SM4_E(B0, B1, B2, B3, K5);
70 SM4_E(B0, B1, B2, B3, K6);
71 SM4_E(B0, B1, B2, B3, K7);
73 vst1q_u32(output32, bqswap_32(B0));
74 vst1q_u32(output32 + 4, bqswap_32(B1));
75 vst1q_u32(output32 + 8, bqswap_32(B2));
76 vst1q_u32(output32 + 12, bqswap_32(B3));
83 for(
size_t i = 0; i != blocks; ++i) {
84 uint32x4_t B = bswap_32(vld1q_u32(input32));
86 B = vsm4eq_u32(B, K0);
87 B = vsm4eq_u32(B, K1);
88 B = vsm4eq_u32(B, K2);
89 B = vsm4eq_u32(B, K3);
90 B = vsm4eq_u32(B, K4);
91 B = vsm4eq_u32(B, K5);
92 B = vsm4eq_u32(B, K6);
93 B = vsm4eq_u32(B, K7);
95 vst1q_u32(output32, bqswap_32(B));
102void BOTAN_FN_ISA_SM4 SM4::sm4_armv8_decrypt(
const uint8_t input8[], uint8_t output8[],
size_t blocks)
const {
103 const uint32x4_t K0 = qswap_32(vld1q_u32(&m_RK[0]));
104 const uint32x4_t
K1 = qswap_32(vld1q_u32(&m_RK[4]));
105 const uint32x4_t
K2 = qswap_32(vld1q_u32(&m_RK[8]));
106 const uint32x4_t
K3 = qswap_32(vld1q_u32(&m_RK[12]));
107 const uint32x4_t
K4 = qswap_32(vld1q_u32(&m_RK[16]));
108 const uint32x4_t K5 = qswap_32(vld1q_u32(&m_RK[20]));
109 const uint32x4_t K6 = qswap_32(vld1q_u32(&m_RK[24]));
110 const uint32x4_t K7 = qswap_32(vld1q_u32(&m_RK[28]));
112 const uint32_t* input32 =
reinterpret_cast<const uint32_t*
>(
reinterpret_cast<const void*
>(input8));
113 uint32_t* output32 =
reinterpret_cast<uint32_t*
>(
reinterpret_cast<void*
>(output8));
116 uint32x4_t B0 = bswap_32(vld1q_u32(input32));
117 uint32x4_t B1 = bswap_32(vld1q_u32(input32 + 4));
118 uint32x4_t B2 = bswap_32(vld1q_u32(input32 + 8));
119 uint32x4_t B3 = bswap_32(vld1q_u32(input32 + 12));
121 SM4_E(B0, B1, B2, B3, K7);
122 SM4_E(B0, B1, B2, B3, K6);
123 SM4_E(B0, B1, B2, B3, K5);
124 SM4_E(B0, B1, B2, B3, K4);
125 SM4_E(B0, B1, B2, B3, K3);
126 SM4_E(B0, B1, B2, B3, K2);
127 SM4_E(B0, B1, B2, B3, K1);
128 SM4_E(B0, B1, B2, B3, K0);
130 vst1q_u32(output32, bqswap_32(B0));
131 vst1q_u32(output32 + 4, bqswap_32(B1));
132 vst1q_u32(output32 + 8, bqswap_32(B2));
133 vst1q_u32(output32 + 12, bqswap_32(B3));
140 for(
size_t i = 0; i != blocks; ++i) {
141 uint32x4_t B = bswap_32(vld1q_u32(input32));
143 B = vsm4eq_u32(B, K7);
144 B = vsm4eq_u32(B, K6);
145 B = vsm4eq_u32(B, K5);
146 B = vsm4eq_u32(B, K4);
147 B = vsm4eq_u32(B, K3);
148 B = vsm4eq_u32(B, K2);
149 B = vsm4eq_u32(B, K1);
150 B = vsm4eq_u32(B, K0);
152 vst1q_u32(output32, bqswap_32(B));