23 WhirlpoolState() : m_v(_mm512_setzero_si512()) {}
26 explicit WhirlpoolState(__m512i v) : m_v(v) {}
28 WhirlpoolState(
const WhirlpoolState& other) =
default;
29 WhirlpoolState(WhirlpoolState&& other) =
default;
30 WhirlpoolState& operator=(
const WhirlpoolState& other) =
default;
31 WhirlpoolState& operator=(WhirlpoolState&& other) =
default;
32 ~WhirlpoolState() =
default;
36 static WhirlpoolState load_bytes(
const uint8_t src[64]) {
return WhirlpoolState(_mm512_loadu_si512(src)); }
39 static WhirlpoolState
load_be(
const uint64_t src[8]) {
return WhirlpoolState(_mm512_loadu_si512(src)).bswap(); }
42 void store_be(uint64_t dst[8])
const { _mm512_storeu_si512(dst, bswap().m_v); }
45 inline friend WhirlpoolState
operator^(WhirlpoolState a, WhirlpoolState b) {
46 return WhirlpoolState(_mm512_xor_si512(a.m_v, b.m_v));
50 inline WhirlpoolState&
operator^=(WhirlpoolState other) {
51 m_v = _mm512_xor_si512(m_v, other.m_v);
60 inline WhirlpoolState sub_bytes()
const {
62 _mm512_broadcast_i32x4(_mm_setr_epi8(1, 11, 9, 12, 13, 6, 15, 3, 14, 8, 7, 4, 10, 2, 5, 0));
64 _mm512_broadcast_i32x4(_mm_setr_epi8(15, 0, 13, 7, 11, 14, 5, 10, 9, 2, 12, 1, 3, 4, 8, 6));
66 _mm512_broadcast_i32x4(_mm_setr_epi8(7, 12, 11, 13, 14, 4, 9, 15, 6, 3, 8, 10, 2, 5, 1, 0));
68 const __m512i lo_mask = _mm512_set1_epi8(0x0F);
70 const __m512i lo_nib = _mm512_and_si512(m_v, lo_mask);
71 const __m512i hi_nib = _mm512_and_si512(_mm512_srli_epi16(m_v, 4), lo_mask);
74 const __m512i L = _mm512_shuffle_epi8(Ebox, hi_nib);
75 const __m512i R = _mm512_shuffle_epi8(Eibox, lo_nib);
76 const __m512i T = _mm512_shuffle_epi8(Rbox, _mm512_xor_si512(L, R));
79 const __m512i out_hi = _mm512_shuffle_epi8(Ebox, _mm512_xor_si512(L, T));
80 const __m512i out_lo = _mm512_shuffle_epi8(Eibox, _mm512_xor_si512(R, T));
82 return WhirlpoolState(_mm512_or_si512(_mm512_slli_epi16(out_hi, 4), _mm512_and_si512(out_lo, lo_mask)));
92 inline WhirlpoolState shift_columns()
const {
95 alignas(64)
static constexpr uint8_t perm[64] = {
97 0*8+0, 7*8+1, 6*8+2, 5*8+3, 4*8+4, 3*8+5, 2*8+6, 1*8+7,
98 1*8+0, 0*8+1, 7*8+2, 6*8+3, 5*8+4, 4*8+5, 3*8+6, 2*8+7,
99 2*8+0, 1*8+1, 0*8+2, 7*8+3, 6*8+4, 5*8+5, 4*8+6, 3*8+7,
100 3*8+0, 2*8+1, 1*8+2, 0*8+3, 7*8+4, 6*8+5, 5*8+6, 4*8+7,
101 4*8+0, 3*8+1, 2*8+2, 1*8+3, 0*8+4, 7*8+5, 6*8+6, 5*8+7,
102 5*8+0, 4*8+1, 3*8+2, 2*8+3, 1*8+4, 0*8+5, 7*8+6, 6*8+7,
103 6*8+0, 5*8+1, 4*8+2, 3*8+3, 2*8+4, 1*8+5, 0*8+6, 7*8+7,
104 7*8+0, 6*8+1, 5*8+2, 4*8+3, 3*8+4, 2*8+5, 1*8+6, 0*8+7,
107 return WhirlpoolState(_mm512_permutexvar_epi8(_mm512_load_si512(perm), m_v));
117 inline WhirlpoolState mix_rows()
const {
126 _mm512_broadcast_i32x4(_mm_setr_epi8(7, 0, 1, 2, 3, 4, 5, 6, 15, 8, 9, 10, 11, 12, 13, 14));
128 _mm512_broadcast_i32x4(_mm_setr_epi8(6, 7, 0, 1, 2, 3, 4, 5, 14, 15, 8, 9, 10, 11, 12, 13));
130 _mm512_broadcast_i32x4(_mm_setr_epi8(5, 6, 7, 0, 1, 2, 3, 4, 13, 14, 15, 8, 9, 10, 11, 12));
132 _mm512_broadcast_i32x4(_mm_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11));
134 _mm512_broadcast_i32x4(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
136 _mm512_broadcast_i32x4(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
138 _mm512_broadcast_i32x4(_mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15, 8));
140 const __m512i x2 = xtime(m_v);
141 const __m512i x4 = xtime(x2);
142 const __m512i x8 = xtime(x4);
143 const __m512i x5 = _mm512_xor_si512(x4, m_v);
144 const __m512i x9 = _mm512_xor_si512(x8, m_v);
146 const __m512i t01 = _mm512_xor_si512(m_v, _mm512_shuffle_epi8(m_v, rot1));
147 const __m512i t23 = _mm512_xor_si512(_mm512_shuffle_epi8(x4, rot2), _mm512_shuffle_epi8(m_v, rot3));
148 const __m512i t45 = _mm512_xor_si512(_mm512_shuffle_epi8(x8, rot4), _mm512_shuffle_epi8(x5, rot5));
149 const __m512i t67 = _mm512_xor_si512(_mm512_shuffle_epi8(x2, rot6), _mm512_shuffle_epi8(x9, rot7));
151 return WhirlpoolState(_mm512_xor_si512(_mm512_xor_si512(t01, t23), _mm512_xor_si512(t45, t67)));
158 inline WhirlpoolState round()
const {
return sub_bytes().shift_columns().mix_rows(); }
162 static inline WhirlpoolState rc(uint64_t v) {
return WhirlpoolState(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, v)); }
166 WhirlpoolState bswap()
const {
167 const __m512i tbl = _mm512_broadcast_i32x4(_mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7));
169 return WhirlpoolState(_mm512_shuffle_epi8(m_v, tbl));
// Doubles every byte of `a` independently: each byte is shifted left by one
// bit and, where its original top bit was set, XORed with the constant 0x1D.
// This has the shape of multiplication by x in GF(2^8) with 0x1D as the
// reduction constant.
174 static __m512i xtime(__m512i a) {
175 const __m512i poly = _mm512_set1_epi8(0x1D);
// One mask bit per byte, taken from that byte's most-significant bit.
176 const __mmask64 top_bits = _mm512_movepi8_mask(a);
// Byte-wise a + a is a left shift by one with the carry-out discarded.
177 const __m512i shifted = _mm512_add_epi8(a, a);
// Bytes whose mask bit is set take shifted ^ poly; the rest keep shifted.
178 return _mm512_mask_blend_epi8(top_bits, shifted, _mm512_xor_si512(shifted, poly));