23 WhirlpoolState() : m_lo(_mm256_setzero_si256()), m_hi(_mm256_setzero_si256()) {}
26 WhirlpoolState(__m256i lo, __m256i hi) : m_lo(lo), m_hi(hi) {}
28 WhirlpoolState(
const WhirlpoolState& other) =
default;
29 WhirlpoolState(WhirlpoolState&& other) =
default;
30 WhirlpoolState& operator=(
const WhirlpoolState& other) =
default;
31 WhirlpoolState& operator=(WhirlpoolState&& other) =
default;
32 ~WhirlpoolState() =
default;
35 static WhirlpoolState load_bytes(
const uint8_t src[64]) {
36 return WhirlpoolState(_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src)),
37 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + 32)));
41 static WhirlpoolState
load_be(
const uint64_t src[8]) {
42 return WhirlpoolState(_mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src)),
43 _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(src + 4)))
48 void store_be(uint64_t dst[8])
const {
50 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst), s.m_lo);
51 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(dst + 4), s.m_hi);
55 inline friend WhirlpoolState
operator^(WhirlpoolState a, WhirlpoolState b) {
56 return WhirlpoolState(_mm256_xor_si256(a.m_lo, b.m_lo), _mm256_xor_si256(a.m_hi, b.m_hi));
60 inline friend WhirlpoolState
operator^(WhirlpoolState a, uint64_t rc) {
61 return WhirlpoolState(_mm256_xor_si256(a.m_lo, _mm256_set_epi64x(0, 0, 0, rc)), a.m_hi);
65 inline WhirlpoolState&
operator^=(WhirlpoolState other) {
66 m_lo = _mm256_xor_si256(m_lo, other.m_lo);
67 m_hi = _mm256_xor_si256(m_hi, other.m_hi);
72 inline WhirlpoolState sub_bytes()
const {
return WhirlpoolState(sub_bytes(m_lo), sub_bytes(m_hi)); }
75 inline WhirlpoolState shift_columns()
const {
82 constexpr char non = -1;
84 const auto sc0 = _mm_setr_epi8(0x0, non, non, non, non, non, non, 0xF, 0x8, 0x1, non, non, non, non, non, non);
85 const auto sc1 = _mm_setr_epi8(non, 0x9, 0x2, non, non, non, non, non, non, non, 0xA, 0x3, non, non, non, non);
86 const auto sc2 = _mm_setr_epi8(non, non, non, 0xB, 0x4, non, non, non, non, non, non, non, 0xC, 0x5, non, non);
87 const auto sc3 = _mm_setr_epi8(non, non, non, non, non, 0xD, 0x6, non, non, non, non, non, non, non, 0xE, 0x7);
89 const auto idx_same_lane = _mm256_broadcastsi128_si256(sc0);
90 const auto idx_other_half = _mm256_broadcastsi128_si256(sc2);
91 const auto idx_other_lane = _mm256_set_m128i(sc1, sc3);
92 const auto idx_other_both = _mm256_set_m128i(sc3, sc1);
95 const auto r_lo = _mm256_permute2x128_si256(m_lo, m_lo, 0x01);
96 const auto r_hi = _mm256_permute2x128_si256(m_hi, m_hi, 0x01);
103 __m256i new_lo = _mm256_shuffle_epi8(m_lo, idx_same_lane);
104 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(r_lo, idx_other_lane));
105 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(m_hi, idx_other_half));
106 new_lo = _mm256_or_si256(new_lo, _mm256_shuffle_epi8(r_hi, idx_other_both));
109 __m256i new_hi = _mm256_shuffle_epi8(m_hi, idx_same_lane);
110 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(r_hi, idx_other_lane));
111 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(m_lo, idx_other_half));
112 new_hi = _mm256_or_si256(new_hi, _mm256_shuffle_epi8(r_lo, idx_other_both));
114 return WhirlpoolState(new_lo, new_hi);
118 BOTAN_FORCE_INLINE WhirlpoolState mix_rows()
const {
return WhirlpoolState(mix_rows(m_lo), mix_rows(m_hi)); }
121 BOTAN_FORCE_INLINE WhirlpoolState round()
const {
return sub_bytes().shift_columns().mix_rows(); }
// Apply the Whirlpool S-box to every byte of `v`.
//
// The 8-bit S-box is evaluated through its decomposition into three
// 4-bit mini-boxes (E, E-inverse and R), each small enough for a
// 16-entry in-register pshufb lookup: look up the high nibble in E and
// the low nibble in E^-1, mix the two results through R, then fold the
// R output back into each side and recombine the nibbles.
//
// NOTE(review): the signature and the `const auto ... =` fragments of
// this helper were lost in extraction and reconstructed; the names
// Ebox/Eibox/Rbox are taken from their uses below — verify against the
// original file.
static __m256i sub_bytes(__m256i v) {
   const auto Ebox =
      _mm256_broadcastsi128_si256(_mm_setr_epi8(1, 11, 9, 12, 13, 6, 15, 3, 14, 8, 7, 4, 10, 2, 5, 0));
   const auto Eibox =
      _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 0, 13, 7, 11, 14, 5, 10, 9, 2, 12, 1, 3, 4, 8, 6));
   const auto Rbox =
      _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 12, 11, 13, 14, 4, 9, 15, 6, 3, 8, 10, 2, 5, 1, 0));

   const auto lo_mask = _mm256_set1_epi8(0x0F);

   // Split each byte into its low and high nibble (pshufb indexes by
   // the low 4 bits, so both must be masked into 0..15).
   const auto lo_nib = _mm256_and_si256(v, lo_mask);
   const auto hi_nib = _mm256_and_si256(_mm256_srli_epi16(v, 4), lo_mask);

   const auto L = _mm256_shuffle_epi8(Ebox, hi_nib);
   const auto R = _mm256_shuffle_epi8(Eibox, lo_nib);
   const auto T = _mm256_shuffle_epi8(Rbox, _mm256_xor_si256(L, R));

   const auto out_hi = _mm256_shuffle_epi8(Ebox, _mm256_xor_si256(L, T));
   const auto out_lo = _mm256_shuffle_epi8(Eibox, _mm256_xor_si256(R, T));

   // Reassemble each output byte from its two substituted nibbles.
   return _mm256_or_si256(_mm256_slli_epi16(out_hi, 4), out_lo);
}
150 _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 0, 1, 2, 3, 4, 5, 6, 15, 8, 9, 10, 11, 12, 13, 14));
152 _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 0, 1, 2, 3, 4, 5, 14, 15, 8, 9, 10, 11, 12, 13));
154 _mm256_broadcastsi128_si256(_mm_setr_epi8(5, 6, 7, 0, 1, 2, 3, 4, 13, 14, 15, 8, 9, 10, 11, 12));
156 _mm256_broadcastsi128_si256(_mm_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11));
158 _mm256_broadcastsi128_si256(_mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10));
160 _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9));
162 _mm256_broadcastsi128_si256(_mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 0, 9, 10, 11, 12, 13, 14, 15, 8));
164 const auto x2 = xtime(v);
165 const auto x4 = xtime(x2);
166 const auto x8 = xtime(x4);
167 const auto x5 = _mm256_xor_si256(x4, v);
168 const auto x9 = _mm256_xor_si256(x8, v);
170 const auto t01 = _mm256_xor_si256(v, _mm256_shuffle_epi8(v, rot1));
171 const auto t23 = _mm256_xor_si256(_mm256_shuffle_epi8(x4, rot2), _mm256_shuffle_epi8(v, rot3));
172 const auto t45 = _mm256_xor_si256(_mm256_shuffle_epi8(x8, rot4), _mm256_shuffle_epi8(x5, rot5));
173 const auto t67 = _mm256_xor_si256(_mm256_shuffle_epi8(x2, rot6), _mm256_shuffle_epi8(x9, rot7));
175 return _mm256_xor_si256(_mm256_xor_si256(t01, t23), _mm256_xor_si256(t45, t67));
179 WhirlpoolState bswap()
const {
182 _mm256_broadcastsi128_si256(_mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8));
184 return WhirlpoolState(_mm256_shuffle_epi8(m_lo, tbl), _mm256_shuffle_epi8(m_hi, tbl));
188 static __m256i xtime(__m256i a) {
189 const auto poly = _mm256_set1_epi8(0x1D);
190 const auto shifted = _mm256_add_epi8(a, a);
192 return _mm256_blendv_epi8(shifted, _mm256_xor_si256(shifted, poly), a);