7#include <botan/internal/poly1305.h>
9#include <botan/internal/isa_extn.h>
18class SIMD_8x44 final {
20 BOTAN_FN_ISA_AVX512 SIMD_8x44() : m_v(_mm512_setzero_si512()) {}
22 static BOTAN_FN_ISA_AVX512 SIMD_8x44 splat(uint64_t x) {
return SIMD_8x44(_mm512_set1_epi64(x)); }
24 static BOTAN_FN_ISA_AVX512 SIMD_8x44 load(
const void* p) {
25 return SIMD_8x44(_mm512_loadu_si512(
reinterpret_cast<const __m512i*
>(p)));
28 BOTAN_FN_ISA_AVX512 SIMD_8x44(
int e7,
int e6,
int e5,
int e4,
int e3,
int e2,
int e1,
int e0) :
29 SIMD_8x44(_mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0)) {}
32 static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute2(
const SIMD_8x44& idx,
const SIMD_8x44& a,
const SIMD_8x44& b) {
33 return SIMD_8x44(_mm512_permutex2var_epi64(a.m_v, idx.m_v, b.m_v));
36 static BOTAN_FN_ISA_AVX512 SIMD_8x44 permute3(
37 const SIMD_8x44& idx0,
const SIMD_8x44& idx1,
const SIMD_8x44& a,
const SIMD_8x44& b,
const SIMD_8x44& c) {
38 return SIMD_8x44::permute2(idx1, SIMD_8x44::permute2(idx0, a, b), c);
43 static BOTAN_FN_ISA_AVX512 SIMD_8x44 shrdi(
const SIMD_8x44& a,
const SIMD_8x44& b) {
44 return SIMD_8x44(_mm512_shrdi_epi64(a.m_v, b.m_v, COUNT));
47 BOTAN_FN_ISA_AVX512 SIMD_8x44 add_lane_zero(uint64_t b) {
48 return SIMD_8x44(_mm512_mask_add_epi64(m_v, 0x01, m_v, _mm512_set1_epi64(b)));
52 BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_lo(
const SIMD_8x44& a,
const SIMD_8x44& b) {
53 m_v = _mm512_madd52lo_epu64(m_v, a.m_v, b.m_v);
58 BOTAN_FN_ISA_AVX512 SIMD_8x44& ifma_hi(
const SIMD_8x44& a,
const SIMD_8x44& b) {
59 m_v = _mm512_madd52hi_epu64(m_v, a.m_v, b.m_v);
64 BOTAN_FN_ISA_AVX512 SIMD_8x44 mul_20()
const {
65 return SIMD_8x44(_mm512_add_epi64(_mm512_slli_epi64(m_v, 4), _mm512_slli_epi64(m_v, 2)));
69 BOTAN_FN_ISA_AVX512 SIMD_8x44 shr()
const {
70 return SIMD_8x44(_mm512_srli_epi64(m_v, S));
73 BOTAN_FN_ISA_AVX512 uint64_t horizontal_add()
const {
return _mm512_reduce_add_epi64(m_v); }
75 BOTAN_FN_ISA_AVX512 SIMD_8x44
operator&(
const SIMD_8x44& other)
const {
76 return SIMD_8x44(_mm512_and_si512(m_v, other.m_v));
79 BOTAN_FN_ISA_AVX512 SIMD_8x44
operator|(
const SIMD_8x44& other)
const {
80 return SIMD_8x44(_mm512_or_si512(m_v, other.m_v));
83 static BOTAN_FN_ISA_AVX512
void interleave_3x8(SIMD_8x44& r0, SIMD_8x44& r1, SIMD_8x44& r2) {
84 const auto idx1_z0 = SIMD_8x44(0, 3, 6, 9, 12, 15, -1, -1);
85 const auto idx2_z0 = SIMD_8x44(7, 6, 5, 4, 3, 2, 10, 13);
86 const auto idx1_z1 = SIMD_8x44(1, 4, 7, 10, 13, -1, -1, -1);
87 const auto idx2_z1 = SIMD_8x44(7, 6, 5, 4, 3, 8, 11, 14);
88 const auto idx1_z2 = SIMD_8x44(2, 5, 8, 11, 14, -1, -1, -1);
89 const auto idx2_z2 = SIMD_8x44(7, 6, 5, 4, 3, 9, 12, 15);
92 auto z0 = SIMD_8x44::permute3(idx1_z0, idx2_z0, r0, r1, r2);
93 auto z1 = SIMD_8x44::permute3(idx1_z1, idx2_z1, r0, r1, r2);
94 auto z2 = SIMD_8x44::permute3(idx1_z2, idx2_z2, r0, r1, r2);
103 __m512i BOTAN_FN_ISA_AVX512 raw()
const {
return m_v; }
105 explicit BOTAN_FN_ISA_AVX512 SIMD_8x44(__m512i v) : m_v(v) {}
// Limb masks for the radix-2^44 representation of Poly1305 accumulators:
// limbs 0 and 1 carry 44 bits, the top limb carries 42 bits (3*44 = 132 >= 130).
constexpr uint64_t M44 = (static_cast<uint64_t>(1) << 44) - 1;
constexpr uint64_t M42 = (static_cast<uint64_t>(1) << 42) - 1;
// The 2^128 padding bit appended to each full 16-byte block lands at
// bit 40 of the top limb (128 - 2*44 = 40).
constexpr uint64_t hibit64 = static_cast<uint64_t>(1) << 40;
// Vectorized Poly1305: processes 8 message blocks (128 bytes) at once using
// AVX-512 IFMA 52-bit multiply-accumulate, with the accumulator and key
// powers held as radix-2^44 limbs.
// NOTE(review): the enclosing function signature and its `while(blocks >= 8)`
// loop lines are not visible in this chunk; `m`, `blocks`, `X`, and
// `h0/h1/h2` are supplied by the surrounding (unseen) code -- confirm there.
129 const size_t original_blocks = blocks;
// Load three limb planes of the precomputed key powers starting at X[5].
// Presumably X[5..28] holds powers r^8..r^1 laid out block-major -- TODO
// confirm against the code that fills X.
136 SIMD_8x44 r0 = SIMD_8x44::load(&X[5]);
137 SIMD_8x44 r1 = SIMD_8x44::load(&X[5 + 8]);
138 SIMD_8x44 r2 = SIMD_8x44::load(&X[5 + 2 * 8]);
// Transpose so r0/r1/r2 each hold one limb position of all eight powers
139 SIMD_8x44::interleave_3x8(r0, r1, r2);
// Precompute 20*r limbs: 2^130 = 5 (mod 2^130-5) and the limbs span
// 3*44 = 132 bits, so the wrap-around factor is 5 * 2^(132-130) = 20.
141 const auto s1 = r1.mul_20();
142 const auto s2 = r2.mul_20();
// Lane-gather indices: even/odd 64-bit words of two 64-byte loads, i.e.
// the low and high halves of eight 16-byte message blocks.
147 const auto idx_lo = SIMD_8x44(14, 12, 10, 8, 6, 4, 2, 0);
148 const auto idx_hi = SIMD_8x44(15, 13, 11, 9, 7, 5, 3, 1);
149 const auto mask44 = SIMD_8x44::splat(M44);
150 const auto mask42 = SIMD_8x44::splat(M42);
151 const auto hibit = SIMD_8x44::splat(hibit64);
// Load 128 bytes of message (8 blocks)
155 const auto data0 = SIMD_8x44::load(m);
156 const auto data1 = SIMD_8x44::load(m + 64);
// t0 = low 64 bits of each block, t1 = high 64 bits of each block
159 const auto t0 = SIMD_8x44::permute2(idx_lo, data0, data1);
160 const auto t1 = SIMD_8x44::permute2(idx_hi, data0, data1);
// Split each 128-bit block into 44/44/42-bit limbs; `| hibit` sets the
// 2^128 padding bit required for full blocks.
166 auto m0 = t0 & mask44;
167 auto m1 = SIMD_8x44::shrdi<44>(t0, t1) & mask44;
168 auto m2 = (t1.shr<24>() & mask42) | hibit;
// Fold the running accumulator h into the first block only (lane 0);
// the power schedule in r/s accounts for the per-lane block positions.
171 m0 = m0.add_lane_zero(h0);
172 m1 = m1.add_lane_zero(h1);
173 m2 = m2.add_lane_zero(h2);
// Schoolbook 3x3 limb product mod 2^130-5: terms that wrap past the top
// limb use the pre-scaled s1/s2 (= 20*r1, 20*r2) instead of r1/r2.
// d0 = m0*r0 + m1*s2 + m2*s1
176 const SIMD_8x44 d0_lo = SIMD_8x44().ifma_lo(m0, r0).ifma_lo(m1, s2).ifma_lo(m2, s1);
177 const SIMD_8x44 d0_hi = SIMD_8x44().ifma_hi(m0, r0).ifma_hi(m1, s2).ifma_hi(m2, s1);
// d1 = m0*r1 + m1*r0 + m2*s2
180 const SIMD_8x44 d1_lo = SIMD_8x44().ifma_lo(m0, r1).ifma_lo(m1, r0).ifma_lo(m2, s2);
181 const SIMD_8x44 d1_hi = SIMD_8x44().ifma_hi(m0, r1).ifma_hi(m1, r0).ifma_hi(m2, s2);
// d2 = m0*r2 + m1*r1 + m2*r0
184 const SIMD_8x44 d2_lo = SIMD_8x44().ifma_lo(m0, r2).ifma_lo(m1, r1).ifma_lo(m2, r0);
185 const SIMD_8x44 d2_hi = SIMD_8x44().ifma_hi(m0, r2).ifma_hi(m1, r1).ifma_hi(m2, r0);
// Reduce the eight lanes of each limb to scalars
188 const uint64_t sum0_lo = d0_lo.horizontal_add();
189 const uint64_t sum0_hi = d0_hi.horizontal_add();
190 uint64_t sum1_lo = d1_lo.horizontal_add();
191 const uint64_t sum1_hi = d1_hi.horizontal_add();
192 uint64_t sum2_lo = d2_lo.horizontal_add();
193 const uint64_t sum2_hi = d2_hi.horizontal_add();
// Carry propagation: the *_hi sums hold product bits >= 52, so `<< 8`
// realigns them to the 44-bit limb boundary (52 - 44 = 8).
196 sum1_lo += (sum0_lo >> 44) + (sum0_hi << 8);
198 sum2_lo += (sum1_lo >> 44) + (sum1_hi << 8);
// Bits above the 42-bit top limb represent multiples of 2^130; fold them
// back in multiplied by 5 since 2^130 = 5 (mod 2^130-5). The hi part is
// shifted by 10 = 52 - 42 to align with the top-limb boundary.
// NOTE(review): the lines consuming `carry` and updating h0/h1/h2 and
// blocks/m were dropped by extraction -- the loop body is incomplete here.
202 uint64_t
carry = ((sum2_lo >> 42) + (sum2_hi << 10)) * 5;
// Number of blocks consumed by the vectorized path
219 return (original_blocks - blocks);
ASN1_Type operator|(ASN1_Type x, ASN1_Type y)
void carry(int64_t &h0, int64_t &h1)
std::vector< T, secure_allocator< T > > secure_vector
ECIES_Flags operator&(ECIES_Flags a, ECIES_Flags b)