#include <botan/internal/poly1305.h>

#include <botan/internal/isa_extn.h>
#include <immintrin.h>
18constexpr uint32_t MASK26 = 0x3FFFFFF;
28class SIMD_4x26 final {
30 BOTAN_FN_ISA_AVX2 SIMD_4x26() : m_v(_mm256_setzero_si256()) {}
33 static BOTAN_FN_ISA_AVX2 SIMD_4x26 from_raw(__m256i v) {
return SIMD_4x26(v); }
36 static BOTAN_FN_ISA_AVX2 SIMD_4x26 set(uint32_t v3, uint32_t v2, uint32_t v1, uint32_t v0) {
37 return SIMD_4x26(_mm256_set_epi32(0, v3, 0, v2, 0, v1, 0, v0));
41 BOTAN_FN_ISA_AVX2 SIMD_4x26 mul_5()
const {
return SIMD_4x26(_mm256_add_epi32(_mm256_slli_epi32(m_v, 2), m_v)); }
43 friend SIMD_4x26 BOTAN_FN_ISA_AVX2
operator+(
const SIMD_4x26& x,
const SIMD_4x26& y) {
44 return SIMD_4x26(_mm256_add_epi64(x.raw(), y.raw()));
47 friend SIMD_4x26 BOTAN_FN_ISA_AVX2
operator*(
const SIMD_4x26& x,
const SIMD_4x26& y) {
48 return SIMD_4x26(_mm256_mul_epi32(x.raw(), y.raw()));
52 BOTAN_FN_ISA_AVX2 uint64_t horizontal_add64()
const {
54 _mm256_storeu_si256(
reinterpret_cast<__m256i*
>(tmp), m_v);
55 return tmp[0] + tmp[1] + tmp[2] + tmp[3];
58 __m256i BOTAN_FN_ISA_AVX2 raw()
const {
return m_v; }
61 explicit BOTAN_FN_ISA_AVX2 SIMD_4x26(__m256i v) : m_v(v) {}
74BOTAN_FN_ISA_AVX2
void load_4_blocks_26(SIMD_4x26& msg_0,
80 std::array<uint32_t, 5> h) {
82 const __m256i d0 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(m));
83 const __m256i d1 = _mm256_loadu_si256(
reinterpret_cast<const __m256i*
>(m + 32));
87 const __m256i t0_mixed = _mm256_unpacklo_epi64(d0, d1);
88 const __m256i t1_mixed = _mm256_unpackhi_epi64(d0, d1);
90 const __m256i t0 = _mm256_permute4x64_epi64(t0_mixed, 0b00100111);
91 const __m256i t1 = _mm256_permute4x64_epi64(t1_mixed, 0b00100111);
94 const __m256i mask26 = _mm256_set1_epi64x(MASK26);
95 const __m256i hibit_vec = _mm256_set1_epi64x(1 << 24);
103 __m256i limb0 = _mm256_and_si256(t0, mask26);
104 __m256i limb1 = _mm256_and_si256(_mm256_srli_epi64(t0, 26), mask26);
105 __m256i limb2 = _mm256_and_si256(_mm256_or_si256(_mm256_srli_epi64(t0, 52), _mm256_slli_epi64(t1, 12)), mask26);
106 __m256i limb3 = _mm256_and_si256(_mm256_srli_epi64(t1, 14), mask26);
107 __m256i limb4 = _mm256_or_si256(_mm256_srli_epi64(t1, 40), hibit_vec);
110 limb0 = _mm256_add_epi64(limb0, _mm256_set_epi64x(h[0], 0, 0, 0));
111 limb1 = _mm256_add_epi64(limb1, _mm256_set_epi64x(h[1], 0, 0, 0));
112 limb2 = _mm256_add_epi64(limb2, _mm256_set_epi64x(h[2], 0, 0, 0));
113 limb3 = _mm256_add_epi64(limb3, _mm256_set_epi64x(h[3], 0, 0, 0));
114 limb4 = _mm256_add_epi64(limb4, _mm256_set_epi64x(h[4], 0, 0, 0));
116 msg_0 = SIMD_4x26::from_raw(limb0);
117 msg_1 = SIMD_4x26::from_raw(limb1);
118 msg_2 = SIMD_4x26::from_raw(limb2);
119 msg_3 = SIMD_4x26::from_raw(limb3);
120 msg_4 = SIMD_4x26::from_raw(limb4);
129 const std::array<uint32_t, 5> in) {
130 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
131 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
134 const uint64_t i0 = in[0];
135 const uint64_t i1 = in[1];
136 const uint64_t i2 = in[2];
137 const uint64_t i3 = in[3];
138 const uint64_t i4 = in[4];
140 r0 = (i0 | (i1 << 26)) & M44;
141 r1 = ((i1 >> 18) | (i2 << 8) | (i3 << 34)) & M44;
142 r2 = ((i3 >> 10) | (i4 << 16)) & M42;
146BOTAN_FORCE_INLINE std::array<uint32_t, 5> convert_44_to_26(uint64_t r0, uint64_t r1, uint64_t r2) {
147 std::array<uint32_t, 5> out{};
148 out[0] =
static_cast<uint32_t
>(r0) & MASK26;
149 out[1] =
static_cast<uint32_t
>((r0 >> 26) | (r1 << 18)) & MASK26;
150 out[2] =
static_cast<uint32_t
>(r1 >> 8) & MASK26;
151 out[3] =
static_cast<uint32_t
>((r1 >> 34) | (r2 << 10)) & MASK26;
152 out[4] =
static_cast<uint32_t
>(r2 >> 16) & MASK26;
156inline void BOTAN_FN_ISA_AVX2
157load_r(SIMD_4x26& r0, SIMD_4x26& r1, SIMD_4x26& r2, SIMD_4x26& r3, SIMD_4x26& r4,
const secure_vector<uint64_t>& X) {
159 const auto t = convert_44_to_26(X[5], X[6], X[7]);
160 const auto t2 = convert_44_to_26(X[8], X[9], X[10]);
161 const auto t3 = convert_44_to_26(X[11], X[12], X[13]);
162 const auto t4 = convert_44_to_26(X[14], X[15], X[16]);
164 r0 = SIMD_4x26::set(t4[0], t3[0], t2[0], t[0]);
165 r1 = SIMD_4x26::set(t4[1], t3[1], t2[1], t[1]);
166 r2 = SIMD_4x26::set(t4[2], t3[2], t2[2], t[2]);
167 r3 = SIMD_4x26::set(t4[3], t3[3], t2[3], t[3]);
168 r4 = SIMD_4x26::set(t4[4], t3[4], t2[4], t[4]);
177size_t BOTAN_FN_ISA_AVX2 Poly1305::poly1305_avx2_blocks(
secure_vector<uint64_t>& X,
const uint8_t m[],
size_t blocks) {
182 const size_t incoming_blocks = blocks;
184 auto h = convert_44_to_26(X[2], X[3], X[4]);
191 load_r(r0, r1, r2, r3, r4, X);
193 const auto r1_5 = r1.mul_5();
194 const auto r2_5 = r2.mul_5();
195 const auto r3_5 = r3.mul_5();
196 const auto r4_5 = r4.mul_5();
205 load_4_blocks_26(m0, m1, m2, m3, m4, m, h);
207 const auto d0 = m0 * r0 + m1 * r4_5 + m2 * r3_5 + m3 * r2_5 + m4 * r1_5;
208 const auto d1 = m0 * r1 + m1 * r0 + m2 * r4_5 + m3 * r3_5 + m4 * r2_5;
209 const auto d2 = m0 * r2 + m1 * r1 + m2 * r0 + m3 * r4_5 + m4 * r3_5;
210 const auto d3 = m0 * r3 + m1 * r2 + m2 * r1 + m3 * r0 + m4 * r4_5;
211 const auto d4 = m0 * r4 + m1 * r3 + m2 * r2 + m3 * r1 + m4 * r0;
213 const uint64_t h0_64 = d0.horizontal_add64();
214 uint64_t h1_64 = d1.horizontal_add64();
215 uint64_t h2_64 = d2.horizontal_add64();
216 uint64_t h3_64 = d3.horizontal_add64();
217 uint64_t h4_64 = d4.horizontal_add64();
219 h1_64 += h0_64 >> 26;
220 h[0] =
static_cast<uint32_t
>(h0_64) & MASK26;
221 h2_64 += h1_64 >> 26;
222 h[1] =
static_cast<uint32_t
>(h1_64) & MASK26;
223 h3_64 += h2_64 >> 26;
224 h[2] =
static_cast<uint32_t
>(h2_64) & MASK26;
225 h4_64 += h3_64 >> 26;
226 h[3] =
static_cast<uint32_t
>(h3_64) & MASK26;
228 const uint64_t c = h4_64 >> 26;
229 h[4] =
static_cast<uint32_t
>(h4_64) & MASK26;
231 uint64_t
carry = c * 5;
233 h[0] =
static_cast<uint32_t
>(
carry) & MASK26;
236 h[1] =
static_cast<uint32_t
>(
carry) & MASK26;
239 h[2] =
static_cast<uint32_t
>(
carry) & MASK26;
242 h[3] =
static_cast<uint32_t
>(
carry) & MASK26;
244 h[4] +=
static_cast<uint32_t
>(
carry);
250 convert_26_to_44(X[2], X[3], X[4], h);
252 return (incoming_blocks - blocks);
#define BOTAN_FORCE_INLINE
BigInt operator*(const BigInt &x, const BigInt &y)
OctetString operator+(const OctetString &k1, const OctetString &k2)
void carry(int64_t &h0, int64_t &h1)
std::vector< T, secure_allocator< T > > secure_vector