Botan 3.11.0
Crypto and TLS for C&
poly1305.cpp
Go to the documentation of this file.
1/*
2* Derived from poly1305-donna-64.h by Andrew Moon <liquidsun@gmail.com>
3* in https://github.com/floodyberry/poly1305-donna
4*
5* (C) 2014 Andrew Moon
6* (C) 2014,2025,2026 Jack Lloyd
7*
8* Botan is released under the Simplified BSD License (see license.txt)
9*/
10
11#include <botan/internal/poly1305.h>
12
13#include <botan/internal/buffer_slicer.h>
14#include <botan/internal/ct_utils.h>
15#include <botan/internal/donna128.h>
16#include <botan/internal/loadstor.h>
17
18#if defined(BOTAN_HAS_POLY1305_AVX2) || defined(BOTAN_HAS_POLY1305_AVX512)
19 #include <botan/internal/cpuid.h>
20#endif
21
22namespace Botan {
23
24namespace {
25
26// State layout: pad || accum || r || r^2 || r^3 || ... || r^n
27// This ordering allows extending with more powers of r at the end
28constexpr size_t PAD_BASE = 0; // pad[0..1]
29constexpr size_t H_BASE = 2; // h[0..2] (accumulator)
30constexpr size_t R_BASE = 5; // r^1[0..2], r^2[3..5], r^3[6..8], etc.
31
32// Multiply two values in radix 2^44 representation mod (2^130 - 5)
33// h = a * b mod p
34BOTAN_FORCE_INLINE void poly1305_mul_44(uint64_t& h0,
35 uint64_t& h1,
36 uint64_t& h2,
37 uint64_t a0,
38 uint64_t a1,
39 uint64_t a2,
40 uint64_t b0,
41 uint64_t b1,
42 uint64_t b2) {
43 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
44 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
45
46#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
47 typedef donna128 uint128_t;
48#endif
49
50 const uint64_t s1 = b1 * 20;
51 const uint64_t s2 = b2 * 20;
52
53 const uint128_t d0 = uint128_t(a0) * b0 + uint128_t(a1) * s2 + uint128_t(a2) * s1;
54 const uint64_t c0 = carry_shift(d0, 44);
55
56 const uint128_t d1 = uint128_t(a0) * b1 + uint128_t(a1) * b0 + uint128_t(a2) * s2 + c0;
57 const uint64_t c1 = carry_shift(d1, 44);
58
59 const uint128_t d2 = uint128_t(a0) * b2 + uint128_t(a1) * b1 + uint128_t(a2) * b0 + c1;
60 const uint64_t c2 = carry_shift(d2, 42);
61
62 h0 = (d0 & M44) + c2 * 5;
63 h1 = (d1 & M44) + (h0 >> 44);
64 h0 &= M44;
65 h2 = d2 & M42;
66}
67
68// Extend powers of r from current max to target
69void poly1305_extend_powers(secure_vector<uint64_t>& X, size_t target_powers) {
70 const size_t current_powers = (X.size() - 5) / 3;
71
72 if(current_powers >= target_powers) {
73 return;
74 }
75
76 // Load r^1 for multiplication
77 const uint64_t r0 = X[R_BASE + 0];
78 const uint64_t r1 = X[R_BASE + 1];
79 const uint64_t r2 = X[R_BASE + 2];
80
81 X.resize(5 + target_powers * 3);
82
83 // Compute r^(current+1) through r^target
84 for(size_t i = current_powers + 1; i <= target_powers; ++i) {
85 const size_t offset = R_BASE + (i - 1) * 3;
86 poly1305_mul_44(
87 X[offset + 0], X[offset + 1], X[offset + 2], X[offset - 3], X[offset - 2], X[offset - 1], r0, r1, r2);
88 }
89}
90
91// Initialize Poly1305 state and precompute powers of r
92void poly1305_init(secure_vector<uint64_t>& X, const uint8_t key[32]) {
93 X.clear();
94 X.reserve(2 + 3 + 2 * 3);
95 X.resize(2 + 3 + 3);
96
97 /* Save pad for later (first 2 slots) */
98 X[PAD_BASE + 0] = load_le<uint64_t>(key, 2);
99 X[PAD_BASE + 1] = load_le<uint64_t>(key, 3);
100
101 /* h = 0 (accumulator, next 3 slots) */
102 X[H_BASE + 0] = 0;
103 X[H_BASE + 1] = 0;
104 X[H_BASE + 2] = 0;
105
106 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff (clamping) */
107 const uint64_t t0 = load_le<uint64_t>(key, 0);
108 const uint64_t t1 = load_le<uint64_t>(key, 1);
109
110 const uint64_t r0 = (t0) & 0xffc0fffffff;
111 const uint64_t r1 = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
112 const uint64_t r2 = ((t1 >> 24)) & 0x00ffffffc0f;
113
114 // Store r^1
115 X[R_BASE + 0] = r0;
116 X[R_BASE + 1] = r1;
117 X[R_BASE + 2] = r2;
118
119 poly1305_extend_powers(X, 2);
120}
121
122// Process a single block: h = (h + m) * r mod p
123BOTAN_FORCE_INLINE void poly1305_block_single(uint64_t& h0,
124 uint64_t& h1,
125 uint64_t& h2,
126 uint64_t r0,
127 uint64_t r1,
128 uint64_t r2,
129 uint64_t s1,
130 uint64_t s2,
131 const uint8_t* m,
132 uint64_t hibit) {
133 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
134 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
135
136#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
137 typedef donna128 uint128_t;
138#endif
139
140 const uint64_t t0 = load_le<uint64_t>(m, 0);
141 const uint64_t t1 = load_le<uint64_t>(m, 1);
142
143 h0 += (t0 & M44);
144 h1 += ((t0 >> 44) | (t1 << 20)) & M44;
145 h2 += ((t1 >> 24) & M42) | hibit;
146
147 const uint128_t d0 = uint128_t(h0) * r0 + uint128_t(h1) * s2 + uint128_t(h2) * s1;
148 const uint64_t c0 = carry_shift(d0, 44);
149
150 const uint128_t d1 = uint128_t(h0) * r1 + uint128_t(h1) * r0 + uint128_t(h2) * s2 + c0;
151 const uint64_t c1 = carry_shift(d1, 44);
152
153 const uint128_t d2 = uint128_t(h0) * r2 + uint128_t(h1) * r1 + uint128_t(h2) * r0 + c1;
154 const uint64_t c2 = carry_shift(d2, 42);
155
156 h0 = (d0 & M44) + c2 * 5;
157 h1 = (d1 & M44) + (h0 >> 44);
158 h0 &= M44;
159 h2 = d2 & M42;
160}
161
162// Process two blocks in parallel: h = ((h + m0) * r + m1) * r = (h + m0) * r^2 + m1 * r
163// The multiplications by r^2 and r are independent, enabling ILP
164BOTAN_FORCE_INLINE void poly1305_block_pair(uint64_t& h0,
165 uint64_t& h1,
166 uint64_t& h2,
167 uint64_t r0,
168 uint64_t r1,
169 uint64_t r2,
170 uint64_t s1,
171 uint64_t s2,
172 uint64_t rr0,
173 uint64_t rr1,
174 uint64_t rr2,
175 uint64_t ss1,
176 uint64_t ss2,
177 const uint8_t* m,
178 uint64_t hibit) {
179 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
180 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
181
182#if !defined(BOTAN_TARGET_HAS_NATIVE_UINT128)
183 typedef donna128 uint128_t;
184#endif
185
186 // Load first block (will be multiplied by r^2)
187 const uint64_t m0_t0 = load_le<uint64_t>(m, 0);
188 const uint64_t m0_t1 = load_le<uint64_t>(m, 1);
189
190 // Load second block (will be multiplied by r)
191 const uint64_t m1_t0 = load_le<uint64_t>(m + 16, 0);
192 const uint64_t m1_t1 = load_le<uint64_t>(m + 16, 1);
193
194 // Add first block to h
195 h0 += (m0_t0 & M44);
196 h1 += ((m0_t0 >> 44) | (m0_t1 << 20)) & M44;
197 h2 += ((m0_t1 >> 24) & M42) | hibit;
198
199 // Convert second block to limbs
200 const uint64_t b0 = (m1_t0 & M44);
201 const uint64_t b1 = ((m1_t0 >> 44) | (m1_t1 << 20)) & M44;
202 const uint64_t b2 = ((m1_t1 >> 24) & M42) | hibit;
203
204 // Compute (h + m0) * r^2 + m1 * r
205 const uint128_t d0 = uint128_t(h0) * rr0 + uint128_t(h1) * ss2 + uint128_t(h2) * ss1 + uint128_t(b0) * r0 +
206 uint128_t(b1) * s2 + uint128_t(b2) * s1;
207 const uint64_t c0 = carry_shift(d0, 44);
208
209 const uint128_t d1 = uint128_t(h0) * rr1 + uint128_t(h1) * rr0 + uint128_t(h2) * ss2 + uint128_t(b0) * r1 +
210 uint128_t(b1) * r0 + uint128_t(b2) * s2 + c0;
211 const uint64_t c1 = carry_shift(d1, 44);
212
213 const uint128_t d2 = uint128_t(h0) * rr2 + uint128_t(h1) * rr1 + uint128_t(h2) * rr0 + uint128_t(b0) * r2 +
214 uint128_t(b1) * r1 + uint128_t(b2) * r0 + c1;
215 const uint64_t c2 = carry_shift(d2, 42);
216
217 h0 = (d0 & M44) + c2 * 5;
218 h1 = (d1 & M44) + (h0 >> 44);
219 h0 &= M44;
220 h2 = d2 & M42;
221}
222
223void poly1305_blocks(secure_vector<uint64_t>& X, const uint8_t* m, size_t blocks, bool is_final = false) {
224 const uint64_t hibit = is_final ? 0 : (static_cast<uint64_t>(1) << 40);
225
226 // Load r (at R_BASE + 0)
227 const uint64_t r0 = X[R_BASE + 0];
228 const uint64_t r1 = X[R_BASE + 1];
229 const uint64_t r2 = X[R_BASE + 2];
230 const uint64_t s1 = r1 * 20;
231 const uint64_t s2 = r2 * 20;
232
233 // Load r^2 (at R_BASE + 3)
234 const uint64_t rr0 = X[R_BASE + 3];
235 const uint64_t rr1 = X[R_BASE + 4];
236 const uint64_t rr2 = X[R_BASE + 5];
237
238 // Precompute
239 const uint64_t ss1 = rr1 * 20;
240 const uint64_t ss2 = rr2 * 20;
241
242 // Load accumulator
243 uint64_t h0 = X[H_BASE + 0];
244 uint64_t h1 = X[H_BASE + 1];
245 uint64_t h2 = X[H_BASE + 2];
246
247 while(blocks >= 2) {
248 poly1305_block_pair(h0, h1, h2, r0, r1, r2, s1, s2, rr0, rr1, rr2, ss1, ss2, m, hibit);
249 m += 32;
250 blocks -= 2;
251 }
252
253 // Final block?
254 if(blocks > 0) {
255 poly1305_block_single(h0, h1, h2, r0, r1, r2, s1, s2, m, hibit);
256 }
257
258 // Store accumulator
259 X[H_BASE + 0] = h0;
260 X[H_BASE + 1] = h1;
261 X[H_BASE + 2] = h2;
262}
263
264void poly1305_finish(secure_vector<uint64_t>& X, uint8_t mac[16]) {
265 constexpr uint64_t M44 = 0xFFFFFFFFFFF;
266 constexpr uint64_t M42 = 0x3FFFFFFFFFF;
267
268 /* fully carry h */
269 uint64_t h0 = X[H_BASE + 0];
270 uint64_t h1 = X[H_BASE + 1];
271 uint64_t h2 = X[H_BASE + 2];
272
273 uint64_t c = (h1 >> 44);
274 h1 &= M44;
275 h2 += c;
276 c = (h2 >> 42);
277 h2 &= M42;
278 h0 += c * 5;
279 c = (h0 >> 44);
280 h0 &= M44;
281 h1 += c;
282 c = (h1 >> 44);
283 h1 &= M44;
284 h2 += c;
285 c = (h2 >> 42);
286 h2 &= M42;
287 h0 += c * 5;
288 c = (h0 >> 44);
289 h0 &= M44;
290 h1 += c;
291
292 /* compute h + -p */
293 uint64_t g0 = h0 + 5;
294 c = (g0 >> 44);
295 g0 &= M44;
296 uint64_t g1 = h1 + c;
297 c = (g1 >> 44);
298 g1 &= M44;
299 const uint64_t g2 = h2 + c - (static_cast<uint64_t>(1) << 42);
300
301 /* select h if h < p, or h + -p if h >= p */
302 const auto c_mask = CT::Mask<uint64_t>::expand(c);
303 h0 = c_mask.select(g0, h0);
304 h1 = c_mask.select(g1, h1);
305 h2 = c_mask.select(g2, h2);
306
307 /* h = (h + pad) */
308 const uint64_t t0 = X[PAD_BASE + 0];
309 const uint64_t t1 = X[PAD_BASE + 1];
310
311 h0 += ((t0)&M44);
312 c = (h0 >> 44);
313 h0 &= M44;
314 h1 += (((t0 >> 44) | (t1 << 20)) & M44) + c;
315 c = (h1 >> 44);
316 h1 &= M44;
317 h2 += (((t1 >> 24)) & M42) + c;
318 h2 &= M42;
319
320 /* mac = h % (2^128) */
321 h0 = ((h0) | (h1 << 44));
322 h1 = ((h1 >> 20) | (h2 << 24));
323
324 store_le(mac, h0, h1);
325
326 /* zero out the state */
327 clear_mem(X.data(), X.size());
328}
329
330} // namespace
331
333 zap(m_poly);
334 m_buffer.clear();
335}
336
338 // Minimum size: pad(2) + accum(3) + r(3) + r^2(3) = 11
339 return m_poly.size() >= 11;
340}
341
342void Poly1305::key_schedule(std::span<const uint8_t> key) {
343 m_buffer.clear();
344
345 poly1305_init(m_poly, key.data());
346}
347
348std::string Poly1305::provider() const {
349#if defined(BOTAN_HAS_POLY1305_AVX512)
350 if(auto feat = CPUID::check(CPUID::Feature::AVX512)) {
351 return *feat;
352 }
353#endif
354
355#if defined(BOTAN_HAS_POLY1305_AVX2)
356 if(auto feat = CPUID::check(CPUID::Feature::AVX2)) {
357 return *feat;
358 }
359#endif
360
361 return "base";
362}
363
364void Poly1305::add_data(std::span<const uint8_t> input) {
366
367 BufferSlicer in(input);
368
369 while(!in.empty()) {
370 if(const auto one_block = m_buffer.handle_unaligned_data(in)) {
371 poly1305_blocks(m_poly, one_block->data(), 1);
372 }
373
374 if(m_buffer.in_alignment()) {
375 const auto [aligned_data, full_blocks] = m_buffer.aligned_data_to_process(in);
376 if(full_blocks > 0) {
377 const uint8_t* data_ptr = aligned_data.data();
378 size_t blocks_remaining = full_blocks;
379
380#if defined(BOTAN_HAS_POLY1305_AVX512)
381 if(blocks_remaining >= 8 * 3 && CPUID::has(CPUID::Feature::AVX512)) {
382 // Lazily compute r^3 through r^8 on first AVX512 use
383 poly1305_extend_powers(m_poly, 8);
384 const size_t processed = poly1305_avx512_blocks(m_poly, data_ptr, blocks_remaining);
385 data_ptr += processed * 16;
386 blocks_remaining -= processed;
387 }
388#endif
389
390#if defined(BOTAN_HAS_POLY1305_AVX2)
391 if(blocks_remaining >= 4 * 6 && CPUID::has(CPUID::Feature::AVX2)) {
392 // Lazily compute r^3 and r^4 on first AVX2 use
393 poly1305_extend_powers(m_poly, 4);
394 const size_t processed = poly1305_avx2_blocks(m_poly, data_ptr, blocks_remaining);
395 data_ptr += processed * 16;
396 blocks_remaining -= processed;
397 }
398#endif
399
400 if(blocks_remaining > 0) {
401 poly1305_blocks(m_poly, data_ptr, blocks_remaining);
402 }
403 }
404 }
405 }
406}
407
408void Poly1305::final_result(std::span<uint8_t> out) {
410
411 if(!m_buffer.in_alignment()) {
412 const uint8_t final_byte = 0x01;
413 m_buffer.append({&final_byte, 1});
414 m_buffer.fill_up_with_zeros();
415 poly1305_blocks(m_poly, m_buffer.consume().data(), 1, true);
416 }
417
418 poly1305_finish(m_poly, out.data());
419
420 m_poly.clear();
421 m_buffer.clear();
422}
423
424} // namespace Botan
std::tuple< std::span< const uint8_t >, size_t > aligned_data_to_process(BufferSlicer &slicer) const
std::optional< std::span< const T > > handle_unaligned_data(BufferSlicer &slicer)
static std::optional< std::string > check(CPUID::Feature feat)
Definition cpuid.h:67
static bool has(CPUID::Feature feat)
Definition cpuid.h:94
static constexpr Mask< T > expand(T v)
Definition ct_utils.h:392
void clear() override
Definition poly1305.cpp:332
std::string provider() const override
Definition poly1305.cpp:348
bool has_keying_material() const override
Definition poly1305.cpp:337
void assert_key_material_set() const
Definition sym_algo.h:145
#define BOTAN_FORCE_INLINE
Definition compiler.h:87
constexpr uint64_t carry_shift(const donna128 &a, size_t shift)
Definition donna128.h:129
void zap(std::vector< T, Alloc > &vec)
Definition secmem.h:133
constexpr auto store_le(ParamTs &&... params)
Definition loadstor.h:736
constexpr auto load_le(ParamTs &&... params)
Definition loadstor.h:495
std::vector< T, secure_allocator< T > > secure_vector
Definition secmem.h:68
constexpr void clear_mem(T *ptr, size_t n)
Definition mem_ops.h:118