Botan 3.11.0
Crypto and TLS for C&
pcurves_mul.h
Go to the documentation of this file.
1/*
2* (C) 2024,2025,2026 Jack Lloyd
3*
4* Botan is released under the Simplified BSD License (see license.txt)
5*/
6
7#ifndef BOTAN_PCURVES_MUL_H_
8#define BOTAN_PCURVES_MUL_H_
9
10#include <botan/types.h>
11#include <botan/internal/ct_utils.h>
12#include <botan/internal/pcurves_algos.h>
13#include <vector>
14
15namespace Botan {
16
18
19/*
20* Multiplication algorithm window size parameters
21*/
22
23static constexpr size_t BasePointWindowBits = 6;
24static constexpr size_t VarPointWindowBits = 4;
25static constexpr size_t Mul2PrecompWindowBits = 3;
26static constexpr size_t Mul2WindowBits = 2;
27
28/**
29* Return number of blinding bits to use
30*
31* This can return any value between 0 and the scalar bit length.
32*
33* The field arithmetic and scalar multiplication algorithms are anyway written and tested
34* to be constant time; blinding is just used as a safety net in the case that the compiler
35* rewrites constant time code to include variable time behavior. If utmost performance is
36* of concern and you are in a position to test that your specific compiler for your
37* specific architecture is not inserting variable time behavior where not expected (for
38* example by using the existing valgrind-based CT checking) it is safe to modify this
39* function to just return 0, or some very small blinding factor of 1-4 bits.
40*/
41constexpr size_t scalar_blinding_bits(size_t scalar_bits) {
42 // For blinding use 1/8 the order length for most curves; for P-521 we round down a bit
43 // so the masked scalar fits exactly in 9 or 18 words.
44
45 if(scalar_bits == 521) {
46 return 55;
47 } else {
48 return scalar_bits / 8;
49 }
50}
51
52/**
53* A precomputed table of affine points with constant time lookup
54*
55* If R is zero then the entire table is scanned for each lookup.
56*
57* If R is not zero, then the table must be a multiple of R points long.
58* Each lookup will be examine a range of length R, as in
59* pts[0..R], pts[R..2*R], ...
60*/
61template <typename C, size_t R = 0>
62class AffinePointTable final {
63 public:
64 using AffinePoint = typename C::AffinePoint;
65 using ProjectivePoint = typename C::ProjectivePoint;
66 using WordType = typename C::WordType;
67
68 static constexpr bool WholeRangeSearch = (R == 0);
69
70 explicit AffinePointTable(std::span<const ProjectivePoint> pts) {
71 BOTAN_ASSERT_NOMSG(pts.size() > 1);
72
73 if constexpr(R > 0) {
74 BOTAN_ASSERT_NOMSG(pts.size() % R == 0);
75 }
76
77 // TODO scatter/gather with SIMD lookup
78 m_table = to_affine_batch<C>(pts);
79 }
80
81 /**
82 * If idx is zero then return the identity element. Otherwise return pts[idx - 1]
83 */
84 inline AffinePoint ct_select(size_t idx) const
85 requires(WholeRangeSearch)
86 {
87 BOTAN_DEBUG_ASSERT(idx < m_table.size() + 1);
88
89 auto result = AffinePoint::identity(m_table[0]);
90
91 // Intentionally wrapping; set to maximum size_t if idx == 0
92 const size_t idx1 = static_cast<size_t>(idx - 1);
93 for(size_t i = 0; i != m_table.size(); ++i) {
94 const auto found = CT::Mask<size_t>::is_equal(idx1, i).as_choice();
95 result.conditional_assign(found, m_table[i]);
96 }
97
98 return result;
99 }
100
101 /**
102 * If idx is zero then return the identity element. Otherwise return pts[idx - 1]
103 * out of the table subrange pts[iter*R..(iter+1)*R]
104 */
105 inline AffinePoint ct_select(size_t idx, size_t iter) const
106 requires(!WholeRangeSearch)
107 {
108 BOTAN_DEBUG_ASSERT(idx < R + 1);
109 BOTAN_DEBUG_ASSERT(R * (iter + 1) <= m_table.size());
110
111 auto result = AffinePoint::identity(m_table[R * iter]);
112
113 // Intentionally wrapping; set to maximum size_t if idx == 0
114 const size_t idx1 = static_cast<size_t>(idx - 1);
115 for(size_t i = 0; i != R; ++i) {
116 const auto found = CT::Mask<size_t>::is_equal(idx1, i).as_choice();
117 result.conditional_assign(found, m_table[R * iter + i]);
118 }
119
120 return result;
121 }
122
123 private:
124 std::vector<AffinePoint> m_table;
125};
126
127/*
128* Base point precomputation table
129*
130* This algorithm works by precomputing a set of points such that
131* the online phase of the point multiplication can be effected by
132* a sequence of point additions.
133*
134* The tables, even for W = 1, are large and costly to precompute, so
135* this is only used for the base point.
136*
137* The online phase of the algorithm uess `ceil(SB/W)` additions,
138* and no point doublings. The table is of size
139* `ceil(SB + W - 1)/W * ((1 << W) - 1)`
140* where SB is the bit length of the (blinded) scalar.
141*
142* Each window of the scalar is associated with a window in the table.
143* The table windows are unique to that offset within the scalar.
144*
145* The simplest version to understand is when W = 1. There the table
146* consists of [P, 2*P, 4*P, ..., 2^N*P] where N is the bit length of
147* the group order. The online phase consists of conditionally adding
148* table[i] depending on if bit i of the scalar is set or not.
149*
150* When W = 2, the scalar is examined 2 bits at a time, and the table
151* for a window index `I` is [(2^I)*P, (2^(I+1))*P, (2^I+2^(I+1))*P].
152*
153* This extends similarly for larger W
154*
155* At a certain point, the side channel silent table lookup becomes the
156* dominating cost
157*
158* For all W, each window in the table has an implicit element of
159* the identity element which is used if the scalar bits were all zero.
160* This is omitted to save space; AffinePoint::ct_select is designed
161* to assist in this by returning the identity element if its index
162* argument is zero, or otherwise it returns table[idx - 1]
163*/
164template <typename C, size_t WindowBits>
165std::vector<typename C::AffinePoint> basemul_setup(const typename C::AffinePoint& p, size_t max_scalar_bits) {
166 static_assert(WindowBits >= 1 && WindowBits <= 8);
167
168 // 2^W elements, less the identity element
169 constexpr size_t WindowElements = (1 << WindowBits) - 1;
170
171 const size_t Windows = (max_scalar_bits + WindowBits - 1) / WindowBits;
172
173 const size_t TableSize = Windows * WindowElements;
174
175 std::vector<typename C::ProjectivePoint> table;
176 table.reserve(TableSize);
177
178 auto accum = C::ProjectivePoint::from_affine(p);
179
180 for(size_t i = 0; i != TableSize; i += WindowElements) {
181 table.push_back(accum);
182
183 for(size_t j = 1; j != WindowElements; ++j) {
184 // Conditional ok: loop iteration count is public
185 if(j % 2 == 1) {
186 table.emplace_back(table[i + j / 2].dbl());
187 } else {
188 table.emplace_back(table[i + j - 1] + table[i]);
189 }
190 }
191
192 accum = table[i + (WindowElements / 2)].dbl();
193 }
194
195 // Variable time batch conversion is fine since generator is public
196 return to_affine_batch<C, true>(table);
197}
198
199template <typename C, size_t WindowBits, typename BlindedScalar>
200typename C::ProjectivePoint basemul_exec(std::span<const typename C::AffinePoint> table,
201 const BlindedScalar& scalar,
203 // 2^W elements, less the identity element
204 static constexpr size_t WindowElements = (1 << WindowBits) - 1;
205
206 // TODO: C++23 - use std::mdspan to access table?
207
208 auto accum = [&]() {
209 const size_t w_0 = scalar.get_window(0);
210 const auto tbl_0 = table.first(WindowElements);
211 auto pt = C::ProjectivePoint::from_affine(C::AffinePoint::ct_select(tbl_0, w_0));
212 CT::poison(pt);
213 pt.randomize_rep(rng);
214 return pt;
215 }();
216
217 const size_t windows = (scalar.bits() + WindowBits - 1) / WindowBits;
218
219 for(size_t i = 1; i != windows; ++i) {
220 const size_t w_i = scalar.get_window(WindowBits * i);
221 const auto tbl_i = table.subspan(WindowElements * i, WindowElements);
222
223 /*
224 None of these additions can be doublings, because in each iteration, the
225 discrete logarithms of the points we're selecting out of the table are
226 larger than the largest possible dlog of accum.
227 */
228 accum += C::AffinePoint::ct_select(tbl_i, w_i);
229
230 // Conditional ok: loop iteration count is public
231 if(i <= 3) {
232 accum.randomize_rep(rng);
233 }
234 }
235
236 CT::unpoison(accum);
237 return accum;
238}
239
240/*
241* Base point precomputation table with Booth recoding
242*
243* Same structure as basemul, but uses Booth recoding to halve the
244* table size per window. Instead of storing 2^W - 1 entries per
245* window, we store 2^(W-1) entries (multiples 1..2^(W-1) of the
246* window base point). The sign is handled by conditional negation
247* after the constant-time table lookup.
248*
249* The scalar is prepared with one extra blinding bit (WindowBits+1),
250* and windows overlap by one bit to allow carry propagation from
251* the Booth encoding.
252*/
253template <typename C, size_t WindowBits>
254std::vector<typename C::AffinePoint> basemul_booth_setup(const typename C::AffinePoint& p, size_t max_scalar_bits) {
255 static_assert(WindowBits >= 1 && WindowBits <= 8);
256
257 // 2^(W-1) elements per window [1*base .. 2^(W-1)*base]
258 constexpr size_t WindowElements = 1 << (WindowBits - 1);
259
260 const size_t Windows = (max_scalar_bits + WindowBits - 1) / WindowBits;
261
262 const size_t TableSize = Windows * WindowElements;
263
264 std::vector<typename C::ProjectivePoint> table;
265 table.reserve(TableSize);
266
267 auto accum = C::ProjectivePoint::from_affine(p);
268
269 for(size_t i = 0; i != TableSize; i += WindowElements) {
270 table.push_back(accum);
271
272 for(size_t j = 1; j != WindowElements; ++j) {
273 // Conditional ok: loop iteration count is public
274 if(j % 2 == 1) {
275 table.emplace_back(table[i + j / 2].dbl());
276 } else {
277 table.emplace_back(table[i + j - 1] + table[i]);
278 }
279 }
280
281 // Advance to next window's base: 2^W * current_base
282 // The last entry is 2^(W-1) * base, so doubling gives 2^W * base
283 accum = table[i + WindowElements - 1].dbl();
284 }
285
286 // Variable time batch conversion is fine since generator is public
287 return to_affine_batch<C, true>(table);
288}
289
290/*
291* Booth recoding for base point multiplication
292*/
293template <size_t WindowBits, std::unsigned_integral T>
294constexpr std::pair<size_t, CT::Choice> booth_recode(T x) {
295 static_assert(WindowBits >= 1 && WindowBits <= 8);
296
297 auto s_mask = CT::Mask<T>::expand(x >> WindowBits);
298 const T neg_x = (1 << (WindowBits + 1)) - x - 1;
299 T d = s_mask.select(neg_x, x);
300 d = (d >> 1) + (d & 1);
301
302 return std::make_pair(static_cast<size_t>(d), s_mask.as_choice());
303}
304
305template <typename C, size_t WindowBits, typename BlindedScalar>
306typename C::ProjectivePoint basemul_booth_exec(std::span<const typename C::AffinePoint> table,
307 const BlindedScalar& scalar,
309 static constexpr size_t WindowElements = 1 << (WindowBits - 1);
310
311 const size_t windows = (scalar.bits() + WindowBits) / WindowBits;
312
313 auto accum = [&]() {
314 // First window: extract W bits, shift left 1 to insert implicit carry in of zero
315 const size_t w_bits = scalar.get_window(0) & ((1 << WindowBits) - 1);
316 const size_t raw = w_bits << 1;
317 const auto [tidx, tneg] = booth_recode<WindowBits>(raw);
318 const auto tbl_0 = table.first(WindowElements);
319
320 auto pt = C::ProjectivePoint::from_affine(C::AffinePoint::ct_select(tbl_0, tidx));
321 pt.conditional_assign(tneg, pt.negate());
322 CT::poison(pt);
323 pt.randomize_rep(rng);
324 return pt;
325 }();
326
327 for(size_t i = 1; i != windows; ++i) {
328 // Extract W+1 bits overlapping by 1 with the previous window
329 const size_t bit_pos = WindowBits * i - 1;
330 const size_t raw = scalar.get_window(bit_pos);
331 const auto [tidx, tneg] = booth_recode<WindowBits>(raw);
332
333 const auto tbl_i = table.subspan(WindowElements * i, WindowElements);
334
335 accum = C::ProjectivePoint::add_or_sub(accum, C::AffinePoint::ct_select(tbl_i, tidx), tneg);
336
337 // Conditional ok: loop iteration count is public
338 if(i <= 3) {
339 accum.randomize_rep(rng);
340 }
341 }
342
343 CT::unpoison(accum);
344 return accum;
345}
346
347/*
348* Variable point table mul setup and online phase
349*/
350template <typename C, size_t TableSize>
351AffinePointTable<C> varpoint_setup(const typename C::AffinePoint& p) {
352 static_assert(TableSize > 2);
353
354 std::vector<typename C::ProjectivePoint> table;
355 table.reserve(TableSize);
356 table.push_back(C::ProjectivePoint::from_affine(p));
357
358 for(size_t i = 1; i != TableSize; ++i) {
359 // Conditional ok: loop iteration count is public
360 if(i % 2 == 1) {
361 table.push_back(table[i / 2].dbl());
362 } else {
363 table.push_back(table[i - 1] + p);
364 }
365 }
366
367 return AffinePointTable<C>(table);
368}
369
370template <typename C, size_t WindowBits, typename BlindedScalar>
371typename C::ProjectivePoint varpoint_exec(const AffinePointTable<C>& table,
372 const BlindedScalar& scalar,
374 const size_t windows = (scalar.bits() + WindowBits - 1) / WindowBits;
375
376 auto accum = [&]() {
377 const size_t w_0 = scalar.get_window((windows - 1) * WindowBits);
378 auto pt = C::ProjectivePoint::from_affine(table.ct_select(w_0));
379 CT::poison(pt);
380 pt.randomize_rep(rng);
381 return pt;
382 }();
383
384 for(size_t i = 1; i != windows; ++i) {
385 accum = accum.dbl_n(WindowBits);
386 auto w_i = scalar.get_window((windows - i - 1) * WindowBits);
387
388 /*
389 This point addition cannot be a doubling (except once)
390
391 Consider the sequence of points that are operated on, and specifically
392 their discrete logarithms. We start out at the point at infinity
393 (dlog 0) and then add the initial window which is precisely P*w_0
394
395 We then perform WindowBits doublings, so accum's dlog at the point
396 of the addition in the first iteration of the loop (when i == 1) is
397 at least 2^W * w_0.
398
399 Since we know w_0 > 0, then in every iteration of the loop, accums
400 dlog will always be greater than the dlog of the table element we
401 just looked up (something between 0 and 2^W-1), and thus the
402 addition into accum cannot be a doubling.
403
404 However due to blinding this argument fails, since we perform
405 multiplications using a scalar that is larger than the group
406 order. In this case it's possible that the dlog of accum becomes
407 `order + x` (or, effectively, `x`) and `x` is smaller than 2^W.
408 In this case, a doubling may occur. Future iterations of the loop
409 cannot be doublings by the same argument above. Since the blinding
410 factor is always less than the group order (substantially so),
411 it is not possible for the dlog of accum to overflow a second time.
412 */
413
414 accum += table.ct_select(w_i);
415
416 // Conditional ok: loop iteration count is public
417 if(i <= 3) {
418 accum.randomize_rep(rng);
419 }
420 }
421
422 CT::unpoison(accum);
423 return accum;
424}
425
426/*
427* Effect 2-ary multiplication ie x*G + y*H
428*
429* This is done using a windowed variant of what is usually called
430* Shamir's trick.
431*
432* The W = 1 case is simple; we precompute an extra point GH = G + H,
433* and then examine 1 bit in each of x and y. If one or the other bits
434* are set then add G or H resp. If both bits are set, add GH.
435*
436* The example below is a precomputed table for W=2. The flattened table
437* begins at (x_i,y_i) = (1,0), i.e. the identity element is omitted.
438* The indices in each cell refer to the cell's location in m_table.
439*
440* x-> 0 1 2 3
441* 0 |/ (ident) |0 x |1 2x |2 3x |
442* 1 |3 y |4 x+y |5 2x+y |6 3x+y |
443* y = 2 |7 2y |8 x+2y |9 2(x+y) |10 3x+2y |
444* 3 |11 3y |12 x+3y |13 2x+3y |14 3x+3y |
445*/
446
447template <typename C, size_t WindowBits>
448std::vector<typename C::ProjectivePoint> mul2_setup(const typename C::AffinePoint& p,
449 const typename C::AffinePoint& q) {
450 static_assert(WindowBits >= 1 && WindowBits <= 4);
451
452 // 2^(2*W) elements, less the identity element
453 constexpr size_t TableSize = (1 << (2 * WindowBits)) - 1;
454 constexpr size_t WindowSize = (1 << WindowBits);
455
456 std::vector<typename C::ProjectivePoint> table;
457 table.reserve(TableSize);
458
459 for(size_t i = 0; i != TableSize; ++i) {
460 const size_t t_i = (i + 1);
461 const size_t p_i = t_i % WindowSize;
462 const size_t q_i = (t_i >> WindowBits) % WindowSize;
463
464 // Conditionals ok: all based on t_i/p_i/q_i which in turn are derived from public i
465
466 // Returns x_i * x + y_i * y
467 auto next_tbl_e = [&]() {
468 if(p_i % 2 == 0 && q_i % 2 == 0) {
469 // Where possible using doubling (eg indices 1, 7, 9 in
470 // the table above)
471 return table[(t_i / 2) - 1].dbl();
472 } else if(p_i > 0 && q_i > 0) {
473 // A combination of p and q
474 if(p_i == 1) {
475 return p + table[(q_i << WindowBits) - 1];
476 } else if(q_i == 1) {
477 return table[p_i - 1] + q;
478 } else {
479 return table[p_i - 1] + table[(q_i << WindowBits) - 1];
480 }
481 } else if(p_i > 0 && q_i == 0) {
482 // A multiple of p without a q component
483 if(p_i == 1) {
484 // Just p
485 return C::ProjectivePoint::from_affine(p);
486 } else {
487 // p * p_{i-1}
488 return p + table[p_i - 1 - 1];
489 }
490 } else if(p_i == 0 && q_i > 0) {
491 if(q_i == 1) {
492 // Just q
493 return C::ProjectivePoint::from_affine(q);
494 } else {
495 // q * q_{i-1}
496 return q + table[((q_i - 1) << WindowBits) - 1];
497 }
498 } else {
500 }
501 };
502
503 table.emplace_back(next_tbl_e());
504 }
505
506 return table;
507}
508
509template <typename C, size_t WindowBits, typename BlindedScalar>
510typename C::ProjectivePoint mul2_exec(const AffinePointTable<C>& table,
511 const BlindedScalar& x,
512 const BlindedScalar& y,
514 const size_t Windows = (x.bits() + WindowBits - 1) / WindowBits;
515
516 auto accum = [&]() {
517 const size_t w_1 = x.get_window((Windows - 1) * WindowBits);
518 const size_t w_2 = y.get_window((Windows - 1) * WindowBits);
519 const size_t window = w_1 + (w_2 << WindowBits);
520 auto pt = C::ProjectivePoint::from_affine(table.ct_select(window));
521 CT::poison(pt);
522 pt.randomize_rep(rng);
523 return pt;
524 }();
525
526 for(size_t i = 1; i != Windows; ++i) {
527 accum = accum.dbl_n(WindowBits);
528
529 const size_t w_1 = x.get_window((Windows - i - 1) * WindowBits);
530 const size_t w_2 = y.get_window((Windows - i - 1) * WindowBits);
531 const size_t window = w_1 + (w_2 << WindowBits);
532 accum += table.ct_select(window);
533
534 // Conditional ok: loop iteration count is public
535 if(i <= 3) {
536 accum.randomize_rep(rng);
537 }
538 }
539
540 CT::unpoison(accum);
541 return accum;
542}
543
544} // namespace Botan
545
546#endif
#define BOTAN_ASSERT_NOMSG(expr)
Definition assert.h:75
#define BOTAN_DEBUG_ASSERT(expr)
Definition assert.h:129
#define BOTAN_ASSERT_UNREACHABLE()
Definition assert.h:163
typename C::ProjectivePoint ProjectivePoint
Definition pcurves_mul.h:65
AffinePoint ct_select(size_t idx) const
Definition pcurves_mul.h:84
typename C::AffinePoint AffinePoint
Definition pcurves_mul.h:64
AffinePoint ct_select(size_t idx, size_t iter) const
static constexpr bool WholeRangeSearch
Definition pcurves_mul.h:68
AffinePointTable(std::span< const ProjectivePoint > pts)
Definition pcurves_mul.h:70
static constexpr Mask< T > expand(T v)
Definition ct_utils.h:392
static constexpr Mask< T > is_equal(T x, T y)
Definition ct_utils.h:442
constexpr void unpoison(const T *p, size_t n)
Definition ct_utils.h:67
constexpr void poison(const T *p, size_t n)
Definition ct_utils.h:56
C::ProjectivePoint varpoint_exec(const AffinePointTable< C > &table, const BlindedScalar &scalar, RandomNumberGenerator &rng)
constexpr size_t scalar_blinding_bits(size_t scalar_bits)
Definition pcurves_mul.h:41
C::ProjectivePoint mul2_exec(const AffinePointTable< C > &table, const BlindedScalar &x, const BlindedScalar &y, RandomNumberGenerator &rng)
auto to_affine_batch(std::span< const typename C::ProjectivePoint > projective)
C::ProjectivePoint basemul_exec(std::span< const typename C::AffinePoint > table, const BlindedScalar &scalar, RandomNumberGenerator &rng)
C::ProjectivePoint basemul_booth_exec(std::span< const typename C::AffinePoint > table, const BlindedScalar &scalar, RandomNumberGenerator &rng)
std::vector< typename C::ProjectivePoint > mul2_setup(const typename C::AffinePoint &p, const typename C::AffinePoint &q)
std::vector< typename C::AffinePoint > basemul_booth_setup(const typename C::AffinePoint &p, size_t max_scalar_bits)
constexpr std::pair< size_t, CT::Choice > booth_recode(T x)
AffinePointTable< C > varpoint_setup(const typename C::AffinePoint &p)
std::vector< typename C::AffinePoint > basemul_setup(const typename C::AffinePoint &p, size_t max_scalar_bits)