#include <botan/argon2.h>

#include <tmmintrin.h>

namespace Botan {

namespace {
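// SIMD_2x64 wraps a single __m128i holding two 64-bit words, exposing just
// the operations the BlaMka permutation needs: little-endian load/store,
// 64-bit addition and XOR, word rotations, and lane shuffles.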
class SIMD_2x64 final {
   public:
      SIMD_2x64& operator=(const SIMD_2x64& other) = default;
      SIMD_2x64(const SIMD_2x64& other) = default;

      SIMD_2x64& operator=(SIMD_2x64&& other) = default;
      SIMD_2x64(SIMD_2x64&& other) = default;

      ~SIMD_2x64() = default;
      // Zero initialized
      SIMD_2x64() { m_simd = _mm_setzero_si128(); }
      static SIMD_2x64 load_le(const void* in) {
         return SIMD_2x64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
      }
      void store_le(uint64_t out[2]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }
      void store_le(uint8_t out[]) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_simd); }
      SIMD_2x64 operator+(const SIMD_2x64& other) const {
         SIMD_2x64 retval(*this);
         retval += other;
         return retval;
      }
      SIMD_2x64 operator^(const SIMD_2x64& other) const {
         SIMD_2x64 retval(*this);
         retval ^= other;
         return retval;
      }
      void operator+=(const SIMD_2x64& other) { m_simd = _mm_add_epi64(m_simd, other.m_simd); }
      void operator^=(const SIMD_2x64& other) { m_simd = _mm_xor_si128(m_simd, other.m_simd); }
      template <size_t ROT>
      BOTAN_FUNC_ISA("ssse3")
      SIMD_2x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
         // Rotations by 16, 24, and 32 are done as a byte shuffle (pshufb);
         // each table entry names the input byte that lands in that output
         // position, per 64-bit lane.
         if constexpr(ROT == 16) {
            auto tab = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 24) {
            auto tab = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 32) {
            auto tab = _mm_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else {
            // Generic shift-or rotation, (x >> ROT) | (x << (64 - ROT))
            return SIMD_2x64(_mm_or_si128(_mm_srli_epi64(m_simd, static_cast<int>(ROT)),
                                          _mm_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
      }
      template <size_t ROT>
      SIMD_2x64 rotl() const {
         return this->rotr<64 - ROT>();
      }
      static SIMD_2x64 mul2_32(SIMD_2x64 x, SIMD_2x64 y) {
         const __m128i m = _mm_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_2x64(_mm_add_epi64(m, m));
      }
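      // For reference: in each 64-bit lane mul2_32 computes
      // 2 * (x mod 2^32) * (y mod 2^32), since _mm_mul_epu32 multiplies the
      // low 32 bits of each lane into a full 64-bit product, which the add
      // then doubles. A scalar sketch of one lane (illustration only, not
      // part of this class):
      //
      //    uint64_t mul2_32(uint64_t x, uint64_t y) {
      //       return 2 * (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF);
      //    }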
      template <size_t T>
      BOTAN_FUNC_ISA("ssse3")
      static SIMD_2x64 alignr(SIMD_2x64 a, SIMD_2x64 b)
         requires(T > 0 && T < 16)
      {
         return SIMD_2x64(_mm_alignr_epi8(a.m_simd, b.m_simd, T));
      }
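      // twist() below rotates the B, C, and D register pairs so that the
      // diagonals of the 4x4 word matrix line up as columns: B moves by one
      // 64-bit word, C by two (a swap of C0/C1), and D by three. untwist()
      // undoes this, letting blamka_G serve for both the column and the
      // diagonal half of a round.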
      static void twist(SIMD_2x64& B0, SIMD_2x64& B1, SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
         SIMD_2x64 T0, T1;

         T0 = SIMD_2x64::alignr<8>(B1, B0);
         T1 = SIMD_2x64::alignr<8>(B0, B1);
         B0 = T0;
         B1 = T1;

         T0 = C0;
         C0 = C1;
         C1 = T0;

         T0 = SIMD_2x64::alignr<8>(D0, D1);
         T1 = SIMD_2x64::alignr<8>(D1, D0);
         D0 = T0;
         D1 = T1;
      }
      static void untwist(SIMD_2x64& B0, SIMD_2x64& B1, SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
         SIMD_2x64 T0, T1;

         T0 = SIMD_2x64::alignr<8>(B0, B1);
         T1 = SIMD_2x64::alignr<8>(B1, B0);
         B0 = T0;
         B1 = T1;

         T0 = C0;
         C0 = C1;
         C1 = T0;

         T0 = SIMD_2x64::alignr<8>(D1, D0);
         T1 = SIMD_2x64::alignr<8>(D0, D1);
         D0 = T0;
         D1 = T1;
      }
      explicit SIMD_2x64(__m128i x) : m_simd(x) {}

   private:
      __m128i m_simd;
};
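/*
* blamka_G is the BlaMka variant of BLAKE2b's G function, run on all four
* columns of the state at once (each of A/B/C/D is split across a pair of
* 2x64 registers, suffixed 0 and 1). Where BLAKE2b computes a = a + b,
* BlaMka computes a = a + b + 2 * lo32(a) * lo32(b); the rotation amounts
* 32, 24, 16, 63 are unchanged.
*/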
BOTAN_FUNC_ISA("ssse3")
BOTAN_FORCE_INLINE void blamka_G(
   SIMD_2x64& A0, SIMD_2x64& A1, SIMD_2x64& B0, SIMD_2x64& B1,
   SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
   A0 += B0 + SIMD_2x64::mul2_32(A0, B0);
   A1 += B1 + SIMD_2x64::mul2_32(A1, B1);
   D0 ^= A0;
   D1 ^= A1;
   D0 = D0.rotr<32>();
   D1 = D1.rotr<32>();

   C0 += D0 + SIMD_2x64::mul2_32(C0, D0);
   C1 += D1 + SIMD_2x64::mul2_32(C1, D1);
   B0 ^= C0;
   B1 ^= C1;
   B0 = B0.rotr<24>();
   B1 = B1.rotr<24>();

   A0 += B0 + SIMD_2x64::mul2_32(A0, B0);
   A1 += B1 + SIMD_2x64::mul2_32(A1, B1);
   D0 ^= A0;
   D1 ^= A1;
   D0 = D0.rotr<16>();
   D1 = D1.rotr<16>();

   C0 += D0 + SIMD_2x64::mul2_32(C0, D0);
   C1 += D1 + SIMD_2x64::mul2_32(C1, D1);
   B0 ^= C0;
   B1 ^= C1;
   B0 = B0.rotr<63>();
   B1 = B1.rotr<63>();
}
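/*
* One full BlaMka round: G on the columns, twist so the diagonals become
* columns, G again, then untwist to restore the layout.
*/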
BOTAN_FUNC_ISA("ssse3")
BOTAN_FORCE_INLINE void blamka_R(
   SIMD_2x64& A0, SIMD_2x64& A1, SIMD_2x64& B0, SIMD_2x64& B1,
   SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
   blamka_G(A0, A1, B0, B1, C0, C1, D0, D1);

   SIMD_2x64::twist(B0, B1, C0, C1, D0, D1);
   blamka_G(A0, A1, B0, B1, C0, C1, D0, D1);
   SIMD_2x64::untwist(B0, B1, C0, C1, D0, D1);
}

}  // anonymous namespace
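/*
* Applies the BlaMka permutation to the 1024-byte block N, using T as
* scratch: eight row rounds over the 8x8 matrix of 16-byte registers,
* eight column rounds, then the result is XORed back into N.
*/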
BOTAN_FUNC_ISA("ssse3")
void Argon2::blamka_ssse3(uint64_t N[128], uint64_t T[128]) {
   // Apply the round to each row, writing the result into T
   for(size_t i = 0; i != 8; ++i) {
      SIMD_2x64 Tv[8];
      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j] = SIMD_2x64::load_le(&N[16 * i + 4 * j]);
         Tv[2 * j + 1] = SIMD_2x64::load_le(&N[16 * i + 4 * j + 2]);
      }

      blamka_R(Tv[0], Tv[1], Tv[2], Tv[3], Tv[4], Tv[5], Tv[6], Tv[7]);

      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j].store_le(&T[16 * i + 4 * j]);
         Tv[2 * j + 1].store_le(&T[16 * i + 4 * j + 2]);
      }
   }
   // Then apply the round to each column, in place in T
   for(size_t i = 0; i != 8; ++i) {
      SIMD_2x64 Tv[8];
      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j] = SIMD_2x64::load_le(&T[2 * i + 32 * j]);
         Tv[2 * j + 1] = SIMD_2x64::load_le(&T[2 * i + 32 * j + 16]);
      }

      blamka_R(Tv[0], Tv[1], Tv[2], Tv[3], Tv[4], Tv[5], Tv[6], Tv[7]);

      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j].store_le(&T[2 * i + 32 * j]);
         Tv[2 * j + 1].store_le(&T[2 * i + 32 * j + 16]);
      }
   }
   // Finally fold the permuted block back into N
   for(size_t i = 0; i != 128 / 4; ++i) {
      SIMD_2x64 n0 = SIMD_2x64::load_le(&N[4 * i]);
      SIMD_2x64 n1 = SIMD_2x64::load_le(&N[4 * i + 2]);
      SIMD_2x64 t0 = SIMD_2x64::load_le(&T[4 * i]);
      SIMD_2x64 t1 = SIMD_2x64::load_le(&T[4 * i + 2]);

      n0 ^= t0;
      n1 ^= t1;

      n0.store_le(&N[4 * i]);
      n1.store_le(&N[4 * i + 2]);
   }
}

}  // namespace Botan
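/*
* For context: because the final loop folds the permuted block back into N,
* calling this with N = X ^ Y computes Argon2's compression function
* G(X, Y) = P(X ^ Y) ^ (X ^ Y) in place. A hypothetical caller (sketch
* only; names other than blamka_ssse3 are assumed):
*
*    uint64_t N[128];  // 1 KiB working block
*    uint64_t T[128];  // scratch space
*    for(size_t i = 0; i != 128; ++i)
*       N[i] = X[i] ^ Y[i];
*    argon2.blamka_ssse3(N, T);  // N now holds G(X, Y)
*/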