#include <botan/argon2.h>

#include <botan/compiler.h>
#include <tmmintrin.h>
#include <utility>

// Note: this translation unit must be compiled with SSSE3 enabled
// (e.g. -mssse3), since it uses _mm_shuffle_epi8 and _mm_alignr_epi8.

namespace Botan {

namespace {
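// Minimal wrapper around an SSE2 __m128i, treated as two independent 64-bit
// lanes. It provides only the operations the Argon2 compression function
// needs: load/store, add, xor, rotates, and the BlaMka multiply.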
class SIMD_2x64 final {
   public:
      SIMD_2x64& operator=(const SIMD_2x64& other) = default;
      SIMD_2x64(const SIMD_2x64& other) = default;

      SIMD_2x64& operator=(SIMD_2x64&& other) = default;
      SIMD_2x64(SIMD_2x64&& other) = default;

      ~SIMD_2x64() = default;

      // Zero-initialized by default
      SIMD_2x64() { m_simd = _mm_setzero_si128(); }
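      // Little-endian load/store of two 64-bit words via unaligned SSE2
      // moves; no byte swapping is needed since this code path only runs on
      // (little-endian) x86.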
      static SIMD_2x64 load_le(const void* in) {
         return SIMD_2x64(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
      }

      void store_le(uint64_t out[2]) const { this->store_le(reinterpret_cast<uint8_t*>(out)); }

      void store_le(uint8_t out[]) const { _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_simd); }
      SIMD_2x64 operator+(const SIMD_2x64& other) const {
         SIMD_2x64 retval(*this);
         retval += other;
         return retval;
      }

      SIMD_2x64 operator^(const SIMD_2x64& other) const {
         SIMD_2x64 retval(*this);
         retval ^= other;
         return retval;
      }

      void operator+=(const SIMD_2x64& other) { m_simd = _mm_add_epi64(m_simd, other.m_simd); }

      void operator^=(const SIMD_2x64& other) { m_simd = _mm_xor_si128(m_simd, other.m_simd); }
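      // Right-rotate each 64-bit lane by ROT bits. Rotations by 16, 24, and
      // 32 bits move whole bytes, so a single SSSE3 byte shuffle
      // (_mm_shuffle_epi8) suffices; other amounts fall back to two shifts
      // and an OR.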
      template <size_t ROT>
      SIMD_2x64 rotr() const
         requires(ROT > 0 && ROT < 64)
      {
         if constexpr(ROT == 16) {
            auto tab = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 24) {
            auto tab = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else if constexpr(ROT == 32) {
            auto tab = _mm_setr_epi8(4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11);
            return SIMD_2x64(_mm_shuffle_epi8(m_simd, tab));
         } else {
            return SIMD_2x64(_mm_or_si128(_mm_srli_epi64(m_simd, static_cast<int>(ROT)),
                                          _mm_slli_epi64(m_simd, static_cast<int>(64 - ROT))));
         }
      }
      template <size_t ROT>
      SIMD_2x64 rotl() const {
         return this->rotr<64 - ROT>();
      }
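      // The extra term used by Argon2's BlaMka variant of BLAKE2b's G
      // function: returns 2 * lo32(x) * lo32(y) in each 64-bit lane.
      // _mm_mul_epu32 multiplies the low 32 bits of each lane into a 64-bit
      // product, and adding it to itself doubles it.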
      static SIMD_2x64 mul2_32(SIMD_2x64 x, SIMD_2x64 y) {
         const __m128i m = _mm_mul_epu32(x.m_simd, y.m_simd);
         return SIMD_2x64(_mm_add_epi64(m, m));
      }
      template <size_t T>
      static SIMD_2x64 alignr(SIMD_2x64 a, SIMD_2x64 b)
         requires(T > 0 && T < 16)
      {
         return SIMD_2x64(_mm_alignr_epi8(a.m_simd, b.m_simd, T));
      }
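      // The Argon2 state for one application of the round function is a 4x4
      // matrix of 64-bit words held in eight registers (two per row).
      // twist() rotates the B, C and D rows so the matrix diagonals line up
      // as columns (the usual BLAKE2b diagonalization), and untwist() undoes
      // it; alignr<8>(a, b) combines the high half of b with the low half of a.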
      static void twist(SIMD_2x64& B0, SIMD_2x64& B1, SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
         SIMD_2x64 T0, T1;

         T0 = SIMD_2x64::alignr<8>(B1, B0);
         T1 = SIMD_2x64::alignr<8>(B0, B1);
         B0 = T0;
         B1 = T1;

         std::swap(C0, C1);

         T0 = SIMD_2x64::alignr<8>(D0, D1);
         T1 = SIMD_2x64::alignr<8>(D1, D0);
         D0 = T0;
         D1 = T1;
      }
      static void untwist(SIMD_2x64& B0, SIMD_2x64& B1, SIMD_2x64& C0, SIMD_2x64& C1, SIMD_2x64& D0, SIMD_2x64& D1) {
         SIMD_2x64 T0, T1;

         T0 = SIMD_2x64::alignr<8>(B0, B1);
         T1 = SIMD_2x64::alignr<8>(B1, B0);
         B0 = T0;
         B1 = T1;

         std::swap(C0, C1);

         T0 = SIMD_2x64::alignr<8>(D1, D0);
         T1 = SIMD_2x64::alignr<8>(D0, D1);
         D0 = T0;
         D1 = T1;
      }
      explicit SIMD_2x64(__m128i x) : m_simd(x) {}

   private:
      __m128i m_simd;
};
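// BlaMka G function: BLAKE2b's quarter-round with each addition a + b
// replaced by a + b + 2 * lo32(a) * lo32(b), applied to two columns (or
// diagonals) of the state at once. The rotation amounts 32, 24, 16 and 63
// are the BLAKE2b constants.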
BOTAN_FORCE_INLINE void blamka_G(SIMD_2x64& A0, SIMD_2x64& A1,
                                 SIMD_2x64& B0, SIMD_2x64& B1,
                                 SIMD_2x64& C0, SIMD_2x64& C1,
                                 SIMD_2x64& D0, SIMD_2x64& D1) {
   A0 += B0 + SIMD_2x64::mul2_32(A0, B0);
   A1 += B1 + SIMD_2x64::mul2_32(A1, B1);
   D0 ^= A0;
   D1 ^= A1;
   D0 = D0.rotr<32>();
   D1 = D1.rotr<32>();

   C0 += D0 + SIMD_2x64::mul2_32(C0, D0);
   C1 += D1 + SIMD_2x64::mul2_32(C1, D1);
   B0 ^= C0;
   B1 ^= C1;
   B0 = B0.rotr<24>();
   B1 = B1.rotr<24>();

   A0 += B0 + SIMD_2x64::mul2_32(A0, B0);
   A1 += B1 + SIMD_2x64::mul2_32(A1, B1);
   D0 ^= A0;
   D1 ^= A1;
   D0 = D0.rotr<16>();
   D1 = D1.rotr<16>();

   C0 += D0 + SIMD_2x64::mul2_32(C0, D0);
   C1 += D1 + SIMD_2x64::mul2_32(C1, D1);
   B0 ^= C0;
   B1 ^= C1;
   B0 = B0.rotr<63>();
   B1 = B1.rotr<63>();
}
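// One BlaMka round: G over the columns, twist so the diagonals become
// columns, G again, then untwist.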
BOTAN_FORCE_INLINE void blamka_R(SIMD_2x64& A0, SIMD_2x64& A1,
                                 SIMD_2x64& B0, SIMD_2x64& B1,
                                 SIMD_2x64& C0, SIMD_2x64& C1,
                                 SIMD_2x64& D0, SIMD_2x64& D1) {
   blamka_G(A0, A1, B0, B1, C0, C1, D0, D1);

   SIMD_2x64::twist(B0, B1, C0, C1, D0, D1);
   blamka_G(A0, A1, B0, B1, C0, C1, D0, D1);
   SIMD_2x64::untwist(B0, B1, C0, C1, D0, D1);
}

}  // namespace
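// Applies the Argon2 permutation to the 1024-byte block N, using T as
// scratch: blamka_R is run over each 16-word row of N, then over each pair
// of columns of the result, and the permuted block is finally XORed back
// into N. (In the Argon2 compression function the caller is expected to have
// set N to the XOR of the two input blocks beforehand.)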
void Argon2::blamka_ssse3(uint64_t N[128], uint64_t T[128]) {
   // Round over each of the 8 rows (16 words each)
   for(size_t i = 0; i != 8; ++i) {
      SIMD_2x64 Tv[8];
      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j] = SIMD_2x64::load_le(&N[16 * i + 4 * j]);
         Tv[2 * j + 1] = SIMD_2x64::load_le(&N[16 * i + 4 * j + 2]);
      }

      blamka_R(Tv[0], Tv[1], Tv[2], Tv[3], Tv[4], Tv[5], Tv[6], Tv[7]);

      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j].store_le(&T[16 * i + 4 * j]);
         Tv[2 * j + 1].store_le(&T[16 * i + 4 * j + 2]);
      }
   }

   // Round over each pair of columns
   for(size_t i = 0; i != 8; ++i) {
      SIMD_2x64 Tv[8];
      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j] = SIMD_2x64::load_le(&T[2 * i + 32 * j]);
         Tv[2 * j + 1] = SIMD_2x64::load_le(&T[2 * i + 32 * j + 16]);
      }

      blamka_R(Tv[0], Tv[1], Tv[2], Tv[3], Tv[4], Tv[5], Tv[6], Tv[7]);

      for(size_t j = 0; j != 4; ++j) {
         Tv[2 * j].store_le(&T[2 * i + 32 * j]);
         Tv[2 * j + 1].store_le(&T[2 * i + 32 * j + 16]);
      }
   }

   // XOR the permuted block back into N
   for(size_t i = 0; i != 128 / 4; ++i) {
      SIMD_2x64 n0 = SIMD_2x64::load_le(&N[4 * i]);
      SIMD_2x64 n1 = SIMD_2x64::load_le(&N[4 * i + 2]);
      SIMD_2x64 t0 = SIMD_2x64::load_le(&T[4 * i]);
      SIMD_2x64 t1 = SIMD_2x64::load_le(&T[4 * i + 2]);

      n0 ^= t0;
      n1 ^= t1;
      n0.store_le(&N[4 * i]);
      n1.store_le(&N[4 * i + 2]);
   }
}

}  // namespace Botan