Botan  2.7.0
Crypto and TLS for C++11
simd_32.h
Go to the documentation of this file.
1 /*
2 * Lightweight wrappers for SIMD operations
3 * (C) 2009,2011,2016,2017 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #ifndef BOTAN_SIMD_32_H_
9 #define BOTAN_SIMD_32_H_
10 
11 #include <botan/types.h>
12 #include <botan/loadstor.h>
13 #include <botan/bswap.h>
14 #include <botan/cpuid.h>
15 
16 #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
17  #include <emmintrin.h>
18  #define BOTAN_SIMD_USE_SSE2
19 
20 #elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
21  #include <altivec.h>
22  #undef vector
23  #undef bool
24  #define BOTAN_SIMD_USE_ALTIVEC
25 
26 #elif defined(BOTAN_TARGET_SUPPORTS_NEON)
27  #include <arm_neon.h>
28  #define BOTAN_SIMD_USE_NEON
29 #endif
30 
31 namespace Botan {
32 
33 /**
34 * 4x32 bit SIMD register
35 *
36 * This class is not a general purpose SIMD type, and only offers
37 * instructions needed for evaluation of specific crypto primitives.
38 * For example it does not currently have equality operators of any
39 * kind.
40 *
41 * Implemented for SSE2, VMX (Altivec), and NEON.
42 */
43 class SIMD_4x32 final
44  {
45  public:
46 
47  SIMD_4x32& operator=(const SIMD_4x32& other) = default;
48  SIMD_4x32(const SIMD_4x32& other) = default;
49 
50 #if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013)
51  SIMD_4x32& operator=(SIMD_4x32&& other) = default;
52  SIMD_4x32(SIMD_4x32&& other) = default;
53 #endif
54 
55  /**
56  * Zero initialize SIMD register with 4 32-bit elements
57  */
58  SIMD_4x32() // zero initialized
59  {
60 #if defined(BOTAN_SIMD_USE_SSE2)
61  m_sse = _mm_setzero_si128();
62 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
63  m_vmx = vec_splat_u32(0);
64 #elif defined(BOTAN_SIMD_USE_NEON)
65  m_neon = vdupq_n_u32(0);
66 #else
67  m_scalar[0] = 0;
68  m_scalar[1] = 0;
69  m_scalar[2] = 0;
70  m_scalar[3] = 0;
71 #endif
72  }
73 
74  /**
75  * Load SIMD register with 4 32-bit elements
76  */
77  explicit SIMD_4x32(const uint32_t B[4])
78  {
79 #if defined(BOTAN_SIMD_USE_SSE2)
80  m_sse = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
81 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
82  m_vmx = (__vector unsigned int){B[0], B[1], B[2], B[3]};
83 #elif defined(BOTAN_SIMD_USE_NEON)
84  m_neon = vld1q_u32(B);
85 #else
86  m_scalar[0] = B[0];
87  m_scalar[1] = B[1];
88  m_scalar[2] = B[2];
89  m_scalar[3] = B[3];
90 #endif
91  }
92 
93  /**
94  * Load SIMD register with 4 32-bit elements
95  */
96  SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
97  {
98 #if defined(BOTAN_SIMD_USE_SSE2)
99  m_sse = _mm_set_epi32(B3, B2, B1, B0);
100 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
101  m_vmx = (__vector unsigned int){B0, B1, B2, B3};
102 #elif defined(BOTAN_SIMD_USE_NEON)
103  // Better way to do this?
104  const uint32_t B[4] = { B0, B1, B2, B3 };
105  m_neon = vld1q_u32(B);
106 #else
107  m_scalar[0] = B0;
108  m_scalar[1] = B1;
109  m_scalar[2] = B2;
110  m_scalar[3] = B3;
111 #endif
112  }
113 
114  /**
115  * Load SIMD register with one 32-bit element repeated
116  */
117  static SIMD_4x32 splat(uint32_t B)
118  {
119 #if defined(BOTAN_SIMD_USE_SSE2)
120  return SIMD_4x32(_mm_set1_epi32(B));
121 #elif defined(BOTAN_SIMD_USE_ARM)
122  return SIMD_4x32(vdupq_n_u32(B));
123 #else
124  return SIMD_4x32(B, B, B, B);
125 #endif
126  }
127 
128  /**
129  * Load a SIMD register with little-endian convention
130  */
131  static SIMD_4x32 load_le(const void* in)
132  {
133 #if defined(BOTAN_SIMD_USE_SSE2)
134  return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
135 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
136 
137  uint32_t R[4];
138  Botan::load_le(R, static_cast<const uint8_t*>(in), 4);
139  return SIMD_4x32(R);
140 
141 #elif defined(BOTAN_SIMD_USE_NEON)
142 
143  uint32_t in32[4];
144  std::memcpy(in32, in, 16);
146  {
147  bswap_4(in32);
148  }
149  return SIMD_4x32(vld1q_u32(in32));
150 
151 #else
152  SIMD_4x32 out;
153  Botan::load_le(out.m_scalar, static_cast<const uint8_t*>(in), 4);
154  return out;
155 #endif
156  }
157 
158  /**
159  * Load a SIMD register with big-endian convention
160  */
161  static SIMD_4x32 load_be(const void* in)
162  {
163 #if defined(BOTAN_SIMD_USE_SSE2)
164 
165  return load_le(in).bswap();
166 
167 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
168 
169  uint32_t R[4];
170  Botan::load_be(R, static_cast<const uint8_t*>(in), 4);
171  return SIMD_4x32(R);
172 
173 #elif defined(BOTAN_SIMD_USE_NEON)
174 
175  uint32_t in32[4];
176  std::memcpy(in32, in, 16);
178  {
179  bswap_4(in32);
180  }
181  return SIMD_4x32(vld1q_u32(in32));
182 
183 #else
184  SIMD_4x32 out;
185  Botan::load_be(out.m_scalar, static_cast<const uint8_t*>(in), 4);
186  return out;
187 #endif
188  }
189 
190  /**
191  * Load a SIMD register with little-endian convention
192  */
193  void store_le(uint8_t out[]) const
194  {
195 #if defined(BOTAN_SIMD_USE_SSE2)
196 
197  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_sse);
198 
199 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
200 
201  union {
202  __vector unsigned int V;
203  uint32_t R[4];
204  } vec;
205  vec.V = m_vmx;
206  Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
207 
208 #elif defined(BOTAN_SIMD_USE_NEON)
209 
211  {
212  SIMD_4x32 swap = bswap();
213  swap.store_be(out);
214  }
215  else
216  {
217  uint32_t out32[4] = { 0 };
218  vst1q_u32(out32, m_neon);
219  copy_out_le(out, 16, out32);
220  }
221 #else
222  Botan::store_le(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
223 #endif
224  }
225 
226  /**
227  * Load a SIMD register with big-endian convention
228  */
229  void store_be(uint8_t out[]) const
230  {
231 #if defined(BOTAN_SIMD_USE_SSE2)
232 
233  bswap().store_le(out);
234 
235 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
236 
237  union {
238  __vector unsigned int V;
239  uint32_t R[4];
240  } vec;
241  vec.V = m_vmx;
242  Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
243 
244 #elif defined(BOTAN_SIMD_USE_NEON)
245 
247  {
248  SIMD_4x32 swap = bswap();
249  swap.store_le(out);
250  }
251  else
252  {
253  uint32_t out32[4] = { 0 };
254  vst1q_u32(out32, m_neon);
255  copy_out_be(out, 16, out32);
256  }
257 
258 #else
259  Botan::store_be(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
260 #endif
261  }
262 
263 
264  /*
265  * This is used for SHA-2/SHACAL2
266  * Return rotr(ROT1) ^ rotr(ROT2) ^ rotr(ROT3)
267  */
268  template<size_t ROT1, size_t ROT2, size_t ROT3>
269  SIMD_4x32 rho() const
270  {
271  SIMD_4x32 res;
272 
273 #if defined(BOTAN_SIMD_USE_SSE2)
274 
275  res.m_sse = _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT1)),
276  _mm_srli_epi32(m_sse, static_cast<int>(ROT1)));
277  res.m_sse = _mm_xor_si128(
278  res.m_sse,
279  _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT2)),
280  _mm_srli_epi32(m_sse, static_cast<int>(ROT2))));
281  res.m_sse = _mm_xor_si128(
282  res.m_sse,
283  _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT3)),
284  _mm_srli_epi32(m_sse, static_cast<int>(ROT3))));
285 
286 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
287 
288  const unsigned int r1 = static_cast<unsigned int>(32-ROT1);
289  const unsigned int r2 = static_cast<unsigned int>(32-ROT2);
290  const unsigned int r3 = static_cast<unsigned int>(32-ROT3);
291  res.m_vmx = vec_rl(m_vmx, (__vector unsigned int){r1, r1, r1, r1});
292  res.m_vmx = vec_xor(res.m_vmx, vec_rl(m_vmx, (__vector unsigned int){r2, r2, r2, r2}));
293  res.m_vmx = vec_xor(res.m_vmx, vec_rl(m_vmx, (__vector unsigned int){r3, r3, r3, r3}));
294 
295 #elif defined(BOTAN_SIMD_USE_NEON)
296  res.m_neon = vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT1)),
297  vshrq_n_u32(m_neon, static_cast<int>(ROT1)));
298 
299  res.m_neon = veorq_u32(
300  res.m_neon,
301  vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT2)),
302  vshrq_n_u32(m_neon, static_cast<int>(ROT2))));
303 
304  res.m_neon = veorq_u32(
305  res.m_neon,
306  vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT3)),
307  vshrq_n_u32(m_neon, static_cast<int>(ROT3))));
308 
309 #else
310 
311  for(size_t i = 0; i != 4; ++i)
312  {
313  res.m_scalar[i] = Botan::rotr<ROT1>(m_scalar[i]) ^
314  Botan::rotr<ROT2>(m_scalar[i]) ^
315  Botan::rotr<ROT3>(m_scalar[i]);
316  }
317 #endif
318 
319  return res;
320  }
321 
322  /**
323  * Left rotation by a compile time constant
324  */
325  template<size_t ROT>
326  SIMD_4x32 rotl() const
327  {
328  static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
329 
330 #if defined(BOTAN_SIMD_USE_SSE2)
331 
332  return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(ROT)),
333  _mm_srli_epi32(m_sse, static_cast<int>(32-ROT))));
334 
335 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
336 
337  const unsigned int r = static_cast<unsigned int>(ROT);
338  return SIMD_4x32(vec_rl(m_vmx, (__vector unsigned int){r, r, r, r}));
339 
340 #elif defined(BOTAN_SIMD_USE_NEON)
341  return SIMD_4x32(vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(ROT)),
342  vshrq_n_u32(m_neon, static_cast<int>(32-ROT))));
343 
344 #else
345  return SIMD_4x32(Botan::rotl<ROT>(m_scalar[0]),
346  Botan::rotl<ROT>(m_scalar[1]),
347  Botan::rotl<ROT>(m_scalar[2]),
348  Botan::rotl<ROT>(m_scalar[3]));
349 #endif
350  }
351 
352  /**
353  * Right rotation by a compile time constant
354  */
355  template<size_t ROT>
356  SIMD_4x32 rotr() const
357  {
358  return this->rotl<32-ROT>();
359  }
360 
361  /**
362  * Add elements of a SIMD vector
363  */
364  SIMD_4x32 operator+(const SIMD_4x32& other) const
365  {
366  SIMD_4x32 retval(*this);
367  retval += other;
368  return retval;
369  }
370 
371  /**
372  * Subtract elements of a SIMD vector
373  */
374  SIMD_4x32 operator-(const SIMD_4x32& other) const
375  {
376  SIMD_4x32 retval(*this);
377  retval -= other;
378  return retval;
379  }
380 
381  /**
382  * XOR elements of a SIMD vector
383  */
384  SIMD_4x32 operator^(const SIMD_4x32& other) const
385  {
386  SIMD_4x32 retval(*this);
387  retval ^= other;
388  return retval;
389  }
390 
391  /**
392  * Binary OR elements of a SIMD vector
393  */
394  SIMD_4x32 operator|(const SIMD_4x32& other) const
395  {
396  SIMD_4x32 retval(*this);
397  retval |= other;
398  return retval;
399  }
400 
401  /**
402  * Binary AND elements of a SIMD vector
403  */
404  SIMD_4x32 operator&(const SIMD_4x32& other) const
405  {
406  SIMD_4x32 retval(*this);
407  retval &= other;
408  return retval;
409  }
410 
411  void operator+=(const SIMD_4x32& other)
412  {
413 #if defined(BOTAN_SIMD_USE_SSE2)
414  m_sse = _mm_add_epi32(m_sse, other.m_sse);
415 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
416  m_vmx = vec_add(m_vmx, other.m_vmx);
417 #elif defined(BOTAN_SIMD_USE_NEON)
418  m_neon = vaddq_u32(m_neon, other.m_neon);
419 #else
420  m_scalar[0] += other.m_scalar[0];
421  m_scalar[1] += other.m_scalar[1];
422  m_scalar[2] += other.m_scalar[2];
423  m_scalar[3] += other.m_scalar[3];
424 #endif
425  }
426 
427  void operator-=(const SIMD_4x32& other)
428  {
429 #if defined(BOTAN_SIMD_USE_SSE2)
430  m_sse = _mm_sub_epi32(m_sse, other.m_sse);
431 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
432  m_vmx = vec_sub(m_vmx, other.m_vmx);
433 #elif defined(BOTAN_SIMD_USE_NEON)
434  m_neon = vsubq_u32(m_neon, other.m_neon);
435 #else
436  m_scalar[0] -= other.m_scalar[0];
437  m_scalar[1] -= other.m_scalar[1];
438  m_scalar[2] -= other.m_scalar[2];
439  m_scalar[3] -= other.m_scalar[3];
440 #endif
441  }
442 
443  void operator^=(const SIMD_4x32& other)
444  {
445 #if defined(BOTAN_SIMD_USE_SSE2)
446  m_sse = _mm_xor_si128(m_sse, other.m_sse);
447 
448 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
449  m_vmx = vec_xor(m_vmx, other.m_vmx);
450 #elif defined(BOTAN_SIMD_USE_NEON)
451  m_neon = veorq_u32(m_neon, other.m_neon);
452 #else
453  m_scalar[0] ^= other.m_scalar[0];
454  m_scalar[1] ^= other.m_scalar[1];
455  m_scalar[2] ^= other.m_scalar[2];
456  m_scalar[3] ^= other.m_scalar[3];
457 #endif
458  }
459 
460  void operator|=(const SIMD_4x32& other)
461  {
462 #if defined(BOTAN_SIMD_USE_SSE2)
463  m_sse = _mm_or_si128(m_sse, other.m_sse);
464 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
465  m_vmx = vec_or(m_vmx, other.m_vmx);
466 #elif defined(BOTAN_SIMD_USE_NEON)
467  m_neon = vorrq_u32(m_neon, other.m_neon);
468 #else
469  m_scalar[0] |= other.m_scalar[0];
470  m_scalar[1] |= other.m_scalar[1];
471  m_scalar[2] |= other.m_scalar[2];
472  m_scalar[3] |= other.m_scalar[3];
473 #endif
474  }
475 
476  void operator&=(const SIMD_4x32& other)
477  {
478 #if defined(BOTAN_SIMD_USE_SSE2)
479  m_sse = _mm_and_si128(m_sse, other.m_sse);
480 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
481  m_vmx = vec_and(m_vmx, other.m_vmx);
482 #elif defined(BOTAN_SIMD_USE_NEON)
483  m_neon = vandq_u32(m_neon, other.m_neon);
484 #else
485  m_scalar[0] &= other.m_scalar[0];
486  m_scalar[1] &= other.m_scalar[1];
487  m_scalar[2] &= other.m_scalar[2];
488  m_scalar[3] &= other.m_scalar[3];
489 #endif
490  }
491 
492 
493  template<int SHIFT> SIMD_4x32 shl() const
494  {
495 #if defined(BOTAN_SIMD_USE_SSE2)
496  return SIMD_4x32(_mm_slli_epi32(m_sse, SHIFT));
497 
498 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
499  const unsigned int s = static_cast<unsigned int>(SHIFT);
500  return SIMD_4x32(vec_sl(m_vmx, (__vector unsigned int){s, s, s, s}));
501 #elif defined(BOTAN_SIMD_USE_NEON)
502  return SIMD_4x32(vshlq_n_u32(m_neon, SHIFT));
503 #else
504  return SIMD_4x32(m_scalar[0] << SHIFT,
505  m_scalar[1] << SHIFT,
506  m_scalar[2] << SHIFT,
507  m_scalar[3] << SHIFT);
508 #endif
509  }
510 
511  template<int SHIFT> SIMD_4x32 shr() const
512  {
513 #if defined(BOTAN_SIMD_USE_SSE2)
514  return SIMD_4x32(_mm_srli_epi32(m_sse, SHIFT));
515 
516 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
517  const unsigned int s = static_cast<unsigned int>(SHIFT);
518  return SIMD_4x32(vec_sr(m_vmx, (__vector unsigned int){s, s, s, s}));
519 #elif defined(BOTAN_SIMD_USE_NEON)
520  return SIMD_4x32(vshrq_n_u32(m_neon, SHIFT));
521 #else
522  return SIMD_4x32(m_scalar[0] >> SHIFT, m_scalar[1] >> SHIFT,
523  m_scalar[2] >> SHIFT, m_scalar[3] >> SHIFT);
524 
525 #endif
526  }
527 
529  {
530 #if defined(BOTAN_SIMD_USE_SSE2)
531  return SIMD_4x32(_mm_xor_si128(m_sse, _mm_set1_epi32(0xFFFFFFFF)));
532 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
533  return SIMD_4x32(vec_nor(m_vmx, m_vmx));
534 #elif defined(BOTAN_SIMD_USE_NEON)
535  return SIMD_4x32(vmvnq_u32(m_neon));
536 #else
537  return SIMD_4x32(~m_scalar[0], ~m_scalar[1], ~m_scalar[2], ~m_scalar[3]);
538 #endif
539  }
540 
541  // (~reg) & other
542  SIMD_4x32 andc(const SIMD_4x32& other) const
543  {
544 #if defined(BOTAN_SIMD_USE_SSE2)
545  return SIMD_4x32(_mm_andnot_si128(m_sse, other.m_sse));
546 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
547  /*
548  AltiVec does arg1 & ~arg2 rather than SSE's ~arg1 & arg2
549  so swap the arguments
550  */
551  return SIMD_4x32(vec_andc(other.m_vmx, m_vmx));
552 #elif defined(BOTAN_SIMD_USE_NEON)
553  // NEON is also a & ~b
554  return SIMD_4x32(vbicq_u32(other.m_neon, m_neon));
555 #else
556  return SIMD_4x32((~m_scalar[0]) & other.m_scalar[0],
557  (~m_scalar[1]) & other.m_scalar[1],
558  (~m_scalar[2]) & other.m_scalar[2],
559  (~m_scalar[3]) & other.m_scalar[3]);
560 #endif
561  }
562 
563  /**
564  * Return copy *this with each word byte swapped
565  */
566  SIMD_4x32 bswap() const
567  {
568 #if defined(BOTAN_SIMD_USE_SSE2)
569 
570  __m128i T = m_sse;
571  T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
572  T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
573  return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)));
574 
575 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
576 
577  union {
578  __vector unsigned int V;
579  uint32_t R[4];
580  } vec;
581 
582  vec.V = m_vmx;
583  bswap_4(vec.R);
584  return SIMD_4x32(vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
585 
586 #elif defined(BOTAN_SIMD_USE_NEON)
587 
588  //return SIMD_4x32(vrev64q_u32(m_neon));
589 
590  // FIXME this is really slow
591  SIMD_4x32 ror8 = this->rotr<8>();
592  SIMD_4x32 rol8 = this->rotl<8>();
593 
594  const SIMD_4x32 mask1 = SIMD_4x32::splat(0xFF00FF00);
595  const SIMD_4x32 mask2 = SIMD_4x32::splat(0x00FF00FF);
596  return (ror8 & mask1) | (rol8 & mask2);
597 #else
598  // scalar
599  return SIMD_4x32(reverse_bytes(m_scalar[0]),
600  reverse_bytes(m_scalar[1]),
601  reverse_bytes(m_scalar[2]),
602  reverse_bytes(m_scalar[3]));
603 #endif
604  }
605 
606  /**
607  * 4x4 Transposition on SIMD registers
608  */
609  static void transpose(SIMD_4x32& B0, SIMD_4x32& B1,
610  SIMD_4x32& B2, SIMD_4x32& B3)
611  {
612 #if defined(BOTAN_SIMD_USE_SSE2)
613  const __m128i T0 = _mm_unpacklo_epi32(B0.m_sse, B1.m_sse);
614  const __m128i T1 = _mm_unpacklo_epi32(B2.m_sse, B3.m_sse);
615  const __m128i T2 = _mm_unpackhi_epi32(B0.m_sse, B1.m_sse);
616  const __m128i T3 = _mm_unpackhi_epi32(B2.m_sse, B3.m_sse);
617 
618  B0.m_sse = _mm_unpacklo_epi64(T0, T1);
619  B1.m_sse = _mm_unpackhi_epi64(T0, T1);
620  B2.m_sse = _mm_unpacklo_epi64(T2, T3);
621  B3.m_sse = _mm_unpackhi_epi64(T2, T3);
622 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
623  const __vector unsigned int T0 = vec_mergeh(B0.m_vmx, B2.m_vmx);
624  const __vector unsigned int T1 = vec_mergeh(B1.m_vmx, B3.m_vmx);
625  const __vector unsigned int T2 = vec_mergel(B0.m_vmx, B2.m_vmx);
626  const __vector unsigned int T3 = vec_mergel(B1.m_vmx, B3.m_vmx);
627 
628  B0.m_vmx = vec_mergeh(T0, T1);
629  B1.m_vmx = vec_mergel(T0, T1);
630  B2.m_vmx = vec_mergeh(T2, T3);
631  B3.m_vmx = vec_mergel(T2, T3);
632 #elif defined(BOTAN_SIMD_USE_NEON)
633 
634 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
635 
636  const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon);
637  const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon);
638  const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
639  const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
640 
641  B0.m_neon = O0.val[0];
642  B1.m_neon = O0.val[1];
643  B2.m_neon = O1.val[0];
644  B3.m_neon = O1.val[1];
645 
646 #elif defined(BOTAN_TARGET_ARCH_IS_ARM64)
647  const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon);
648  const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon);
649 
650  const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon);
651  const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon);
652 
653  B0.m_neon = vzip1q_u32(T0, T1);
654  B1.m_neon = vzip2q_u32(T0, T1);
655 
656  B2.m_neon = vzip1q_u32(T2, T3);
657  B3.m_neon = vzip2q_u32(T2, T3);
658 #endif
659 
660 #else
661  // scalar
662  SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]);
663  SIMD_4x32 T1(B0.m_scalar[1], B1.m_scalar[1], B2.m_scalar[1], B3.m_scalar[1]);
664  SIMD_4x32 T2(B0.m_scalar[2], B1.m_scalar[2], B2.m_scalar[2], B3.m_scalar[2]);
665  SIMD_4x32 T3(B0.m_scalar[3], B1.m_scalar[3], B2.m_scalar[3], B3.m_scalar[3]);
666 
667  B0 = T0;
668  B1 = T1;
669  B2 = T2;
670  B3 = T3;
671 #endif
672  }
673 
674  private:
675 
676 #if defined(BOTAN_SIMD_USE_SSE2)
677  explicit SIMD_4x32(__m128i in) : m_sse(in) {}
678 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
679  explicit SIMD_4x32(__vector unsigned int in) : m_vmx(in) {}
680 #elif defined(BOTAN_SIMD_USE_NEON)
681  explicit SIMD_4x32(uint32x4_t in) : m_neon(in) {}
682 #endif
683 
684 #if defined(BOTAN_SIMD_USE_SSE2)
685  __m128i m_sse;
686 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
687  __vector unsigned int m_vmx;
688 #elif defined(BOTAN_SIMD_USE_NEON)
689  uint32x4_t m_neon;
690 #else
691  uint32_t m_scalar[4];
692 #endif
693  };
694 
696 
697 }
698 
699 #endif
SIMD_4x32(const uint32_t B[4])
Definition: simd_32.h:77
SIMD_4x32 operator-(const SIMD_4x32 &other) const
Definition: simd_32.h:374
SIMD_4x32 operator|(const SIMD_4x32 &other) const
Definition: simd_32.h:394
SIMD_4x32 shl() const
Definition: simd_32.h:493
SIMD_4x32 rotr() const
Definition: simd_32.h:356
SIMD_4x32 bswap() const
Definition: simd_32.h:566
void operator&=(const SIMD_4x32 &other)
Definition: simd_32.h:476
SIMD_4x32 operator~() const
Definition: simd_32.h:528
void store_le(uint8_t out[]) const
Definition: simd_32.h:193
void store_be(uint16_t in, uint8_t out[2])
Definition: loadstor.h:434
void store_be(uint8_t out[]) const
Definition: simd_32.h:229
void copy_out_le(uint8_t out[], size_t out_bytes, const T in[])
Definition: loadstor.h:675
SIMD_4x32 andc(const SIMD_4x32 &other) const
Definition: simd_32.h:542
static SIMD_4x32 load_le(const void *in)
Definition: simd_32.h:131
SIMD_4x32 operator^(const SIMD_4x32 &other) const
Definition: simd_32.h:384
void operator^=(const SIMD_4x32 &other)
Definition: simd_32.h:443
static void transpose(SIMD_4x32 &B0, SIMD_4x32 &B1, SIMD_4x32 &B2, SIMD_4x32 &B3)
Definition: simd_32.h:609
void operator+=(const SIMD_4x32 &other)
Definition: simd_32.h:411
T load_be(const uint8_t in[], size_t off)
Definition: loadstor.h:105
static SIMD_4x32 load_be(const void *in)
Definition: simd_32.h:161
SIMD_4x32 operator&(const SIMD_4x32 &other) const
Definition: simd_32.h:404
SIMD_4x32 rho() const
Definition: simd_32.h:269
T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:121
static bool is_little_endian()
Definition: cpuid.h:75
Definition: alg_id.cpp:13
void bswap_4(T x[4])
Definition: bswap.h:89
uint16_t reverse_bytes(uint16_t val)
Definition: bswap.h:24
static SIMD_4x32 splat(uint32_t B)
Definition: simd_32.h:117
SIMD_4x32 SIMD_32
Definition: simd_32.h:695
SIMD_4x32 operator+(const SIMD_4x32 &other) const
Definition: simd_32.h:364
void operator|=(const SIMD_4x32 &other)
Definition: simd_32.h:460
void operator-=(const SIMD_4x32 &other)
Definition: simd_32.h:427
fe T
Definition: ge.cpp:37
SIMD_4x32 & operator=(const SIMD_4x32 &other)=default
SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
Definition: simd_32.h:96
static bool is_big_endian()
Definition: cpuid.h:80
SIMD_4x32 rotl() const
Definition: simd_32.h:326
SIMD_4x32 shr() const
Definition: simd_32.h:511
void copy_out_be(uint8_t out[], size_t out_bytes, const T in[])
Definition: loadstor.h:654
void store_le(uint16_t in, uint8_t out[2])
Definition: loadstor.h:450