Botan  2.4.0
Crypto and TLS for C++11
simd_32.h
Go to the documentation of this file.
1 /*
2 * Lightweight wrappers for SIMD operations
3 * (C) 2009,2011,2016,2017 Jack Lloyd
4 *
5 * Botan is released under the Simplified BSD License (see license.txt)
6 */
7 
8 #ifndef BOTAN_SIMD_32_H_
9 #define BOTAN_SIMD_32_H_
10 
11 #include <botan/types.h>
12 #include <botan/loadstor.h>
13 #include <botan/bswap.h>
14 #include <botan/cpuid.h>
15 
16 #if defined(BOTAN_TARGET_SUPPORTS_SSE2)
17  #include <emmintrin.h>
18  #define BOTAN_SIMD_USE_SSE2
19 
20 #elif defined(BOTAN_TARGET_SUPPORTS_ALTIVEC)
21  #include <altivec.h>
22  #undef vector
23  #undef bool
24  #define BOTAN_SIMD_USE_ALTIVEC
25 
26 #elif defined(BOTAN_TARGET_SUPPORTS_NEON)
27  #include <arm_neon.h>
28  #define BOTAN_SIMD_USE_NEON
29 #endif
30 
31 namespace Botan {
32 
33 /**
34 * 4x32 bit SIMD register
35 *
36 * This class is not a general purpose SIMD type, and only offers
37 * instructions needed for evaluation of specific crypto primitives.
38 * For example it does not currently have equality operators of any
39 * kind.
40 *
41 * Implemented for SSE2, VMX (Altivec), and NEON.
42 */
43 class SIMD_4x32 final
44  {
45  public:
46 
47  SIMD_4x32& operator=(const SIMD_4x32& other) = default;
48  SIMD_4x32(const SIMD_4x32& other) = default;
49 
50 #if !defined(BOTAN_BUILD_COMPILER_IS_MSVC_2013)
51  SIMD_4x32& operator=(SIMD_4x32&& other) = default;
52  SIMD_4x32(SIMD_4x32&& other) = default;
53 #endif
54 
55  /**
56  * Zero initialize SIMD register with 4 32-bit elements
57  */
58  SIMD_4x32() // zero initialized
59  {
60 #if defined(BOTAN_SIMD_USE_SSE2)
61  m_sse = _mm_setzero_si128();
62 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
63  m_vmx = vec_splat_u32(0);
64 #elif defined(BOTAN_SIMD_USE_NEON)
65  m_neon = vdupq_n_u32(0);
66 #else
67  m_scalar[0] = 0;
68  m_scalar[1] = 0;
69  m_scalar[2] = 0;
70  m_scalar[3] = 0;
71 #endif
72  }
73 
74  /**
75  * Load SIMD register with 4 32-bit elements
76  */
77  explicit SIMD_4x32(const uint32_t B[4])
78  {
79 #if defined(BOTAN_SIMD_USE_SSE2)
80  m_sse = _mm_loadu_si128(reinterpret_cast<const __m128i*>(B));
81 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
82  m_vmx = (__vector unsigned int){B[0], B[1], B[2], B[3]};
83 #elif defined(BOTAN_SIMD_USE_NEON)
84  m_neon = vld1q_u32(B);
85 #else
86  m_scalar[0] = B[0];
87  m_scalar[1] = B[1];
88  m_scalar[2] = B[2];
89  m_scalar[3] = B[3];
90 #endif
91  }
92 
93  /**
94  * Load SIMD register with 4 32-bit elements
95  */
96  SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
97  {
98 #if defined(BOTAN_SIMD_USE_SSE2)
99  m_sse = _mm_set_epi32(B3, B2, B1, B0);
100 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
101  m_vmx = (__vector unsigned int){B0, B1, B2, B3};
102 #elif defined(BOTAN_SIMD_USE_NEON)
103  // Better way to do this?
104  const uint32_t B[4] = { B0, B1, B2, B3 };
105  m_neon = vld1q_u32(B);
106 #else
107  m_scalar[0] = B0;
108  m_scalar[1] = B1;
109  m_scalar[2] = B2;
110  m_scalar[3] = B3;
111 #endif
112  }
113 
114  /**
115  * Load SIMD register with one 32-bit element repeated
116  */
117  static SIMD_4x32 splat(uint32_t B)
118  {
119 #if defined(BOTAN_SIMD_USE_SSE2)
120  return SIMD_4x32(_mm_set1_epi32(B));
121 #elif defined(BOTAN_SIMD_USE_ARM)
122  return SIMD_4x32(vdupq_n_u32(B));
123 #else
124  return SIMD_4x32(B, B, B, B);
125 #endif
126  }
127 
128  /**
129  * Load a SIMD register with little-endian convention
130  */
131  static SIMD_4x32 load_le(const void* in)
132  {
133 #if defined(BOTAN_SIMD_USE_SSE2)
134  return SIMD_4x32(_mm_loadu_si128(reinterpret_cast<const __m128i*>(in)));
135 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
136  const uint32_t* in_32 = static_cast<const uint32_t*>(in);
137 
138  __vector unsigned int R0 = vec_ld(0, in_32);
139  __vector unsigned int R1 = vec_ld(12, in_32);
140 
141  __vector unsigned char perm = vec_lvsl(0, in_32);
142 
144  {
145  perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector
146  }
147 
148  R0 = vec_perm(R0, R1, perm);
149 
150  return SIMD_4x32(R0);
151 #elif defined(BOTAN_SIMD_USE_NEON)
152 
153  uint32_t in32[4];
154  std::memcpy(in32, in, 16);
156  {
157  bswap_4(in32);
158  }
159  return SIMD_4x32(vld1q_u32(in32));
160 
161 #else
162  SIMD_4x32 out;
163  Botan::load_le(out.m_scalar, static_cast<const uint8_t*>(in), 4);
164  return out;
165 #endif
166  }
167 
168  /**
169  * Load a SIMD register with big-endian convention
170  */
171  static SIMD_4x32 load_be(const void* in)
172  {
173 #if defined(BOTAN_SIMD_USE_SSE2)
174 
175  return load_le(in).bswap();
176 
177 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
178 
179  const uint32_t* in_32 = static_cast<const uint32_t*>(in);
180  __vector unsigned int R0 = vec_ld(0, in_32);
181  __vector unsigned int R1 = vec_ld(12, in_32);
182  __vector unsigned char perm = vec_lvsl(0, in_32);
183 
185  {
186  perm = vec_xor(perm, vec_splat_u8(3)); // bswap vector
187  }
188 
189  R0 = vec_perm(R0, R1, perm);
190  return SIMD_4x32(R0);
191 
192 #elif defined(BOTAN_SIMD_USE_NEON)
193 
194  uint32_t in32[4];
195  std::memcpy(in32, in, 16);
197  {
198  bswap_4(in32);
199  }
200  return SIMD_4x32(vld1q_u32(in32));
201 
202 #else
203  SIMD_4x32 out;
204  Botan::load_be(out.m_scalar, static_cast<const uint8_t*>(in), 4);
205  return out;
206 #endif
207  }
208 
209  /**
210  * Load a SIMD register with little-endian convention
211  */
212  void store_le(uint8_t out[]) const
213  {
214 #if defined(BOTAN_SIMD_USE_SSE2)
215 
216  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), m_sse);
217 
218 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
219 
220  union {
221  __vector unsigned int V;
222  uint32_t R[4];
223  } vec;
224  vec.V = m_vmx;
225  Botan::store_le(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
226 
227 #elif defined(BOTAN_SIMD_USE_NEON)
228 
230  {
231  SIMD_4x32 swap = bswap();
232  swap.store_be(out);
233  }
234  else
235  {
236  uint32_t out32[4] = { 0 };
237  vst1q_u32(out32, m_neon);
238  copy_out_le(out, 16, out32);
239  }
240 #else
241  Botan::store_le(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
242 #endif
243  }
244 
245  /**
246  * Load a SIMD register with big-endian convention
247  */
248  void store_be(uint8_t out[]) const
249  {
250 #if defined(BOTAN_SIMD_USE_SSE2)
251 
252  bswap().store_le(out);
253 
254 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
255 
256  union {
257  __vector unsigned int V;
258  uint32_t R[4];
259  } vec;
260  vec.V = m_vmx;
261  Botan::store_be(out, vec.R[0], vec.R[1], vec.R[2], vec.R[3]);
262 
263 #elif defined(BOTAN_SIMD_USE_NEON)
264 
266  {
267  SIMD_4x32 swap = bswap();
268  swap.store_le(out);
269  }
270  else
271  {
272  uint32_t out32[4] = { 0 };
273  vst1q_u32(out32, m_neon);
274  copy_out_be(out, 16, out32);
275  }
276 
277 #else
278  Botan::store_be(out, m_scalar[0], m_scalar[1], m_scalar[2], m_scalar[3]);
279 #endif
280  }
281 
282 
283  /*
284  * This is used for SHA-2/SHACAL2
285  * Return rotr(ROT1) ^ rotr(ROT2) ^ rotr(ROT3)
286  */
287  template<size_t ROT1, size_t ROT2, size_t ROT3>
288  SIMD_4x32 rho() const
289  {
290  SIMD_4x32 res;
291 
292 #if defined(BOTAN_SIMD_USE_SSE2)
293 
294  res.m_sse = _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT1)),
295  _mm_srli_epi32(m_sse, static_cast<int>(ROT1)));
296  res.m_sse = _mm_xor_si128(
297  res.m_sse,
298  _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT2)),
299  _mm_srli_epi32(m_sse, static_cast<int>(ROT2))));
300  res.m_sse = _mm_xor_si128(
301  res.m_sse,
302  _mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(32-ROT3)),
303  _mm_srli_epi32(m_sse, static_cast<int>(ROT3))));
304 
305 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
306 
307  const unsigned int r1 = static_cast<unsigned int>(32-ROT1);
308  const unsigned int r2 = static_cast<unsigned int>(32-ROT2);
309  const unsigned int r3 = static_cast<unsigned int>(32-ROT3);
310  res.m_vmx = vec_rl(m_vmx, (__vector unsigned int){r1, r1, r1, r1});
311  res.m_vmx = vec_xor(res.m_vmx, vec_rl(m_vmx, (__vector unsigned int){r2, r2, r2, r2}));
312  res.m_vmx = vec_xor(res.m_vmx, vec_rl(m_vmx, (__vector unsigned int){r3, r3, r3, r3}));
313 
314 #elif defined(BOTAN_SIMD_USE_NEON)
315  res.m_neon = vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT1)),
316  vshrq_n_u32(m_neon, static_cast<int>(ROT1)));
317 
318  res.m_neon = veorq_u32(
319  res.m_neon,
320  vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT2)),
321  vshrq_n_u32(m_neon, static_cast<int>(ROT2))));
322 
323  res.m_neon = veorq_u32(
324  res.m_neon,
325  vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(32-ROT3)),
326  vshrq_n_u32(m_neon, static_cast<int>(ROT3))));
327 
328 #else
329 
330  for(size_t i = 0; i != 4; ++i)
331  {
332  res.m_scalar[i] = Botan::rotr<ROT1>(m_scalar[i]) ^
333  Botan::rotr<ROT2>(m_scalar[i]) ^
334  Botan::rotr<ROT3>(m_scalar[i]);
335  }
336 #endif
337 
338  return res;
339  }
340 
341  /**
342  * Left rotation by a compile time constant
343  */
344  template<size_t ROT>
345  SIMD_4x32 rotl() const
346  {
347  static_assert(ROT > 0 && ROT < 32, "Invalid rotation constant");
348 
349 #if defined(BOTAN_SIMD_USE_SSE2)
350 
351  return SIMD_4x32(_mm_or_si128(_mm_slli_epi32(m_sse, static_cast<int>(ROT)),
352  _mm_srli_epi32(m_sse, static_cast<int>(32-ROT))));
353 
354 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
355 
356  const unsigned int r = static_cast<unsigned int>(ROT);
357  return SIMD_4x32(vec_rl(m_vmx, (__vector unsigned int){r, r, r, r}));
358 
359 #elif defined(BOTAN_SIMD_USE_NEON)
360  return SIMD_4x32(vorrq_u32(vshlq_n_u32(m_neon, static_cast<int>(ROT)),
361  vshrq_n_u32(m_neon, static_cast<int>(32-ROT))));
362 
363 #else
364  return SIMD_4x32(Botan::rotl<ROT>(m_scalar[0]),
365  Botan::rotl<ROT>(m_scalar[1]),
366  Botan::rotl<ROT>(m_scalar[2]),
367  Botan::rotl<ROT>(m_scalar[3]));
368 #endif
369  }
370 
371  /**
372  * Right rotation by a compile time constant
373  */
374  template<size_t ROT>
375  SIMD_4x32 rotr() const
376  {
377  return this->rotl<32-ROT>();
378  }
379 
380  /**
381  * Add elements of a SIMD vector
382  */
383  SIMD_4x32 operator+(const SIMD_4x32& other) const
384  {
385  SIMD_4x32 retval(*this);
386  retval += other;
387  return retval;
388  }
389 
390  /**
391  * Subtract elements of a SIMD vector
392  */
393  SIMD_4x32 operator-(const SIMD_4x32& other) const
394  {
395  SIMD_4x32 retval(*this);
396  retval -= other;
397  return retval;
398  }
399 
400  /**
401  * XOR elements of a SIMD vector
402  */
403  SIMD_4x32 operator^(const SIMD_4x32& other) const
404  {
405  SIMD_4x32 retval(*this);
406  retval ^= other;
407  return retval;
408  }
409 
410  /**
411  * Binary OR elements of a SIMD vector
412  */
413  SIMD_4x32 operator|(const SIMD_4x32& other) const
414  {
415  SIMD_4x32 retval(*this);
416  retval |= other;
417  return retval;
418  }
419 
420  /**
421  * Binary AND elements of a SIMD vector
422  */
423  SIMD_4x32 operator&(const SIMD_4x32& other) const
424  {
425  SIMD_4x32 retval(*this);
426  retval &= other;
427  return retval;
428  }
429 
430  void operator+=(const SIMD_4x32& other)
431  {
432 #if defined(BOTAN_SIMD_USE_SSE2)
433  m_sse = _mm_add_epi32(m_sse, other.m_sse);
434 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
435  m_vmx = vec_add(m_vmx, other.m_vmx);
436 #elif defined(BOTAN_SIMD_USE_NEON)
437  m_neon = vaddq_u32(m_neon, other.m_neon);
438 #else
439  m_scalar[0] += other.m_scalar[0];
440  m_scalar[1] += other.m_scalar[1];
441  m_scalar[2] += other.m_scalar[2];
442  m_scalar[3] += other.m_scalar[3];
443 #endif
444  }
445 
446  void operator-=(const SIMD_4x32& other)
447  {
448 #if defined(BOTAN_SIMD_USE_SSE2)
449  m_sse = _mm_sub_epi32(m_sse, other.m_sse);
450 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
451  m_vmx = vec_sub(m_vmx, other.m_vmx);
452 #elif defined(BOTAN_SIMD_USE_NEON)
453  m_neon = vsubq_u32(m_neon, other.m_neon);
454 #else
455  m_scalar[0] -= other.m_scalar[0];
456  m_scalar[1] -= other.m_scalar[1];
457  m_scalar[2] -= other.m_scalar[2];
458  m_scalar[3] -= other.m_scalar[3];
459 #endif
460  }
461 
462  void operator^=(const SIMD_4x32& other)
463  {
464 #if defined(BOTAN_SIMD_USE_SSE2)
465  m_sse = _mm_xor_si128(m_sse, other.m_sse);
466 
467 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
468  m_vmx = vec_xor(m_vmx, other.m_vmx);
469 #elif defined(BOTAN_SIMD_USE_NEON)
470  m_neon = veorq_u32(m_neon, other.m_neon);
471 #else
472  m_scalar[0] ^= other.m_scalar[0];
473  m_scalar[1] ^= other.m_scalar[1];
474  m_scalar[2] ^= other.m_scalar[2];
475  m_scalar[3] ^= other.m_scalar[3];
476 #endif
477  }
478 
479  void operator|=(const SIMD_4x32& other)
480  {
481 #if defined(BOTAN_SIMD_USE_SSE2)
482  m_sse = _mm_or_si128(m_sse, other.m_sse);
483 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
484  m_vmx = vec_or(m_vmx, other.m_vmx);
485 #elif defined(BOTAN_SIMD_USE_NEON)
486  m_neon = vorrq_u32(m_neon, other.m_neon);
487 #else
488  m_scalar[0] |= other.m_scalar[0];
489  m_scalar[1] |= other.m_scalar[1];
490  m_scalar[2] |= other.m_scalar[2];
491  m_scalar[3] |= other.m_scalar[3];
492 #endif
493  }
494 
495  void operator&=(const SIMD_4x32& other)
496  {
497 #if defined(BOTAN_SIMD_USE_SSE2)
498  m_sse = _mm_and_si128(m_sse, other.m_sse);
499 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
500  m_vmx = vec_and(m_vmx, other.m_vmx);
501 #elif defined(BOTAN_SIMD_USE_NEON)
502  m_neon = vandq_u32(m_neon, other.m_neon);
503 #else
504  m_scalar[0] &= other.m_scalar[0];
505  m_scalar[1] &= other.m_scalar[1];
506  m_scalar[2] &= other.m_scalar[2];
507  m_scalar[3] &= other.m_scalar[3];
508 #endif
509  }
510 
511 
512  template<int SHIFT> SIMD_4x32 shl() const
513  {
514 #if defined(BOTAN_SIMD_USE_SSE2)
515  return SIMD_4x32(_mm_slli_epi32(m_sse, SHIFT));
516 
517 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
518  const unsigned int s = static_cast<unsigned int>(SHIFT);
519  return SIMD_4x32(vec_sl(m_vmx, (__vector unsigned int){s, s, s, s}));
520 #elif defined(BOTAN_SIMD_USE_NEON)
521  return SIMD_4x32(vshlq_n_u32(m_neon, SHIFT));
522 #else
523  return SIMD_4x32(m_scalar[0] << SHIFT,
524  m_scalar[1] << SHIFT,
525  m_scalar[2] << SHIFT,
526  m_scalar[3] << SHIFT);
527 #endif
528  }
529 
530  template<int SHIFT> SIMD_4x32 shr() const
531  {
532 #if defined(BOTAN_SIMD_USE_SSE2)
533  return SIMD_4x32(_mm_srli_epi32(m_sse, SHIFT));
534 
535 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
536  const unsigned int s = static_cast<unsigned int>(SHIFT);
537  return SIMD_4x32(vec_sr(m_vmx, (__vector unsigned int){s, s, s, s}));
538 #elif defined(BOTAN_SIMD_USE_NEON)
539  return SIMD_4x32(vshrq_n_u32(m_neon, SHIFT));
540 #else
541  return SIMD_4x32(m_scalar[0] >> SHIFT, m_scalar[1] >> SHIFT,
542  m_scalar[2] >> SHIFT, m_scalar[3] >> SHIFT);
543 
544 #endif
545  }
546 
548  {
549 #if defined(BOTAN_SIMD_USE_SSE2)
550  return SIMD_4x32(_mm_xor_si128(m_sse, _mm_set1_epi32(0xFFFFFFFF)));
551 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
552  return SIMD_4x32(vec_nor(m_vmx, m_vmx));
553 #elif defined(BOTAN_SIMD_USE_NEON)
554  return SIMD_4x32(vmvnq_u32(m_neon));
555 #else
556  return SIMD_4x32(~m_scalar[0], ~m_scalar[1], ~m_scalar[2], ~m_scalar[3]);
557 #endif
558  }
559 
560  // (~reg) & other
561  SIMD_4x32 andc(const SIMD_4x32& other) const
562  {
563 #if defined(BOTAN_SIMD_USE_SSE2)
564  return SIMD_4x32(_mm_andnot_si128(m_sse, other.m_sse));
565 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
566  /*
567  AltiVec does arg1 & ~arg2 rather than SSE's ~arg1 & arg2
568  so swap the arguments
569  */
570  return SIMD_4x32(vec_andc(other.m_vmx, m_vmx));
571 #elif defined(BOTAN_SIMD_USE_NEON)
572  // NEON is also a & ~b
573  return SIMD_4x32(vbicq_u32(other.m_neon, m_neon));
574 #else
575  return SIMD_4x32((~m_scalar[0]) & other.m_scalar[0],
576  (~m_scalar[1]) & other.m_scalar[1],
577  (~m_scalar[2]) & other.m_scalar[2],
578  (~m_scalar[3]) & other.m_scalar[3]);
579 #endif
580  }
581 
582  /**
583  * Return copy *this with each word byte swapped
584  */
585  SIMD_4x32 bswap() const
586  {
587 #if defined(BOTAN_SIMD_USE_SSE2)
588 
589  __m128i T = m_sse;
590  T = _mm_shufflehi_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
591  T = _mm_shufflelo_epi16(T, _MM_SHUFFLE(2, 3, 0, 1));
592  return SIMD_4x32(_mm_or_si128(_mm_srli_epi16(T, 8), _mm_slli_epi16(T, 8)));
593 
594 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
595 
596  __vector unsigned char perm = vec_lvsl(0, static_cast<uint32_t*>(nullptr));
597  perm = vec_xor(perm, vec_splat_u8(3));
598  return SIMD_4x32(vec_perm(m_vmx, m_vmx, perm));
599 
600 #elif defined(BOTAN_SIMD_USE_NEON)
601 
602  //return SIMD_4x32(vrev64q_u32(m_neon));
603 
604  // FIXME this is really slow
605  SIMD_4x32 ror8 = this->rotr<8>();
606  SIMD_4x32 rol8 = this->rotl<8>();
607 
608  const SIMD_4x32 mask1 = SIMD_4x32::splat(0xFF00FF00);
609  const SIMD_4x32 mask2 = SIMD_4x32::splat(0x00FF00FF);
610  return (ror8 & mask1) | (rol8 & mask2);
611 #else
612  // scalar
613  return SIMD_4x32(reverse_bytes(m_scalar[0]),
614  reverse_bytes(m_scalar[1]),
615  reverse_bytes(m_scalar[2]),
616  reverse_bytes(m_scalar[3]));
617 #endif
618  }
619 
620  /**
621  * 4x4 Transposition on SIMD registers
622  */
623  static void transpose(SIMD_4x32& B0, SIMD_4x32& B1,
624  SIMD_4x32& B2, SIMD_4x32& B3)
625  {
626 #if defined(BOTAN_SIMD_USE_SSE2)
627  const __m128i T0 = _mm_unpacklo_epi32(B0.m_sse, B1.m_sse);
628  const __m128i T1 = _mm_unpacklo_epi32(B2.m_sse, B3.m_sse);
629  const __m128i T2 = _mm_unpackhi_epi32(B0.m_sse, B1.m_sse);
630  const __m128i T3 = _mm_unpackhi_epi32(B2.m_sse, B3.m_sse);
631 
632  B0.m_sse = _mm_unpacklo_epi64(T0, T1);
633  B1.m_sse = _mm_unpackhi_epi64(T0, T1);
634  B2.m_sse = _mm_unpacklo_epi64(T2, T3);
635  B3.m_sse = _mm_unpackhi_epi64(T2, T3);
636 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
637  const __vector unsigned int T0 = vec_mergeh(B0.m_vmx, B2.m_vmx);
638  const __vector unsigned int T1 = vec_mergeh(B1.m_vmx, B3.m_vmx);
639  const __vector unsigned int T2 = vec_mergel(B0.m_vmx, B2.m_vmx);
640  const __vector unsigned int T3 = vec_mergel(B1.m_vmx, B3.m_vmx);
641 
642  B0.m_vmx = vec_mergeh(T0, T1);
643  B1.m_vmx = vec_mergel(T0, T1);
644  B2.m_vmx = vec_mergeh(T2, T3);
645  B3.m_vmx = vec_mergel(T2, T3);
646 #elif defined(BOTAN_SIMD_USE_NEON)
647 
648 #if defined(BOTAN_TARGET_ARCH_IS_ARM32)
649 
650  const uint32x4x2_t T0 = vzipq_u32(B0.m_neon, B2.m_neon);
651  const uint32x4x2_t T1 = vzipq_u32(B1.m_neon, B3.m_neon);
652  const uint32x4x2_t O0 = vzipq_u32(T0.val[0], T1.val[0]);
653  const uint32x4x2_t O1 = vzipq_u32(T0.val[1], T1.val[1]);
654 
655  B0.m_neon = O0.val[0];
656  B1.m_neon = O0.val[1];
657  B2.m_neon = O1.val[0];
658  B3.m_neon = O1.val[1];
659 
660 #elif defined(BOTAN_TARGET_ARCH_IS_ARM64)
661  const uint32x4_t T0 = vzip1q_u32(B0.m_neon, B2.m_neon);
662  const uint32x4_t T2 = vzip2q_u32(B0.m_neon, B2.m_neon);
663 
664  const uint32x4_t T1 = vzip1q_u32(B1.m_neon, B3.m_neon);
665  const uint32x4_t T3 = vzip2q_u32(B1.m_neon, B3.m_neon);
666 
667  B0.m_neon = vzip1q_u32(T0, T1);
668  B1.m_neon = vzip2q_u32(T0, T1);
669 
670  B2.m_neon = vzip1q_u32(T2, T3);
671  B3.m_neon = vzip2q_u32(T2, T3);
672 #endif
673 
674 #else
675  // scalar
676  SIMD_4x32 T0(B0.m_scalar[0], B1.m_scalar[0], B2.m_scalar[0], B3.m_scalar[0]);
677  SIMD_4x32 T1(B0.m_scalar[1], B1.m_scalar[1], B2.m_scalar[1], B3.m_scalar[1]);
678  SIMD_4x32 T2(B0.m_scalar[2], B1.m_scalar[2], B2.m_scalar[2], B3.m_scalar[2]);
679  SIMD_4x32 T3(B0.m_scalar[3], B1.m_scalar[3], B2.m_scalar[3], B3.m_scalar[3]);
680 
681  B0 = T0;
682  B1 = T1;
683  B2 = T2;
684  B3 = T3;
685 #endif
686  }
687 
688  private:
689 
690 #if defined(BOTAN_SIMD_USE_SSE2)
691  explicit SIMD_4x32(__m128i in) : m_sse(in) {}
692 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
693  explicit SIMD_4x32(__vector unsigned int in) : m_vmx(in) {}
694 #elif defined(BOTAN_SIMD_USE_NEON)
695  explicit SIMD_4x32(uint32x4_t in) : m_neon(in) {}
696 #endif
697 
698 #if defined(BOTAN_SIMD_USE_SSE2)
699  __m128i m_sse;
700 #elif defined(BOTAN_SIMD_USE_ALTIVEC)
701  __vector unsigned int m_vmx;
702 #elif defined(BOTAN_SIMD_USE_NEON)
703  uint32x4_t m_neon;
704 #else
705  uint32_t m_scalar[4];
706 #endif
707  };
708 
710 
711 }
712 
713 #endif
SIMD_4x32(const uint32_t B[4])
Definition: simd_32.h:77
SIMD_4x32 operator-(const SIMD_4x32 &other) const
Definition: simd_32.h:393
SIMD_4x32 operator|(const SIMD_4x32 &other) const
Definition: simd_32.h:413
SIMD_4x32 shl() const
Definition: simd_32.h:512
SIMD_4x32 rotr() const
Definition: simd_32.h:375
SIMD_4x32 bswap() const
Definition: simd_32.h:585
void operator &=(const SIMD_4x32 &other)
Definition: simd_32.h:495
SIMD_4x32 operator~() const
Definition: simd_32.h:547
void store_le(uint8_t out[]) const
Definition: simd_32.h:212
void store_be(uint16_t in, uint8_t out[2])
Definition: loadstor.h:434
void store_be(uint8_t out[]) const
Definition: simd_32.h:248
void copy_out_le(uint8_t out[], size_t out_bytes, const T in[])
Definition: loadstor.h:675
SIMD_4x32 andc(const SIMD_4x32 &other) const
Definition: simd_32.h:561
static SIMD_4x32 load_le(const void *in)
Definition: simd_32.h:131
SIMD_4x32 operator^(const SIMD_4x32 &other) const
Definition: simd_32.h:403
void operator^=(const SIMD_4x32 &other)
Definition: simd_32.h:462
static void transpose(SIMD_4x32 &B0, SIMD_4x32 &B1, SIMD_4x32 &B2, SIMD_4x32 &B3)
Definition: simd_32.h:623
void operator+=(const SIMD_4x32 &other)
Definition: simd_32.h:430
T load_be(const uint8_t in[], size_t off)
Definition: loadstor.h:105
static SIMD_4x32 load_be(const void *in)
Definition: simd_32.h:171
SIMD_4x32 operator &(const SIMD_4x32 &other) const
Definition: simd_32.h:423
SIMD_4x32 rho() const
Definition: simd_32.h:288
T load_le(const uint8_t in[], size_t off)
Definition: loadstor.h:121
static bool is_little_endian()
Definition: cpuid.h:75
Definition: alg_id.cpp:13
void bswap_4(T x[4])
Definition: bswap.h:89
uint16_t reverse_bytes(uint16_t val)
Definition: bswap.h:24
static SIMD_4x32 splat(uint32_t B)
Definition: simd_32.h:117
SIMD_4x32 SIMD_32
Definition: simd_32.h:709
SIMD_4x32 operator+(const SIMD_4x32 &other) const
Definition: simd_32.h:383
void operator|=(const SIMD_4x32 &other)
Definition: simd_32.h:479
void operator-=(const SIMD_4x32 &other)
Definition: simd_32.h:446
fe T
Definition: ge.cpp:37
SIMD_4x32 & operator=(const SIMD_4x32 &other)=default
SIMD_4x32(uint32_t B0, uint32_t B1, uint32_t B2, uint32_t B3)
Definition: simd_32.h:96
static bool is_big_endian()
Definition: cpuid.h:80
SIMD_4x32 rotl() const
Definition: simd_32.h:345
SIMD_4x32 shr() const
Definition: simd_32.h:530
void copy_out_be(uint8_t out[], size_t out_bytes, const T in[])
Definition: loadstor.h:654
void store_le(uint16_t in, uint8_t out[2])
Definition: loadstor.h:450