Crypto++  8.3
Free C++ class library of cryptographic schemes
ppc_simd.h
Go to the documentation of this file.
1 // ppc_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file ppc_simd.h
4 /// \brief Support functions for PowerPC and vector operations
5 /// \details This header provides an agnostic interface into Clang, GCC
6 /// and IBM XL C/C++ compilers modulo their different built-in functions
7 /// for accessing vector instructions.
8 /// \details The abstractions are necessary to support back to GCC 4.8 and
9 /// XLC 11 and 12. GCC 4.8 and 4.9 are still popular, and they are the
10 /// default compiler for GCC112, GCC119 and others on the compile farm.
11 /// Older IBM XL C/C++ compilers also have the need due to lack of
12 /// <tt>vec_xl</tt> and <tt>vec_xst</tt> support on some platforms. Modern
13 /// compilers provide best support and don't need many of the hacks
14 /// below.
15 /// \details The library is tested with the following PowerPC machines and
16 /// compilers. GCC110, GCC111, GCC112, GCC119 and GCC135 are provided by
17 /// the <A HREF="https://cfarm.tetaneutral.net/">GCC Compile Farm</A>
18 /// - PowerMac G5, OSX 10.5, POWER4, Apple GCC 4.0
19 /// - PowerMac G5, OSX 10.5, POWER4, Macports GCC 5.0
20 /// - GCC110, Linux, POWER7, GCC 4.8.5
21 /// - GCC110, Linux, POWER7, XLC 12.01
22 /// - GCC111, AIX, POWER7, GCC 4.8.1
23 /// - GCC111, AIX, POWER7, XLC 12.01
24 /// - GCC112, Linux, POWER8, GCC 4.8.5
25 /// - GCC112, Linux, POWER8, XLC 13.01
26 /// - GCC112, Linux, POWER8, Clang 7.0
27 /// - GCC119, AIX, POWER8, GCC 7.2.0
28 /// - GCC119, AIX, POWER8, XLC 13.01
29 /// - GCC135, Linux, POWER9, GCC 7.0
30 /// \details 12 machines are used for testing because the three compilers form
31 /// five or six profiles. The profiles are listed below.
32 /// - GCC (Linux GCC, Macports GCC, etc. Consistent across machines)
33 /// - XLC 13.0 and earlier (all IBM components)
34 /// - XLC 13.1 and later on Linux (LLVM front-end, no compatibility macros)
35 /// - XLC 13.1 and later on Linux (LLVM front-end, -qxlcompatmacros option)
36 /// - early LLVM Clang (traditional Clang compiler)
37 /// - late LLVM Clang (traditional Clang compiler)
38 /// \details The LLVM front-end makes it tricky to write portable code because
39 /// LLVM pretends to be other compilers but cannot consume other compiler's
40 /// builtins. When using XLC with -qxlcompatmacros the compiler pretends to
41 /// be GCC, Clang and XLC all at once but it can only consume its variety
42 /// of builtins.
43 /// \details At Crypto++ 8.0 the various <tt>Vector{FuncName}</tt> were
44 /// renamed to <tt>Vec{FuncName}</tt>. For example, <tt>VectorAnd</tt> was
45 /// changed to <tt>VecAnd</tt>. The name change helped consolidate two
46 /// slightly different implementations.
47 /// \details At Crypto++ 8.3 the library added select 64-bit functions for
48 /// 32-bit Altivec. For example, <tt>VecAdd64</tt> and <tt>VecSub64</tt>
49 /// take 32-bit vectors and add or subtract them as if they were vectors
50 /// with two 64-bit elements. The functions dramatically improve performance
51 /// for some algorithms on some platforms, like SIMON128 and SPECK128 on
52 /// Power6 and earlier. For example, SPECK128 improved from 70 cpb to
53 /// 10 cpb on an old PowerMac. Use the functions like shown below.
54 /// <pre>
55 /// \#if defined(_ARCH_PWR8)
56 /// \# define speck128_t uint64x2_p
57 /// \#else
58 /// \# define speck128_t uint32x4_p
59 /// \#endif
60 ///
61 /// speck128_t rk, x1, x2, y1, y2;
62 /// rk = (speck128_t)VecLoadAligned(ptr);
63 /// x1 = VecRotateRight64<8>(x1);
64 /// x1 = VecAdd64(x1, y1);
65 /// ...</pre>
66 /// \since Crypto++ 6.0, LLVM Clang compiler support since Crypto++ 8.0
67 
68 // Use __ALTIVEC__, _ARCH_PWR7, __VSX__, and _ARCH_PWR8 when detecting
69 // actual availability of the feature for the source file being compiled.
70 // The preprocessor macros depend on compiler options like -maltivec; and
71 // not compiler versions.
72 
73 // For GCC see https://gcc.gnu.org/onlinedocs/gcc/Basic-PowerPC-Built-in-Functions.html
74 // For XLC see the Compiler Reference manual. For Clang you have to experiment.
75 // Clang does not document the compiler options, does not reject options it does
76 // not understand, and pretends to be other compilers even though it cannot
77 // process the builtins and intrinsics. Clang will waste hours of your time.
78 
79 // DO NOT USE this pattern in VecLoad and VecStore. We have to use the
80 // code paths guarded by preprocessor macros because XLC 12 generates
81 // bad code in some places. To verify the bad code generation test on
82 // GCC111 with XLC 12.01 installed. XLC 13.01 on GCC112 and GCC119 are OK.
83 //
84 // inline uint32x4_p VecLoad(const byte src[16])
85 // {
86 // #if defined(__VSX__) || defined(_ARCH_PWR8)
87 // return (uint32x4_p) *(uint8x16_p*)((byte*)src);
88 // #else
89 // return VecLoad_ALTIVEC(src);
90 // #endif
91 // }
92 
93 // We should be able to perform the load using inline asm on Power7 with
94 // VSX or Power8. The inline asm will avoid C undefined behavior due to
95 // casting from byte* to word32*. We are safe because our byte* are
96 // 16-byte aligned for Altivec. Below is the big endian load. Little
97 // endian would need to follow with xxpermdi for the reversal.
98 //
99 // __asm__ ("lxvw4x %x0, %1, %2" : "=wa"(v) : "r"(0), "r"(src) : );
100 
101 // GCC and XLC use integer math for the address (D-form or byte-offset
102 // in the ISA manual). LLVM uses pointer math for the address (DS-form
103 // or indexed in the ISA manual). To keep them consistent we calculate
104 // the address from the offset and pass to a load or store function
105 // using a 0 offset.
106 
107 #ifndef CRYPTOPP_PPC_CRYPTO_H
108 #define CRYPTOPP_PPC_CRYPTO_H
109 
110 #include "config.h"
111 #include "misc.h"
112 
113 #if defined(__ALTIVEC__)
114 # include <altivec.h>
115 # undef vector
116 # undef pixel
117 # undef bool
118 #endif
119 
120 // XL C++ on AIX does not define VSX and does not
121 // provide an option to set it. We have to set it
122 // for the code below. This define must stay in
123 // sync with the define in test_ppc_power7.cxx.
124 #if defined(_AIX) && defined(_ARCH_PWR7) && defined(__xlC__)
125 # define __VSX__ 1
126 #endif
127 
128 // XL C++ on AIX does not define CRYPTO and does not
129 // provide an option to set it. We have to set it
130 // for the code below. This define must stay in
131 // sync with the define in test_ppc_power8.cxx
132 #if defined(_AIX) && defined(_ARCH_PWR8) && defined(__xlC__)
133 # define __CRYPTO__ 1
134 #endif
135 
136 /// \brief Cast array to vector pointer
137 /// \details CONST_V8_CAST casts a const array to a vector
138 /// pointer for a byte array. The Power ABI says source arrays
139 /// are non-const, so this define removes the const. XLC++ will
140 /// fail the compile if the source array is const.
141 #define CONST_V8_CAST(x) ((unsigned char*)(x))
142 /// \brief Cast array to vector pointer
143 /// \details CONST_V32_CAST casts a const array to a vector
144 /// pointer for a word array. The Power ABI says source arrays
145 /// are non-const, so this define removes the const. XLC++ will
146 /// fail the compile if the source array is const.
147 #define CONST_V32_CAST(x) ((unsigned int*)(x))
148 /// \brief Cast array to vector pointer
149 /// \details CONST_V64_CAST casts a const array to a vector
150 /// pointer for a double word array. The Power ABI says source arrays
151 /// are non-const, so this define removes the const. XLC++ will
152 /// fail the compile if the source array is const.
153 #define CONST_V64_CAST(x) ((unsigned long long*)(x))
154 /// \brief Cast array to vector pointer
155 /// \details NCONST_V8_CAST casts an array to a vector
156 /// pointer for a byte array. The Power ABI says source arrays
157 /// are non-const, so this define removes the const. XLC++ will
158 /// fail the compile if the source array is const.
159 #define NCONST_V8_CAST(x) ((unsigned char*)(x))
160 /// \brief Cast array to vector pointer
161 /// \details NCONST_V32_CAST casts an array to a vector
162 /// pointer for a word array. The Power ABI says source arrays
163 /// are non-const, so this define removes the const. XLC++ will
164 /// fail the compile if the source array is const.
165 #define NCONST_V32_CAST(x) ((unsigned int*)(x))
166 /// \brief Cast array to vector pointer
167 /// \details NCONST_V64_CAST casts an array to a vector
168 /// pointer for a double word array. The Power ABI says source arrays
169 /// are non-const, so this define removes the const. XLC++ will
170 /// fail the compile if the source array is const.
171 #define NCONST_V64_CAST(x) ((unsigned long long*)(x))
172 
173 // VecLoad_ALTIVEC and VecStore_ALTIVEC are
174 // too noisy on modern compilers
175 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
176 # pragma GCC diagnostic push
177 # pragma GCC diagnostic ignored "-Wdeprecated"
178 #endif
179 
180 NAMESPACE_BEGIN(CryptoPP)
181 
182 #if defined(__ALTIVEC__) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
183 
/// \brief Vector of 8-bit elements
/// \par Wraps
/// __vector unsigned char
/// \since Crypto++ 6.0
typedef __vector unsigned char uint8x16_p;
/// \brief Vector of 16-bit elements
/// \par Wraps
/// __vector unsigned short
/// \since Crypto++ 6.0
typedef __vector unsigned short uint16x8_p;
/// \brief Vector of 32-bit elements
/// \par Wraps
/// __vector unsigned int
/// \since Crypto++ 6.0
typedef __vector unsigned int uint32x4_p;

#if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
/// \brief Vector of 64-bit elements
/// \details uint64x2_p is available on POWER7 with VSX and above. Most
/// supporting functions, like 64-bit <tt>vec_add</tt> (<tt>vaddudm</tt>)
/// and <tt>vec_sub</tt> (<tt>vsubudm</tt>), did not arrive until POWER8.
/// \par Wraps
/// __vector unsigned long long
/// \since Crypto++ 6.0
typedef __vector unsigned long long uint64x2_p;
#endif // VSX or ARCH_PWR8
210 
211 /// \brief The 0 vector
212 /// \return a 32-bit vector of 0's
213 /// \since Crypto++ 8.0
215 {
216  const uint32x4_p v = {0,0,0,0};
217  return v;
218 }
219 
220 /// \brief The 1 vector
221 /// \return a 32-bit vector of 1's
222 /// \since Crypto++ 8.0
224 {
225  const uint32x4_p v = {1,1,1,1};
226  return v;
227 }
228 
/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector with all 16 bytes in reverse order
/// \details VecReverse() reverses the bytes in a vector
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverse(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, mask);
#else
    // NOTE(review): the ascending mask is not a no-op here — on
    // little-endian targets vec_perm indices follow big-endian element
    // numbering, so this still reverses the bytes. Confirm against the
    // Power vector intrinsics reference before changing.
    const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
    return (T)vec_perm(data, data, mask);
#endif
}
248 
/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseLE() reverses the bytes in a vector on
/// little-endian systems. On big-endian systems the vector is
/// returned unchanged.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseLE(const T data)
{
#if defined(CRYPTOPP_LITTLE_ENDIAN)
    // Descending byte indices produce a full 16-byte reversal.
    const uint8x16_p rev = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, rev);
#else
    return data;
#endif
}
268 
/// \brief Reverse bytes in a vector
/// \tparam T vector type
/// \param data the vector
/// \return vector
/// \details VecReverseBE() reverses the bytes in a vector on
/// big-endian systems. On little-endian systems the vector is
/// returned unchanged.
/// \par Wraps
/// vec_perm
/// \since Crypto++ 6.0
template <class T>
inline T VecReverseBE(const T data)
{
#if defined(CRYPTOPP_BIG_ENDIAN)
    // Descending byte indices produce a full 16-byte reversal.
    const uint8x16_p rev = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0};
    return (T)vec_perm(data, data, rev);
#else
    return data;
#endif
}
288 
289 /// \name LOAD OPERATIONS
290 //@{
291 
292 /// \brief Loads a vector from a byte array
293 /// \param src the byte array
294 /// \details Loads a vector in native endian format from a byte array.
295 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
296 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
297 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>. The fixups using
298 /// <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are relatively expensive so
299 /// you should provide aligned memory adresses.
300 /// \par Wraps
301 /// vec_ld, vec_lvsl, vec_perm
302 /// \sa VecLoad, VecLoadAligned
303 /// \since Crypto++ 6.0
304 inline uint32x4_p VecLoad_ALTIVEC(const byte src[16])
305 {
306  // Avoid IsAlignedOn for convenience.
307  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
308  if (addr % 16 == 0)
309  {
310  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
311  }
312  else
313  {
314  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
315  const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
316  const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
317  const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
318  return (uint32x4_p)vec_perm(low, high, perm);
319  }
320 }
321 
322 /// \brief Loads a vector from a byte array
323 /// \param src the byte array
324 /// \param off offset into the src byte array
325 /// \details Loads a vector in native endian format from a byte array.
326 /// \details VecLoad_ALTIVEC() uses <tt>vec_ld</tt> if the effective address
327 /// of <tt>src</tt> is aligned. If unaligned it uses <tt>vec_lvsl</tt>,
328 /// <tt>vec_ld</tt>, <tt>vec_perm</tt> and <tt>src</tt>.
329 /// \details The fixups using <tt>vec_lvsl</tt> and <tt>vec_perm</tt> are
330 /// relatively expensive so you should provide aligned memory adresses.
331 /// \par Wraps
332 /// vec_ld, vec_lvsl, vec_perm
333 /// \sa VecLoad, VecLoadAligned
334 /// \since Crypto++ 6.0
335 inline uint32x4_p VecLoad_ALTIVEC(int off, const byte src[16])
336 {
337  // Avoid IsAlignedOn for convenience.
338  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
339  if (addr % 16 == 0)
340  {
341  return (uint32x4_p)vec_ld(0, CONST_V8_CAST(addr));
342  }
343  else
344  {
345  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
346  const uint8x16_p perm = vec_lvsl(0, CONST_V8_CAST(addr));
347  const uint8x16_p low = vec_ld(0, CONST_V8_CAST(addr));
348  const uint8x16_p high = vec_ld(15, CONST_V8_CAST(addr));
349  return (uint32x4_p)vec_perm(low, high, perm);
350  }
351 }
352 
353 /// \brief Loads a vector from a byte array
354 /// \param src the byte array
355 /// \details VecLoad() loads a vector from a byte array.
356 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
357 /// The instruction does not require aligned effective memory addresses.
358 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
359 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
360 /// are required to fix up unaligned memory addresses.
361 /// \par Wraps
362 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
363 /// \sa VecLoad_ALTIVEC, VecLoadAligned
364 /// \since Crypto++ 6.0
365 inline uint32x4_p VecLoad(const byte src[16])
366 {
367  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
368  // word pointers. The ISA lacks loads for short* and char*.
369  // Power9/ISA 3.0 provides vec_xl for all datatypes.
370 
371  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
372  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
373  CRYPTOPP_UNUSED(addr);
374 
375 #if defined(_ARCH_PWR9)
376  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
377 #else
379 #endif
380 }
381 
382 /// \brief Loads a vector from a byte array
383 /// \param src the byte array
384 /// \param off offset into the src byte array
385 /// \details VecLoad() loads a vector from a byte array.
386 /// \details VecLoad() uses POWER9's <tt>vec_xl</tt> if available.
387 /// The instruction does not require aligned effective memory addresses.
388 /// VecLoad_ALTIVEC() is used if POWER9 is not available.
389 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
390 /// are required to fix up unaligned memory addresses.
391 /// \par Wraps
392 /// vec_xl on POWER9 and above, Altivec load on POWER8 and below
393 /// \sa VecLoad_ALTIVEC, VecLoadAligned
394 /// \since Crypto++ 6.0
395 inline uint32x4_p VecLoad(int off, const byte src[16])
396 {
397  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
398  // word pointers. The ISA lacks loads for short* and char*.
399  // Power9/ISA 3.0 provides vec_xl for all datatypes.
400 
401  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
402  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
403  CRYPTOPP_UNUSED(addr);
404 
405 #if defined(_ARCH_PWR9)
406  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
407 #else
409 #endif
410 }
411 
412 /// \brief Loads a vector from a word array
413 /// \param src the word array
414 /// \details VecLoad() loads a vector from a word array.
415 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
416 /// The instruction does not require aligned effective memory addresses.
417 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
418 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
419 /// are required to fix up unaligned memory addresses.
420 /// \par Wraps
421 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
422 /// \sa VecLoad_ALTIVEC, VecLoadAligned
423 /// \since Crypto++ 8.0
424 inline uint32x4_p VecLoad(const word32 src[4])
425 {
426  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
427  // word pointers. The ISA lacks loads for short* and char*.
428  // Power9/ISA 3.0 provides vec_xl for all datatypes.
429 
430  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
431  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
432  CRYPTOPP_UNUSED(addr);
433 
434 #if defined(_ARCH_PWR9)
435  return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
436 #elif defined(__VSX__) || defined(_ARCH_PWR8)
437  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
438 #else
440 #endif
441 }
442 
443 /// \brief Loads a vector from a word array
444 /// \param src the word array
445 /// \param off offset into the word array
446 /// \details VecLoad() loads a vector from a word array.
447 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
448 /// The instruction does not require aligned effective memory addresses.
449 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
450 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
451 /// are required to fix up unaligned memory addresses.
452 /// \par Wraps
453 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
454 /// \sa VecLoad_ALTIVEC, VecLoadAligned
455 /// \since Crypto++ 8.0
456 inline uint32x4_p VecLoad(int off, const word32 src[4])
457 {
458  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
459  // word pointers. The ISA lacks loads for short* and char*.
460  // Power9/ISA 3.0 provides vec_xl for all datatypes.
461 
462  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
463  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
464  CRYPTOPP_UNUSED(addr);
465 
466 #if defined(_ARCH_PWR9)
467  return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
468 #elif defined(__VSX__) || defined(_ARCH_PWR8)
469  return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
470 #else
472 #endif
473 }
474 
475 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
476 
477 /// \brief Loads a vector from a double word array
478 /// \param src the double word array
479 /// \details VecLoad() loads a vector from a double word array.
480 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
481 /// The instruction does not require aligned effective memory addresses.
482 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
483 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
484 /// are required to fix up unaligned memory addresses.
485 /// \details VecLoad() with 64-bit elements is available on POWER7 and above.
486 /// \par Wraps
487 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
488 /// \sa VecLoad_ALTIVEC, VecLoadAligned
489 /// \since Crypto++ 8.0
490 inline uint64x2_p VecLoad(const word64 src[2])
491 {
492  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
493  // word pointers. The ISA lacks loads for short* and char*.
494  // Power9/ISA 3.0 provides vec_xl for all datatypes.
495 
496  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
497  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
498  CRYPTOPP_UNUSED(addr);
499 
500 #if defined(_ARCH_PWR9)
501  return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
502 #elif defined(__VSX__) || defined(_ARCH_PWR8)
503  // The 32-bit cast is not a typo. Compiler workaround.
504  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
505 #else
507 #endif
508 }
509 
510 /// \brief Loads a vector from a double word array
511 /// \param src the double word array
512 /// \param off offset into the double word array
513 /// \details VecLoad() loads a vector from a double word array.
514 /// \details VecLoad() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
515 /// The instruction does not require aligned effective memory addresses.
516 /// VecLoad_ALTIVEC() is used if POWER7 and VSX are not available.
517 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
518 /// are required to fix up unaligned memory addresses.
519 /// \details VecLoad() with 64-bit elements is available on POWER8 and above.
520 /// \par Wraps
521 /// vec_xl on VSX or POWER8 and above, Altivec load on POWER7 and below
522 /// \sa VecLoad_ALTIVEC, VecLoadAligned
523 /// \since Crypto++ 8.0
524 inline uint64x2_p VecLoad(int off, const word64 src[2])
525 {
526  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
527  // word pointers. The ISA lacks loads for short* and char*.
528  // Power9/ISA 3.0 provides vec_xl for all datatypes.
529 
530  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
531  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
532  CRYPTOPP_UNUSED(addr);
533 
534 #if defined(_ARCH_PWR9)
535  return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
536 #elif defined(__VSX__) || defined(_ARCH_PWR8)
537  // The 32-bit cast is not a typo. Compiler workaround.
538  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
539 #else
541 #endif
542 }
543 
544 #endif // VSX or ARCH_PWR8
545 
/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // addr is computed only to validate the alignment contract.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#else
    // vec_ld is safe here because the caller guarantees alignment.
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}
572 
/// \brief Loads a vector from an aligned byte array
/// \param src the byte array
/// \param off offset into the src byte array
/// \details VecLoadAligned() loads a vector from an aligned byte array.
/// \details VecLoadAligned() uses POWER9's <tt>vec_xl</tt> if available.
/// <tt>vec_ld</tt> is used if POWER9 is not available. The effective
/// address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on POWER9, vec_ld on POWER8 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const byte src[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // The effective address (src+off) must satisfy the 16-byte
    // alignment contract; addr is computed only for the assert.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#else
    // vec_ld is safe here because the caller guarantees alignment.
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}
600 
/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // addr is computed only to validate the alignment contract.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(0, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(src));
#else
    // vec_ld is safe here because the caller guarantees alignment.
    return (uint32x4_p)vec_ld(0, CONST_V8_CAST(src));
#endif
}
629 
/// \brief Loads a vector from an aligned word array
/// \param src the word array
/// \param off offset into the src word array
/// \details VecLoadAligned() loads a vector from an aligned word array.
/// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
/// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
/// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
/// \par Wraps
/// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
/// \sa VecLoad_ALTIVEC, VecLoad
/// \since Crypto++ 8.0
inline uint32x4_p VecLoadAligned(int off, const word32 src[4])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // The effective address (src+off) must satisfy the 16-byte
    // alignment contract; it is also used for the VSX load below.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
    CRYPTOPP_ASSERT(addr % 16 == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    return (uint32x4_p)vec_xl(off, CONST_V8_CAST(src));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // Offset already folded into addr, so a 0 displacement is used.
    return (uint32x4_p)vec_xl(0, CONST_V32_CAST(addr));
#else
    // vec_ld is safe here because the caller guarantees alignment.
    return (uint32x4_p)vec_ld(off, CONST_V8_CAST(src));
#endif
}
659 
660 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
661 
662 /// \brief Loads a vector from an aligned double word array
663 /// \param src the double word array
664 /// \details VecLoadAligned() loads a vector from an aligned double word array.
665 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
666 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
667 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
668 /// \par Wraps
669 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
670 /// \sa VecLoad_ALTIVEC, VecLoad
671 /// \since Crypto++ 8.0
672 inline uint64x2_p VecLoadAligned(const word64 src[4])
673 {
674  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
675  // word pointers. The ISA lacks loads for short* and char*.
676  // Power9/ISA 3.0 provides vec_xl for all datatypes.
677 
678  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
679  CRYPTOPP_ASSERT(addr % 16 == 0);
680  CRYPTOPP_UNUSED(addr);
681 
682 #if defined(_ARCH_PWR9)
683  return (uint64x2_p)vec_xl(0, CONST_V8_CAST(src));
684 #elif defined(__VSX__) || defined(_ARCH_PWR8)
685  // The 32-bit cast is not a typo. Compiler workaround.
686  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(src));
687 #else
688  return (uint64x2_p)vec_ld(0, CONST_V8_CAST(src));
689 #endif
690 }
691 
692 /// \brief Loads a vector from an aligned double word array
693 /// \param src the double word array
694 /// \param off offset into the src double word array
695 /// \details VecLoadAligned() loads a vector from an aligned double word array.
696 /// \details VecLoadAligned() uses POWER7's and VSX's <tt>vec_xl</tt> if
697 /// available. <tt>vec_ld</tt> is used if POWER7 or VSX are not available.
698 /// The effective address of <tt>src</tt> must be 16-byte aligned for Altivec.
699 /// \par Wraps
700 /// vec_xl on VSX or POWER8 and above, vec_ld on POWER7 and below
701 /// \sa VecLoad_ALTIVEC, VecLoad
702 /// \since Crypto++ 8.0
703 inline uint64x2_p VecLoadAligned(int off, const word64 src[4])
704 {
705  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
706  // word pointers. The ISA lacks loads for short* and char*.
707  // Power9/ISA 3.0 provides vec_xl for all datatypes.
708 
709  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
710  CRYPTOPP_ASSERT(addr % 16 == 0);
711  CRYPTOPP_UNUSED(addr);
712 
713 #if defined(_ARCH_PWR9)
714  return (uint64x2_p)vec_xl(off, CONST_V8_CAST(src));
715 #elif defined(__VSX__) || defined(_ARCH_PWR8)
716  // The 32-bit cast is not a typo. Compiler workaround.
717  return (uint64x2_p)vec_xl(0, CONST_V32_CAST(addr));
718 #else
719  return (uint64x2_p)vec_ld(off, CONST_V8_CAST(src));
720 #endif
721 }
722 
723 #endif
724 
725 /// \brief Loads a vector from a byte array
726 /// \param src the byte array
727 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
728 /// will reverse all bytes in the array on a little endian system.
729 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
730 /// The instruction does not require aligned effective memory addresses.
731 /// VecLoad_ALTIVEC() is used if POWER7 or VSX are not available.
732 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
733 /// are required to fix up unaligned memory addresses.
734 /// \par Wraps
735 /// vec_xl on POWER8, Altivec load on POWER7 and below
736 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
737 /// \since Crypto++ 6.0
738 inline uint32x4_p VecLoadBE(const byte src[16])
739 {
740  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
741  // word pointers. The ISA lacks loads for short* and char*.
742  // Power9/ISA 3.0 provides vec_xl for all datatypes.
743 
744  const uintptr_t addr = reinterpret_cast<uintptr_t>(src);
745  // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
746  CRYPTOPP_UNUSED(addr);
747 
748 #if defined(_ARCH_PWR9)
749  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
750  return (uint32x4_p)vec_xl_be(0, CONST_V8_CAST(src));
751 #elif defined(CRYPTOPP_BIG_ENDIAN)
752  return (uint32x4_p)VecLoad_ALTIVEC(0, CONST_V8_CAST(src));
753 #else
755 #endif
756 }
757 
758 /// \brief Loads a vector from a byte array
759 /// \param src the byte array
760 /// \param off offset into the src byte array
761 /// \details VecLoadBE() loads a vector from a byte array. VecLoadBE
762 /// will reverse all bytes in the array on a little endian system.
763 /// \details VecLoadBE() uses POWER7's and VSX's <tt>vec_xl</tt> if available.
764 /// The instruction does not require aligned effective memory addresses.
765 /// VecLoad_ALTIVEC() is used if POWER7 is not available.
766 /// VecLoad_ALTIVEC() can be relatively expensive if extra instructions
767 /// are required to fix up unaligned memory addresses.
768 /// \par Wraps
769 /// vec_xl on POWER8, Altivec load on POWER7 and below
770 /// \sa VecLoad_ALTIVEC, VecLoad, VecLoadAligned
771 /// \since Crypto++ 6.0
772 inline uint32x4_p VecLoadBE(int off, const byte src[16])
773 {
774  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
775  // word pointers. The ISA lacks loads for short* and char*.
776  // Power9/ISA 3.0 provides vec_xl for all datatypes.
777 
778  const uintptr_t addr = reinterpret_cast<uintptr_t>(src)+off;
779  // CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
780  CRYPTOPP_UNUSED(addr);
781 
782 #if defined(_ARCH_PWR9)
783  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
784  return (uint32x4_p)vec_xl_be(off, CONST_V8_CAST(src));
785 #elif defined(CRYPTOPP_BIG_ENDIAN)
787 #else
789 #endif
790 }
791 
792 //@}
793 
794 /// \name STORE OPERATIONS
795 //@{
796 
797 /// \brief Stores a vector to a byte array
798 /// \tparam T vector type
799 /// \param data the vector
800 /// \param dest the byte array
801 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
802 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
803 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
804 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
 /// memory addresses.
 /// \details VecStore_ALTIVEC() is used when POWER7 or above
 /// and unaligned stores are not available.
808 /// \par Wraps
809 /// vec_st, vec_ste, vec_lvsr, vec_perm
810 /// \sa VecStore, VecStoreAligned
811 /// \since Crypto++ 8.0
812 template<class T>
813 inline void VecStore_ALTIVEC(const T data, byte dest[16])
814 {
815  // Avoid IsAlignedOn for convenience.
816  uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
817  if (addr % 16 == 0)
818  {
819  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
820  }
821  else
822  {
823  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
824  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
825  vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
826  vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
827  vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
828  vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
829  vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
830  vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
831  vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
832  vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
833  }
834 }
835 
836 /// \brief Stores a vector to a byte array
837 /// \tparam T vector type
838 /// \param data the vector
839 /// \param off offset into the dest byte array
840 /// \param dest the byte array
841 /// \details VecStore_ALTIVEC() stores a vector to a byte array.
842 /// \details VecStore_ALTIVEC() uses <tt>vec_st</tt> if the effective address
843 /// of <tt>dest</tt> is aligned, and uses <tt>vec_ste</tt> otherwise.
844 /// <tt>vec_ste</tt> is relatively expensive so you should provide aligned
 /// memory addresses.
 /// \details VecStore_ALTIVEC() is used when POWER7 or above
 /// and unaligned stores are not available.
848 /// \par Wraps
849 /// vec_st, vec_ste, vec_lvsr, vec_perm
850 /// \sa VecStore, VecStoreAligned
851 /// \since Crypto++ 8.0
852 template<class T>
853 inline void VecStore_ALTIVEC(const T data, int off, byte dest[16])
854 {
855  // Avoid IsAlignedOn for convenience.
856  uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
857  if (addr % 16 == 0)
858  {
859  vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
860  }
861  else
862  {
863  // http://www.nxp.com/docs/en/reference-manual/ALTIVECPEM.pdf
864  uint8x16_p perm = (uint8x16_p)vec_perm(data, data, vec_lvsr(0, NCONST_V8_CAST(addr)));
865  vec_ste((uint8x16_p) perm, 0, (unsigned char*) NCONST_V8_CAST(addr));
866  vec_ste((uint16x8_p) perm, 1, (unsigned short*)NCONST_V8_CAST(addr));
867  vec_ste((uint32x4_p) perm, 3, (unsigned int*) NCONST_V8_CAST(addr));
868  vec_ste((uint32x4_p) perm, 4, (unsigned int*) NCONST_V8_CAST(addr));
869  vec_ste((uint32x4_p) perm, 8, (unsigned int*) NCONST_V8_CAST(addr));
870  vec_ste((uint32x4_p) perm, 12, (unsigned int*) NCONST_V8_CAST(addr));
871  vec_ste((uint16x8_p) perm, 14, (unsigned short*)NCONST_V8_CAST(addr));
872  vec_ste((uint8x16_p) perm, 15, (unsigned char*) NCONST_V8_CAST(addr));
873  }
874 }
875 
876 /// \brief Stores a vector to a byte array
877 /// \tparam T vector type
878 /// \param data the vector
879 /// \param dest the byte array
880 /// \details VecStore() stores a vector to a byte array.
881 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
882 /// The instruction does not require aligned effective memory addresses.
883 /// VecStore_ALTIVEC() is used if POWER9 is not available.
884 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
885 /// are required to fix up unaligned memory addresses.
886 /// \par Wraps
887 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
888 /// \sa VecStore_ALTIVEC, VecStoreAligned
889 /// \since Crypto++ 6.0
890 template<class T>
891 inline void VecStore(const T data, byte dest[16])
892 {
893  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
894  // word pointers. The ISA lacks loads for short* and char*.
895  // Power9/ISA 3.0 provides vec_xl for all datatypes.
896 
897  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
898  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
899  CRYPTOPP_UNUSED(addr);
900 
901 #if defined(_ARCH_PWR9)
902  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
903 #else
905 #endif
906 }
907 
908 /// \brief Stores a vector to a byte array
909 /// \tparam T vector type
910 /// \param data the vector
911 /// \param off offset into the dest byte array
912 /// \param dest the byte array
913 /// \details VecStore() stores a vector to a byte array.
914 /// \details VecStore() uses POWER9's <tt>vec_xst</tt> if available.
915 /// The instruction does not require aligned effective memory addresses.
916 /// VecStore_ALTIVEC() is used if POWER9 is not available.
917 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
918 /// are required to fix up unaligned memory addresses.
919 /// \par Wraps
920 /// vec_xst on POWER9 and above, Altivec store on POWER8 and below
921 /// \sa VecStore_ALTIVEC, VecStoreAligned
922 /// \since Crypto++ 6.0
923 template<class T>
924 inline void VecStore(const T data, int off, byte dest[16])
925 {
926  // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
927  // word pointers. The ISA lacks loads for short* and char*.
928  // Power9/ISA 3.0 provides vec_xl for all datatypes.
929 
930  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
931  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
932  CRYPTOPP_UNUSED(addr);
933 
934 #if defined(_ARCH_PWR9)
935  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
936 #else
938 #endif
939 }
940 
941 /// \brief Stores a vector to a word array
942 /// \tparam T vector type
943 /// \param data the vector
944 /// \param dest the word array
945 /// \details VecStore() stores a vector to a word array.
946 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
947 /// The instruction does not require aligned effective memory addresses.
948 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
949 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
950 /// are required to fix up unaligned memory addresses.
951 /// \par Wraps
952 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
953 /// \sa VecStore_ALTIVEC, VecStoreAligned
954 /// \since Crypto++ 8.0
955 template<class T>
956 inline void VecStore(const T data, word32 dest[4])
957 {
958  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
959  // word pointers. The ISA lacks stores for short* and char*.
960  // Power9/ISA 3.0 provides vec_xst for all datatypes.
961 
962  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
963  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
964  CRYPTOPP_UNUSED(addr);
965 
966 #if defined(_ARCH_PWR9)
967  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
968 #elif defined(__VSX__) || defined(_ARCH_PWR8)
969  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
970 #else
972 #endif
973 }
974 
975 /// \brief Stores a vector to a word array
976 /// \tparam T vector type
977 /// \param data the vector
978 /// \param off offset into the dest word array
979 /// \param dest the word array
980 /// \details VecStore() stores a vector to a word array.
981 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
982 /// The instruction does not require aligned effective memory addresses.
983 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
984 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
985 /// are required to fix up unaligned memory addresses.
986 /// \par Wraps
987 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
988 /// \sa VecStore_ALTIVEC, VecStoreAligned
989 /// \since Crypto++ 8.0
990 template<class T>
991 inline void VecStore(const T data, int off, word32 dest[4])
992 {
993  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
994  // word pointers. The ISA lacks stores for short* and char*.
995  // Power9/ISA 3.0 provides vec_xst for all datatypes.
996 
997  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
998  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
999  CRYPTOPP_UNUSED(addr);
1000 
1001 #if defined(_ARCH_PWR9)
1002  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1003 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1004  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1005 #else
1007 #endif
1008 }
1009 
1010 /// \brief Stores a vector to a word array
1011 /// \tparam T vector type
1012 /// \param data the vector
1013 /// \param dest the word array
1014 /// \details VecStore() stores a vector to a word array.
1015 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1016 /// The instruction does not require aligned effective memory addresses.
1017 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1018 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1019 /// are required to fix up unaligned memory addresses.
1020 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1021 /// \par Wraps
1022 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1023 /// \sa VecStore_ALTIVEC, VecStoreAligned
1024 /// \since Crypto++ 8.0
1025 template<class T>
1026 inline void VecStore(const T data, word64 dest[2])
1027 {
1028  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1029  // word pointers. The ISA lacks stores for short* and char*.
1030  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1031 
1032  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1033  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1034  CRYPTOPP_UNUSED(addr);
1035 
1036 #if defined(_ARCH_PWR9)
1037  vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1038 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1039  // 32-bit cast is not a typo. Compiler workaround.
1040  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1041 #else
1043 #endif
1044 }
1045 
1046 /// \brief Stores a vector to a word array
1047 /// \tparam T vector type
1048 /// \param data the vector
1049 /// \param off offset into the dest word array
1050 /// \param dest the word array
1051 /// \details VecStore() stores a vector to a word array.
1052 /// \details VecStore() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1053 /// The instruction does not require aligned effective memory addresses.
1054 /// VecStore_ALTIVEC() is used if POWER7 or VSX are not available.
1055 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1056 /// are required to fix up unaligned memory addresses.
1057 /// \details VecStore() with 64-bit elements is available on POWER8 and above.
1058 /// \par Wraps
1059 /// vec_xst on VSX or POWER8 and above, Altivec store on POWER7 and below
1060 /// \sa VecStore_ALTIVEC, VecStoreAligned
1061 /// \since Crypto++ 8.0
1062 template<class T>
1063 inline void VecStore(const T data, int off, word64 dest[2])
1064 {
1065  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1066  // word pointers. The ISA lacks stores for short* and char*.
1067  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1068 
1069  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1070  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word64>() == 0);
1071  CRYPTOPP_UNUSED(addr);
1072 
1073 #if defined(_ARCH_PWR9)
1074  vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1075 #elif defined(__VSX__) || defined(_ARCH_PWR8)
1076  // 32-bit cast is not a typo. Compiler workaround.
1077  vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
1078 #else
1080 #endif
1081 }
1082 
1083 /// \brief Stores a vector to a byte array
1084 /// \tparam T vector type
1085 /// \param data the vector
1086 /// \param dest the byte array
1087 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1088 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1089 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1090 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1091 /// \par Wraps
1092 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1093 /// \sa VecStore_ALTIVEC, VecStore
1094 /// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // Effective address; used for the alignment assert and the
    // Altivec store below.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    // POWER9: vec_xst does not require an aligned effective address.
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#else
    // Altivec vec_st requires a 16-byte aligned effective address;
    // the caller is expected to provide one (see the assert above).
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
1112 
1113 /// \brief Stores a vector to a byte array
1114 /// \tparam T vector type
1115 /// \param data the vector
1116 /// \param off offset into the dest byte array
1117 /// \param dest the byte array
1118 /// \details VecStoreAligned() stores a vector from an aligned byte array.
1119 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1120 /// <tt>vec_st</tt> is used if POWER9 is not available. The effective
1121 /// address of <tt>dest</tt> must be 16-byte aligned for Altivec.
1122 /// \par Wraps
1123 /// vec_xst on POWER9 or above, vec_st on POWER8 and below
1124 /// \sa VecStore_ALTIVEC, VecStore
1125 /// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, byte dest[16])
{
    // Power7/ISA 2.06 provides vec_xl, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks loads for short* and char*.
    // Power9/ISA 3.0 provides vec_xl for all datatypes.

    // Effective address including the byte offset; used for the
    // alignment assert and the Altivec store below.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    // POWER9: vec_xst does not require an aligned effective address.
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#else
    // Altivec vec_st requires a 16-byte aligned effective address.
    // addr already folds in off, so offset 0 is passed here.
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
1143 
1144 /// \brief Stores a vector to a word array
1145 /// \tparam T vector type
1146 /// \param data the vector
1147 /// \param dest the word array
1148 /// \details VecStoreAligned() stores a vector from an aligned word array.
1149 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1150 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1151 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1152 /// must be 16-byte aligned for Altivec.
1153 /// \par Wraps
1154 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1155 /// \sa VecStore_ALTIVEC, VecStore
1156 /// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    // Effective address; used for the alignment assert and the
    // POWER7/Altivec stores below.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    // POWER9: byte-pointer store, no aligned address required.
    vec_xst((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // POWER7/VSX: vec_xst is available for 32-bit word pointers.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    // Altivec vec_st requires a 16-byte aligned effective address;
    // the caller is expected to provide one (see the assert above).
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
1176 
1177 /// \brief Stores a vector to a word array
1178 /// \tparam T vector type
1179 /// \param data the vector
1180 /// \param off offset into the dest word array
1181 /// \param dest the word array
1182 /// \details VecStoreAligned() stores a vector from an aligned word array.
1183 /// \details VecStoreAligned() uses POWER9's <tt>vec_xl</tt> if available.
1184 /// POWER7 <tt>vec_xst</tt> is used if POWER9 is not available. <tt>vec_st</tt>
1185 /// is used if POWER7 is not available. The effective address of <tt>dest</tt>
1186 /// must be 16-byte aligned for Altivec.
1187 /// \par Wraps
1188 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1189 /// \sa VecStore_ALTIVEC, VecStore
1190 /// \since Crypto++ 8.0
template<class T>
inline void VecStoreAligned(const T data, int off, word32 dest[4])
{
    // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
    // word pointers. The ISA lacks stores for short* and char*.
    // Power9/ISA 3.0 provides vec_xst for all datatypes.

    // Effective address including the byte offset; used for the
    // alignment assert and the POWER7/Altivec stores below.
    const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
    CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
    CRYPTOPP_UNUSED(addr);

#if defined(_ARCH_PWR9)
    // POWER9: byte-pointer store, no aligned address required.
    vec_xst((uint8x16_p)data, off, NCONST_V8_CAST(dest));
#elif defined(__VSX__) || defined(_ARCH_PWR8)
    // POWER7/VSX: addr already folds in off, so offset 0 is passed.
    vec_xst((uint32x4_p)data, 0, NCONST_V32_CAST(addr));
#else
    // Altivec vec_st requires a 16-byte aligned effective address.
    // addr already folds in off, so offset 0 is passed here.
    vec_st((uint8x16_p)data, 0, NCONST_V8_CAST(addr));
#endif
}
1210 
1211 /// \brief Stores a vector to a byte array
1212 /// \tparam T vector type
1213 /// \param data the vector
1214 /// \param dest the byte array
1215 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1216 /// will reverse all bytes in the array on a little endian system.
1217 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1218 /// The instruction does not require aligned effective memory addresses.
1219 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1220 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1221 /// are required to fix up unaligned memory addresses.
1222 /// \par Wraps
1223 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1224 /// \sa VecStore_ALTIVEC, VecStoreAligned
1225 /// \since Crypto++ 6.0
1226 template <class T>
1227 inline void VecStoreBE(const T data, byte dest[16])
1228 {
1229  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1230  // word pointers. The ISA lacks stores for short* and char*.
1231  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1232 
1233  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1234  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1235  CRYPTOPP_UNUSED(addr);
1236 
1237 #if defined(_ARCH_PWR9)
1238  vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1239 #elif defined(CRYPTOPP_BIG_ENDIAN)
1240  VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1241 #else
1243 #endif
1244 }
1245 
1246 /// \brief Stores a vector to a byte array
1247 /// \tparam T vector type
1248 /// \param data the vector
1249 /// \param off offset into the dest byte array
1250 /// \param dest the byte array
1251 /// \details VecStoreBE() stores a vector to a byte array. VecStoreBE
1252 /// will reverse all bytes in the array on a little endian system.
1253 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1254 /// The instruction does not require aligned effective memory addresses.
1255 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1256 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1257 /// are required to fix up unaligned memory addresses.
1258 /// \par Wraps
1259 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1260 /// \sa VecStore_ALTIVEC, VecStoreAligned
1261 /// \since Crypto++ 6.0
1262 template <class T>
1263 inline void VecStoreBE(const T data, int off, byte dest[16])
1264 {
1265  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1266  // word pointers. The ISA lacks stores for short* and char*.
1267  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1268 
1269  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1270  CRYPTOPP_ASSERT(addr % GetAlignmentOf<byte>() == 0);
1271  CRYPTOPP_UNUSED(addr);
1272 
1273 #if defined(_ARCH_PWR9)
1274  vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1275 #elif defined(CRYPTOPP_BIG_ENDIAN)
1276  VecStore((uint8x16_p)data, NCONST_V8_CAST(addr));
1277 #else
1279 #endif
1280 }
1281 
1282 /// \brief Stores a vector to a word array
1283 /// \tparam T vector type
1284 /// \param data the vector
1285 /// \param dest the word array
1286 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1287 /// will reverse all bytes in the array on a little endian system.
1288 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1289 /// The instruction does not require aligned effective memory addresses.
1290 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1291 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1292 /// are required to fix up unaligned memory addresses.
1293 /// \par Wraps
1294 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1295 /// \sa VecStore_ALTIVEC, VecStoreAligned
1296 /// \since Crypto++ 8.0
1297 template <class T>
1298 inline void VecStoreBE(const T data, word32 dest[4])
1299 {
1300  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1301  // word pointers. The ISA lacks stores for short* and char*.
1302  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1303 
1304  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest);
1305  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1306  CRYPTOPP_UNUSED(addr);
1307 
1308 #if defined(_ARCH_PWR9)
1309  vec_xst_be((uint8x16_p)data, 0, NCONST_V8_CAST(dest));
1310 #elif defined(CRYPTOPP_BIG_ENDIAN)
1311  VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1312 #else
1314 #endif
1315 }
1316 
1317 /// \brief Stores a vector to a word array
1318 /// \tparam T vector type
1319 /// \param data the vector
1320 /// \param off offset into the dest word array
1321 /// \param dest the word array
1322 /// \details VecStoreBE() stores a vector to a word array. VecStoreBE
1323 /// will reverse all words in the array on a little endian system.
1324 /// \details VecStoreBE() uses POWER7's and VSX's <tt>vec_xst</tt> if available.
1325 /// The instruction does not require aligned effective memory addresses.
1326 /// VecStore_ALTIVEC() is used if POWER7 is not available.
1327 /// VecStore_ALTIVEC() can be relatively expensive if extra instructions
1328 /// are required to fix up unaligned memory addresses.
1329 /// \par Wraps
1330 /// vec_xst on VSX or POWER8 and above, vec_st on POWER7 and below
1331 /// \sa VecStore_ALTIVEC, VecStoreAligned
1332 /// \since Crypto++ 8.0
1333 template <class T>
1334 inline void VecStoreBE(const T data, int off, word32 dest[4])
1335 {
1336  // Power7/ISA 2.06 provides vec_xst, but only for 32-bit and 64-bit
1337  // word pointers. The ISA lacks stores for short* and char*.
1338  // Power9/ISA 3.0 provides vec_xst for all datatypes.
1339 
1340  const uintptr_t addr = reinterpret_cast<uintptr_t>(dest)+off;
1341  CRYPTOPP_ASSERT(addr % GetAlignmentOf<word32>() == 0);
1342  CRYPTOPP_UNUSED(addr);
1343 
1344 #if defined(_ARCH_PWR9)
1345  vec_xst_be((uint8x16_p)data, off, NCONST_V8_CAST(dest));
1346 #elif defined(CRYPTOPP_BIG_ENDIAN)
1347  VecStore((uint32x4_p)data, NCONST_V32_CAST(addr));
1348 #else
1350 #endif
1351 }
1352 
1353 //@}
1354 
1355 /// \name LOGICAL OPERATIONS
1356 //@{
1357 
1358 /// \brief AND two vectors
1359 /// \tparam T1 vector type
1360 /// \tparam T2 vector type
1361 /// \param vec1 the first vector
1362 /// \param vec2 the second vector
1363 /// \return vector
1364 /// \details VecAnd() performs <tt>vec1 & vec2</tt>.
1365 /// vec2 is cast to the same type as vec1. The return vector
1366 /// is the same type as vec1.
1367 /// \par Wraps
1368 /// vec_and
1369 /// \sa VecAnd64
1370 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAnd(const T1 vec1, const T2 vec2)
{
    // Bitwise AND. The second operand is converted to T1 first, and
    // the result is returned as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_and(vec1, rhs);
}
1376 
1377 /// \brief OR two vectors
1378 /// \tparam T1 vector type
1379 /// \tparam T2 vector type
1380 /// \param vec1 the first vector
1381 /// \param vec2 the second vector
1382 /// \return vector
1383 /// \details VecOr() performs <tt>vec1 | vec2</tt>.
1384 /// vec2 is cast to the same type as vec1. The return vector
1385 /// is the same type as vec1.
1386 /// \par Wraps
1387 /// vec_or
1388 /// \sa VecOr64
1389 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecOr(const T1 vec1, const T2 vec2)
{
    // Bitwise OR. The second operand is converted to T1 first, and
    // the result is returned as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_or(vec1, rhs);
}
1395 
1396 /// \brief XOR two vectors
1397 /// \tparam T1 vector type
1398 /// \tparam T2 vector type
1399 /// \param vec1 the first vector
1400 /// \param vec2 the second vector
1401 /// \return vector
1402 /// \details VecXor() performs <tt>vec1 ^ vec2</tt>.
1403 /// vec2 is cast to the same type as vec1. The return vector
1404 /// is the same type as vec1.
1405 /// \par Wraps
1406 /// vec_xor
1407 /// \sa VecXor64
1408 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecXor(const T1 vec1, const T2 vec2)
{
    // Bitwise XOR. The second operand is converted to T1 first, and
    // the result is returned as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_xor(vec1, rhs);
}
1414 
1415 //@}
1416 
1417 /// \name ARITHMETIC OPERATIONS
1418 //@{
1419 
1420 /// \brief Add two vectors
1421 /// \tparam T1 vector type
1422 /// \tparam T2 vector type
1423 /// \param vec1 the first vector
1424 /// \param vec2 the second vector
1425 /// \return vector
1426 /// \details VecAdd() performs <tt>vec1 + vec2</tt>.
1427 /// vec2 is cast to the same type as vec1. The return vector
1428 /// is the same type as vec1.
1429 /// \par Wraps
1430 /// vec_add
1431 /// \sa VecAdd64
1432 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecAdd(const T1 vec1, const T2 vec2)
{
    // Element-wise addition. The second operand is converted to T1
    // first, and the result is returned as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_add(vec1, rhs);
}
1438 
1439 /// \brief Subtract two vectors
1440 /// \tparam T1 vector type
1441 /// \tparam T2 vector type
1442 /// \param vec1 the first vector
1443 /// \param vec2 the second vector
1444 /// \details VecSub() performs <tt>vec1 - vec2</tt>.
1445 /// vec2 is cast to the same type as vec1. The return vector
1446 /// is the same type as vec1.
1447 /// \par Wraps
1448 /// vec_sub
1449 /// \sa VecSub64
1450 /// \since Crypto++ 6.0
template <class T1, class T2>
inline T1 VecSub(const T1 vec1, const T2 vec2)
{
    // Element-wise subtraction. The second operand is converted to T1
    // first, and the result is returned as T1.
    const T1 rhs = (T1)vec2;
    return (T1)vec_sub(vec1, rhs);
}
1456 
1457 //@}
1458 
1459 /// \name PERMUTE OPERATIONS
1460 //@{
1461 
1462 /// \brief Permutes a vector
1463 /// \tparam T1 vector type
1464 /// \tparam T2 vector type
1465 /// \param vec the vector
1466 /// \param mask vector mask
1467 /// \return vector
1468 /// \details VecPermute() creates a new vector from vec according to mask.
1469 /// mask is an uint8x16_p vector. The return vector is the same type as vec.
1470 /// \par Wraps
1471 /// vec_perm
1472 /// \since Crypto++ 6.0
1473 template <class T1, class T2>
1474 inline T1 VecPermute(const T1 vec, const T2 mask)
1475 {
1476  return (T1)vec_perm(vec, vec, (uint8x16_p)mask);
1477 }
1478 
1479 /// \brief Permutes two vectors
1480 /// \tparam T1 vector type
1481 /// \tparam T2 vector type
1482 /// \param vec1 the first vector
1483 /// \param vec2 the second vector
1484 /// \param mask vector mask
1485 /// \return vector
1486 /// \details VecPermute() creates a new vector from vec1 and vec2 according to mask.
1487 /// mask is an uint8x16_p vector. The return vector is the same type as vec.
1488 /// \par Wraps
1489 /// vec_perm
1490 /// \since Crypto++ 6.0
1491 template <class T1, class T2>
1492 inline T1 VecPermute(const T1 vec1, const T1 vec2, const T2 mask)
1493 {
1494  return (T1)vec_perm(vec1, (T1)vec2, (uint8x16_p)mask);
1495 }
1496 
1497 //@}
1498 
1499 /// \name SHIFT AND ROTATE OPERATIONS
1500 //@{
1501 
1502 /// \brief Shift a vector left
1503 /// \tparam C shift byte count
1504 /// \tparam T vector type
1505 /// \param vec the vector
1506 /// \return vector
1507 /// \details VecShiftLeftOctet() returns a new vector after shifting the
1508 /// concatenation of the zero vector and the source vector by the specified
1509 /// number of bytes. The return vector is the same type as vec.
1510 /// \details On big endian machines VecShiftLeftOctet() is <tt>vec_sld(a, z,
1511 /// c)</tt>. On little endian machines VecShiftLeftOctet() is translated to
1512 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1513 /// if on a big endian machine as shown below.
1514 /// <pre>
1515 /// uint8x16_p x = VecLoad(ptr);
1516 /// uint8x16_p y = VecShiftLeftOctet<12>(x);
1517 /// </pre>
1518 /// \par Wraps
1519 /// vec_sld
1520 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1521 /// endian sensitive?</A> on Stack Overflow
1522 /// \since Crypto++ 6.0
1523 template <unsigned int C, class T>
1524 inline T VecShiftLeftOctet(const T vec)
1525 {
1526  const T zero = {0};
1527  if (C >= 16)
1528  {
1529  // Out of range
1530  return zero;
1531  }
1532  else if (C == 0)
1533  {
1534  // Noop
1535  return vec;
1536  }
1537  else
1538  {
1539 #if defined(CRYPTOPP_BIG_ENDIAN)
1540  enum { R=C&0xf };
1541  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1542 #else
1543  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1544  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1545 #endif
1546  }
1547 }
1548 
1549 /// \brief Shift a vector right
1550 /// \tparam C shift byte count
1551 /// \tparam T vector type
1552 /// \param vec the vector
1553 /// \return vector
1554 /// \details VecShiftRightOctet() returns a new vector after shifting the
1555 /// concatenation of the zero vector and the source vector by the specified
1556 /// number of bytes. The return vector is the same type as vec.
1557 /// \details On big endian machines VecShiftRightOctet() is <tt>vec_sld(a, z,
1558 /// c)</tt>. On little endian machines VecShiftRightOctet() is translated to
1559 /// <tt>vec_sld(z, a, 16-c)</tt>. You should always call the function as
1560 /// if on a big endian machine as shown below.
1561 /// <pre>
1562 /// uint8x16_p x = VecLoad(ptr);
1563 /// uint8x16_p y = VecShiftRightOctet<12>(y);
1564 /// </pre>
1565 /// \par Wraps
1566 /// vec_sld
1567 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1568 /// endian sensitive?</A> on Stack Overflow
1569 /// \since Crypto++ 6.0
1570 template <unsigned int C, class T>
1571 inline T VecShiftRightOctet(const T vec)
1572 {
1573  const T zero = {0};
1574  if (C >= 16)
1575  {
1576  // Out of range
1577  return zero;
1578  }
1579  else if (C == 0)
1580  {
1581  // Noop
1582  return vec;
1583  }
1584  else
1585  {
1586 #if defined(CRYPTOPP_BIG_ENDIAN)
1587  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1588  return (T)vec_sld((uint8x16_p)zero, (uint8x16_p)vec, R);
1589 #else
1590  enum { R=C&0xf };
1591  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)zero, R);
1592 #endif
1593  }
1594 }
1595 
1596 /// \brief Rotate a vector left
1597 /// \tparam C shift byte count
1598 /// \tparam T vector type
1599 /// \param vec the vector
1600 /// \return vector
1601 /// \details VecRotateLeftOctet() returns a new vector after rotating the
1602 /// concatenation of the source vector with itself by the specified
1603 /// number of bytes. The return vector is the same type as vec.
1604 /// \par Wraps
1605 /// vec_sld
1606 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1607 /// endian sensitive?</A> on Stack Overflow
1608 /// \since Crypto++ 6.0
1609 template <unsigned int C, class T>
1610 inline T VecRotateLeftOctet(const T vec)
1611 {
1612 #if defined(CRYPTOPP_BIG_ENDIAN)
1613  enum { R = C&0xf };
1614  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1615 #else
1616  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1617  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1618 #endif
1619 }
1620 
1621 /// \brief Rotate a vector right
1622 /// \tparam C shift byte count
1623 /// \tparam T vector type
1624 /// \param vec the vector
1625 /// \return vector
1626 /// \details VecRotateRightOctet() returns a new vector after rotating the
1627 /// concatenation of the source vector with itself by the specified
1628 /// number of bytes. The return vector is the same type as vec.
1629 /// \par Wraps
1630 /// vec_sld
1631 /// \sa <A HREF="https://stackoverflow.com/q/46341923/608639">Is vec_sld
1632 /// endian sensitive?</A> on Stack Overflow
1633 /// \since Crypto++ 6.0
1634 template <unsigned int C, class T>
1635 inline T VecRotateRightOctet(const T vec)
1636 {
1637 #if defined(CRYPTOPP_BIG_ENDIAN)
1638  enum { R=(16-C)&0xf }; // Linux xlC 13.1 workaround in Debug builds
1639  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1640 #else
1641  enum { R = C&0xf };
1642  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, R);
1643 #endif
1644 }
1645 
1646 /// \brief Rotate a vector left
1647 /// \tparam C rotate bit count
1648 /// \param vec the vector
1649 /// \return vector
1650 /// \details VecRotateLeft() rotates each element in a vector by
1651 /// bit count. The return vector is the same type as vec.
1652 /// \par Wraps
1653 /// vec_rl
1654 /// \since Crypto++ 7.0
1655 template<unsigned int C>
1657 {
1658  const uint32x4_p m = {C, C, C, C};
1659  return vec_rl(vec, m);
1660 }
1661 
1662 /// \brief Rotate a vector right
1663 /// \tparam C rotate bit count
1664 /// \param vec the vector
1665 /// \return vector
1666 /// \details VecRotateRight() rotates each element in a vector
1667 /// by bit count. The return vector is the same type as vec.
1668 /// \par Wraps
1669 /// vec_rl
1670 /// \since Crypto++ 7.0
1671 template<unsigned int C>
1673 {
1674  const uint32x4_p m = {32-C, 32-C, 32-C, 32-C};
1675  return vec_rl(vec, m);
1676 }
1677 
1678 /// \brief Shift a vector left
1679 /// \tparam C shift bit count
1680 /// \param vec the vector
1681 /// \return vector
1682 /// \details VecShiftLeft() rotates each element in a vector
1683 /// by bit count. The return vector is the same type as vec.
1684 /// \par Wraps
1685 /// vec_sl
1686 /// \since Crypto++ 8.1
1687 template<unsigned int C>
1689 {
1690  const uint32x4_p m = {C, C, C, C};
1691  return vec_sl(vec, m);
1692 }
1693 
1694 /// \brief Shift a vector right
1695 /// \tparam C shift bit count
1696 /// \param vec the vector
1697 /// \return vector
1698 /// \details VecShiftRight() rotates each element in a vector
1699 /// by bit count. The return vector is the same type as vec.
1700 /// \par Wraps
1701 /// vec_rl
1702 /// \since Crypto++ 8.1
1703 template<unsigned int C>
1705 {
1706  const uint32x4_p m = {C, C, C, C};
1707  return vec_sr(vec, m);
1708 }
1709 
1710 // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
1711 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1712 
1713 /// \brief Rotate a vector left
1714 /// \tparam C rotate bit count
1715 /// \param vec the vector
1716 /// \return vector
1717 /// \details VecRotateLeft() rotates each element in a vector
1718 /// by bit count. The return vector is the same type as vec.
1719 /// \details VecRotateLeft() with 64-bit elements is available on
1720 /// POWER8 and above.
1721 /// \par Wraps
1722 /// vec_rl
1723 /// \since Crypto++ 8.0
1724 template<unsigned int C>
1726 {
1727  const uint64x2_p m = {C, C};
1728  return vec_rl(vec, m);
1729 }
1730 
1731 /// \brief Shift a vector left
1732 /// \tparam C shift bit count
1733 /// \param vec the vector
1734 /// \return vector
1735 /// \details VecShiftLeft() rotates each element in a vector
1736 /// by bit count. The return vector is the same type as vec.
1737 /// \details VecShiftLeft() with 64-bit elements is available on
1738 /// POWER8 and above.
1739 /// \par Wraps
1740 /// vec_sl
1741 /// \since Crypto++ 8.1
1742 template<unsigned int C>
1744 {
1745  const uint64x2_p m = {C, C};
1746  return vec_sl(vec, m);
1747 }
1748 
1749 /// \brief Rotate a vector right
1750 /// \tparam C rotate bit count
1751 /// \param vec the vector
1752 /// \return vector
1753 /// \details VecRotateRight() rotates each element in a vector
1754 /// by bit count. The return vector is the same type as vec.
1755 /// \details VecRotateRight() with 64-bit elements is available on
1756 /// POWER8 and above.
1757 /// \par Wraps
1758 /// vec_rl
1759 /// \since Crypto++ 8.0
1760 template<unsigned int C>
1762 {
1763  const uint64x2_p m = {64-C, 64-C};
1764  return vec_rl(vec, m);
1765 }
1766 
1767 /// \brief Shift a vector right
1768 /// \tparam C shift bit count
1769 /// \param vec the vector
1770 /// \return vector
1771 /// \details VecShiftRight() rotates each element in a vector
1772 /// by bit count. The return vector is the same type as vec.
1773 /// \details VecShiftRight() with 64-bit elements is available on
1774 /// POWER8 and above.
1775 /// \par Wraps
1776 /// vec_sr
1777 /// \since Crypto++ 8.1
1778 template<unsigned int C>
1780 {
1781  const uint64x2_p m = {C, C};
1782  return vec_sr(vec, m);
1783 }
1784 
1785 #endif // ARCH_PWR8
1786 
1787 //@}
1788 
1789 /// \name OTHER OPERATIONS
1790 //@{
1791 
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector interleaving the low halves of vec1 and vec2
/// \par Wraps
///  vec_mergel
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeLow(const T vec1, const T vec2)
{
    return vec_mergel(vec1, vec2);
}
1805 
/// \brief Merge two vectors
/// \tparam T vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector interleaving the high halves of vec1 and vec2
/// \par Wraps
///  vec_mergeh
/// \since Crypto++ 8.1
template <class T>
inline T VecMergeHigh(const T vec1, const T vec2)
{
    return vec_mergeh(vec1, vec2);
}
1819 
1820 /// \brief Broadcast 32-bit word to a vector
1821 /// \param val the 32-bit value
1822 /// \return vector
1823 /// \par Wraps
1824 /// vec_splats
1825 /// \since Crypto++ 8.3
1826 inline uint32x4_p VecSplatWord(word32 val)
1827 {
1828  // Apple Altivec and XL C++ do not offer vec_splats.
1829  // GCC offers vec_splats back to -mcpu=power4.
1830 #if defined(_ARCH_PWR4) && defined(__GNUC__)
1831  return vec_splats(val);
1832 #else
1833  //const word32 x[4] = {val,val,val,val};
1834  //return VecLoad(x);
1835  const word32 x[4] = {val};
1836  return vec_splat(VecLoad(x),0);
1837 #endif
1838 }
1839 
1840 /// \brief Broadcast 32-bit element to a vector
1841 /// \tparam the element number
1842 /// \param val the 32-bit value
1843 /// \return vector
1844 /// \par Wraps
1845 /// vec_splat
1846 /// \since Crypto++ 8.3
1847 template <unsigned int N>
1849 {
1850  return vec_splat(val, N);
1851 }
1852 
1853 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
1854 /// \brief Broadcast 64-bit double word to a vector
1855 /// \param val the 64-bit value
1856 /// \return vector
1857 /// \par Wraps
1858 /// vec_splats
1859 /// \since Crypto++ 8.3
1860 inline uint64x2_p VecSplatWord(word64 val)
1861 {
1862  // The PPC64 ABI says so.
1863  return vec_splats((unsigned long long)val);
1864 }
1865 
1866 /// \brief Broadcast 64-bit element to a vector
1867 /// \tparam the element number
1868 /// \param val the 64-bit value
1869 /// \return vector
1870 /// \par Wraps
1871 /// vec_splat
1872 /// \since Crypto++ 8.3
1873 template <unsigned int N>
1875 {
1876 #if defined(__VSX__) || defined(_ARCH_PWR8)
1877  return vec_splat(val, N);
1878 #else
1879  enum {E=N&1};
1880  if (E == 0)
1881  {
1882  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
1883  return vec_perm(val, val, m);
1884  }
1885  else // (E == 1)
1886  {
1887  const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
1888  return vec_perm(val, val, m);
1889  }
1890 #endif
1891 }
1892 #endif
1893 
1894 /// \brief Extract a dword from a vector
1895 /// \tparam T vector type
1896 /// \param val the vector
1897 /// \return vector created from low dword
1898 /// \details VecGetLow() extracts the low dword from a vector. The low dword
1899 /// is composed of the least significant bits and occupies bytes 8 through 15
1900 /// when viewed as a big endian array. The return vector is the same type as
1901 /// the original vector and padded with 0's in the most significant bit positions.
1902 /// \par Wraps
1903 /// vec_sld
1904 /// \since Crypto++ 7.0
1905 template <class T>
1906 inline T VecGetLow(const T val)
1907 {
1908 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1909  const T zero = {0};
1910  return (T)VecMergeLow((uint64x2_p)zero, (uint64x2_p)val);
1911 #else
1912  return VecShiftRightOctet<8>(VecShiftLeftOctet<8>(val));
1913 #endif
1914 }
1915 
1916 /// \brief Extract a dword from a vector
1917 /// \tparam T vector type
1918 /// \param val the vector
1919 /// \return vector created from high dword
1920 /// \details VecGetHigh() extracts the high dword from a vector. The high dword
1921 /// is composed of the most significant bits and occupies bytes 0 through 7
1922 /// when viewed as a big endian array. The return vector is the same type as
1923 /// the original vector and padded with 0's in the most significant bit positions.
1924 /// \par Wraps
1925 /// vec_sld
1926 /// \since Crypto++ 7.0
1927 template <class T>
1928 inline T VecGetHigh(const T val)
1929 {
1930 #if defined(CRYPTOPP_BIG_ENDIAN) && (defined(__VSX__) || defined(_ARCH_PWR8))
1931  const T zero = {0};
1932  return (T)VecMergeHigh((uint64x2_p)zero, (uint64x2_p)val);
1933 #else
1934  return VecShiftRightOctet<8>(val);
1935 #endif
1936 }
1937 
1938 /// \brief Exchange high and low double words
1939 /// \tparam T vector type
1940 /// \param vec the vector
1941 /// \return vector
1942 /// \par Wraps
1943 /// vec_sld
1944 /// \since Crypto++ 7.0
1945 template <class T>
1946 inline T VecSwapWords(const T vec)
1947 {
1948  return (T)vec_sld((uint8x16_p)vec, (uint8x16_p)vec, 8);
1949 }
1950 
1951 //@}
1952 
1953 /// \name COMPARISON
1954 //@{
1955 
1956 /// \brief Compare two vectors
1957 /// \tparam T1 vector type
1958 /// \tparam T2 vector type
1959 /// \param vec1 the first vector
1960 /// \param vec2 the second vector
1961 /// \return true if vec1 equals vec2, false otherwise
1962 /// \details VecEqual() performs a bitwise compare. The vector element types do
1963 /// not matter.
1964 /// \par Wraps
1965 /// vec_all_eq
1966 /// \since Crypto++ 8.0
1967 template <class T1, class T2>
1968 inline bool VecEqual(const T1 vec1, const T2 vec2)
1969 {
1970  return 1 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1971 }
1972 
1973 /// \brief Compare two vectors
1974 /// \tparam T1 vector type
1975 /// \tparam T2 vector type
1976 /// \param vec1 the first vector
1977 /// \param vec2 the second vector
1978 /// \return true if vec1 does not equal vec2, false otherwise
1979 /// \details VecNotEqual() performs a bitwise compare. The vector element types do
1980 /// not matter.
1981 /// \par Wraps
1982 /// vec_all_eq
1983 /// \since Crypto++ 8.0
1984 template <class T1, class T2>
1985 inline bool VecNotEqual(const T1 vec1, const T2 vec2)
1986 {
1987  return 0 == vec_all_eq((uint32x4_p)vec1, (uint32x4_p)vec2);
1988 }
1989 
1990 //@}
1991 
1992 ////////////////// 32-bit Altivec /////////////////
1993 
1994 /// \name 32-BIT ALTIVEC
1995 //@{
1996 
1997 /// \brief Add two vectors as if uint64x2_p
1998 /// \param vec1 the first vector
1999 /// \param vec2 the second vector
2000 /// \return vector
2001 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2002 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2003 /// the carries from the elements.
2004 /// \par Wraps
2005 /// vec_add for POWER8, vec_addc, vec_perm, vec_add for Altivec
2006 /// \since Crypto++ 8.3
2007 inline uint32x4_p VecAdd64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2008 {
2009  // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2010 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2011  return (uint32x4_p)vec_add((uint64x2_p)vec1, (uint64x2_p)vec2);
2012 #else
2013  // The carry mask selects carrys for elements 1 and 3 and sets
2014  // remaining elements to 0. The results is then shifted so the
2015  // carried values are added to elements 0 and 2.
2016 #if defined(CRYPTOPP_BIG_ENDIAN)
2017  const uint32x4_p zero = {0, 0, 0, 0};
2018  const uint32x4_p mask = {0, 1, 0, 1};
2019 #else
2020  const uint32x4_p zero = {0, 0, 0, 0};
2021  const uint32x4_p mask = {1, 0, 1, 0};
2022 #endif
2023 
2024  uint32x4_p cy = vec_addc(vec1, vec2);
2025  uint32x4_p res = vec_add(vec1, vec2);
2026  cy = vec_and(mask, cy);
2027  cy = vec_sld (cy, zero, 4);
2028  return vec_add(res, cy);
2029 #endif
2030 }
2031 
2032 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2033 /// \brief Add two vectors as if uint64x2_p
2034 /// \param vec1 the first vector
2035 /// \param vec2 the second vector
2036 /// \return vector
2037 /// \details VecAdd64() performs <tt>vec1 + vec2</tt>. VecAdd64() performs as
2038 /// if adding two uint64x2_p vectors. On POWER7 and below VecAdd64() manages
2039 /// the carries from the elements.
2040 /// \par Wraps
2041 /// vec_add for POWER8
2042 /// \since Crypto++ 8.3
2043 inline uint64x2_p VecAdd64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2044 {
2045  // 64-bit elements available at POWER7 with VSX, but addudm requires POWER8
2046  const uint64x2_p res = vec_add(vec1, vec2);
2047 
2048 #if defined(CRYPTOPP_DEBUG)
2049  // Test 32-bit add in debug builds while we are here.
2050  const uint32x4_p x = (uint32x4_p)vec1;
2051  const uint32x4_p y = (uint32x4_p)vec2;
2052  const uint32x4_p r = VecAdd64(x, y);
2053 
2054  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2055 #endif
2056 
2057  return res;
2058 }
2059 #endif
2060 
2061 /// \brief Subtract two vectors as if uint64x2_p
2062 /// \param vec1 the first vector
2063 /// \param vec2 the second vector
2064 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2065 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2066 /// manages the borrows from the elements.
2067 /// \par Wraps
2068 /// vec_sub for POWER8, vec_subc, vec_andc, vec_perm, vec_sub for Altivec
2069 /// \since Crypto++ 8.3
2070 inline uint32x4_p VecSub64(const uint32x4_p& vec1, const uint32x4_p& vec2)
2071 {
2072 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2073  // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2074  return (uint32x4_p)vec_sub((uint64x2_p)vec1, (uint64x2_p)vec2);
2075 #else
2076  // The borrow mask selects borrows for elements 1 and 3 and sets
2077  // remaining elements to 0. The results is then shifted so the
2078  // borrowed values are subtracted from elements 0 and 2.
2079 #if defined(CRYPTOPP_BIG_ENDIAN)
2080  const uint32x4_p zero = {0, 0, 0, 0};
2081  const uint32x4_p mask = {0, 1, 0, 1};
2082 #else
2083  const uint32x4_p zero = {0, 0, 0, 0};
2084  const uint32x4_p mask = {1, 0, 1, 0};
2085 #endif
2086 
2087  // subc sets the complement of borrow, so we have to
2088  // un-complement it using andc.
2089  uint32x4_p bw = vec_subc(vec1, vec2);
2090  uint32x4_p res = vec_sub(vec1, vec2);
2091  bw = vec_andc(mask, bw);
2092  bw = vec_sld (bw, zero, 4);
2093  return vec_sub(res, bw);
2094 #endif
2095 }
2096 
2097 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2098 /// \brief Subtract two vectors as if uint64x2_p
2099 /// \param vec1 the first vector
2100 /// \param vec2 the second vector
2101 /// \details VecSub64() performs <tt>vec1 - vec2</tt>. VecSub64() performs as
2102 /// if subtracting two uint64x2_p vectors. On POWER7 and below VecSub64()
2103 /// manages the borrows from the elements.
2104 /// \par Wraps
2105 /// vec_sub for POWER8
2106 /// \since Crypto++ 8.3
2107 inline uint64x2_p VecSub64(const uint64x2_p& vec1, const uint64x2_p& vec2)
2108 {
2109  // 64-bit elements available at POWER7 with VSX, but subudm requires POWER8
2110  const uint64x2_p res = vec_sub(vec1, vec2);
2111 
2112 #if defined(CRYPTOPP_DEBUG)
2113  // Test 32-bit sub in debug builds while we are here.
2114  const uint32x4_p x = (uint32x4_p)vec1;
2115  const uint32x4_p y = (uint32x4_p)vec2;
2116  const uint32x4_p r = VecSub64(x, y);
2117 
2118  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2119 #endif
2120 
2121  return res;
2122 }
2123 #endif
2124 
2125 /// \brief Rotate a vector left as if uint64x2_p
2126 /// \tparam C rotate bit count
2127 /// \param vec the vector
2128 /// \return vector
2129 /// \details VecRotateLeft() rotates each element in a vector by bit count.
2130 /// vec is rotated as if uint64x2_p.
2131 /// \par Wraps
2132 /// vec_rl
2133 /// \since Crypto++ 8.3
2134 template<unsigned int C>
2136 {
2137 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2138  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2139  return (uint32x4_p)VecRotateLeft<C>((uint64x2_p)vec);
2140 #else
2141  // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2142  enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2143 
2144  // Get the low bits, shift them to high bits
2145  uint32x4_p t1 = VecShiftLeft<S32>(vec);
2146  // Get the high bits, shift them to low bits
2147  uint32x4_p t2 = VecShiftRight<32-S32>(vec);
2148 
2149  if (S64 == 0)
2150  {
2151  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2152  return VecPermute(vec, m);
2153  }
2154  else if (S64 == 32)
2155  {
2156  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2157  return VecPermute(vec, m);
2158  }
2159  else if (BR) // Big rotate amount?
2160  {
2161  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2162  t1 = VecPermute(t1, m);
2163  }
2164  else
2165  {
2166  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2167  t2 = VecPermute(t2, m);
2168  }
2169 
2170  return vec_or(t1, t2);
2171 #endif
2172 }
2173 
2174 /// \brief Rotate a vector left as if uint64x2_p
2175 /// \param vec the vector
2176 /// \return vector
2177 /// \details VecRotateLeft<8>() rotates each element in a vector
2178 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2179 /// is used by algorithms like Speck128.
2180 /// \par Wraps
2181 /// vec_rl
2182 /// \since Crypto++ 8.3
2183 template<>
2185 {
2186 #if (CRYPTOPP_BIG_ENDIAN)
2187  const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2188  return VecPermute(vec, m);
2189 #else
2190  const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2191  return VecPermute(vec, m);
2192 #endif
2193 }
2194 
2195 #if defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2196 /// \brief Rotate a vector left as if uint64x2_p
2197 /// \tparam C rotate bit count
2198 /// \param vec the vector
2199 /// \return vector
2200 /// \details VecRotateLeft64() rotates each element in a vector by
2201 /// bit count. vec is rotated as if uint64x2_p.
2202 /// \par Wraps
2203 /// vec_rl
2204 /// \since Crypto++ 8.3
2205 template<unsigned int C>
2207 {
2208  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2209  const uint64x2_p res = VecRotateLeft<C>(vec);
2210 
2211 #if defined(CRYPTOPP_DEBUG)
2212  // Test 32-bit rotate in debug builds while we are here.
2213  const uint32x4_p x = (uint32x4_p)vec;
2214  const uint32x4_p r = VecRotateLeft64<C>(x);
2215 
2216  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2217 #endif
2218 
2219  return res;
2220 }
2221 #endif
2222 
2223 /// \brief Rotate a vector right as if uint64x2_p
2224 /// \tparam C rotate bit count
2225 /// \param vec the vector
2226 /// \return vector
2227 /// \details VecRotateRight64() rotates each element in a vector by
2228 /// bit count. vec is rotated as if uint64x2_p.
2229 /// \par Wraps
2230 /// vec_rl
2231 /// \since Crypto++ 8.3
2232 template<unsigned int C>
2234 {
2235 #if defined(_ARCH_PWR8) && !defined(CRYPTOPP_DEBUG)
2236  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2237  return (uint32x4_p)VecRotateRight<C>((uint64x2_p)vec);
2238 #else
2239  // C=0, 32, or 64 needs special handling. That is S32 and S64 below.
2240  enum {S64=C&63, S32=C&31, BR=(S64>=32)};
2241 
2242  // Get the low bits, shift them to high bits
2243  uint32x4_p t1 = VecShiftRight<S32>(vec);
2244  // Get the high bits, shift them to low bits
2245  uint32x4_p t2 = VecShiftLeft<32-S32>(vec);
2246 
2247  if (S64 == 0)
2248  {
2249  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
2250  return VecPermute(vec, m);
2251  }
2252  else if (S64 == 32)
2253  {
2254  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2255  return VecPermute(vec, m);
2256  }
2257  else if (BR) // Big rotate amount?
2258  {
2259  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2260  t1 = VecPermute(t1, m);
2261  }
2262  else
2263  {
2264  const uint8x16_p m = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
2265  t2 = VecPermute(t2, m);
2266  }
2267 
2268  return vec_or(t1, t2);
2269 #endif
2270 }
2271 
2272 /// \brief Rotate a vector right as if uint64x2_p
2273 /// \param vec the vector
2274 /// \return vector
2275 /// \details VecRotateRight64<8>() rotates each element in a vector
2276 /// by 8-bits. vec is rotated as if uint64x2_p. This specialization
2277 /// is used by algorithms like Speck128.
2278 /// \details vec is rotated as if uint64x2_p.
2279 /// \par Wraps
2280 /// vec_rl
2281 /// \since Crypto++ 8.3
2282 template<>
2284 {
2285 #if (CRYPTOPP_BIG_ENDIAN)
2286  const uint8x16_p m = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
2287  return VecPermute(vec, m);
2288 #else
2289  const uint8x16_p m = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
2290  return VecPermute(vec, m);
2291 #endif
2292 }
2293 
2294 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2295 /// \brief Rotate a vector right as if uint64x2_p
2296 /// \tparam C rotate bit count
2297 /// \param vec the vector
2298 /// \return vector
2299 /// \details VecRotateRight64() rotates each element in a vector by
2300 /// bit count. vec is rotated as if uint64x2_p.
2301 /// \par Wraps
2302 /// vec_rl
2303 /// \since Crypto++ 8.3
2304 template<unsigned int C>
2306 {
2307  // 64-bit elements available at POWER7 with VSX, but vec_rl and vec_sl require POWER8
2308  const uint64x2_p res = VecRotateRight<C>(vec);
2309 
2310 #if defined(CRYPTOPP_DEBUG)
2311  // Test 32-bit rotate in debug builds while we are here.
2312  const uint32x4_p x = (uint32x4_p)vec;
2313  const uint32x4_p r = VecRotateRight64<C>(x);
2314 
2315  CRYPTOPP_ASSERT(vec_all_eq((uint32x4_p)res, r) == 1);
2316 #endif
2317 
2318  return res;
2319 }
2320 #endif
2321 
/// \brief AND two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecAnd64() performs <tt>vec1 &amp; vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecAnd64() is a convenience function that simply performs a VecAnd().
/// \par Wraps
///  vec_and
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecAnd64(const T1 vec1, const T2 vec2)
{
    // Bitwise AND is element-size agnostic; no 64-bit handling required.
    return (T1)vec_and(vec1, (T1)vec2);
}
2340 
/// \brief OR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecOr64() performs <tt>vec1 | vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecOr64() is a convenience function that simply performs a VecOr().
/// \par Wraps
///  vec_or
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecOr64(const T1 vec1, const T2 vec2)
{
    // Bitwise OR is element-size agnostic; no 64-bit handling required.
    return (T1)vec_or(vec1, (T1)vec2);
}
2359 
/// \brief XOR two vectors as if uint64x2_p
/// \tparam T1 vector type
/// \tparam T2 vector type
/// \param vec1 the first vector
/// \param vec2 the second vector
/// \return vector
/// \details VecXor64() performs <tt>vec1 ^ vec2</tt>.
///  vec2 is cast to the same type as vec1. The return vector
///  is the same type as vec1.
/// \details VecXor64() is a convenience function that simply performs a VecXor().
/// \par Wraps
///  vec_xor
/// \since Crypto++ 8.3
template <class T1, class T2>
inline T1 VecXor64(const T1 vec1, const T2 vec2)
{
    // Bitwise XOR is element-size agnostic; no 64-bit handling required.
    return (T1)vec_xor(vec1, (T1)vec2);
}
2378 
2379 /// \brief Broadcast 64-bit double word to a vector
2380 /// \param val the 64-bit value
2381 /// \return vector
2382 /// \par Wraps
2383 /// vec_splats
2384 /// \since Crypto++ 8.3
2385 inline uint32x4_p VecSplatWord64(word64 val)
2386 {
2387 #if defined(_ARCH_PWR8)
2388  // The PPC64 ABI says so.
2389  return (uint32x4_p)vec_splats((unsigned long long)val);
2390 #else
2391  const word64 x[2] = {val,val};
2392  return (uint32x4_p)VecLoad((const word32*)x);
2393 #endif
2394 }
2395 
2396 /// \brief Broadcast 64-bit element to a vector as if uint64x2_p
2397 /// \tparam the element number
2398 /// \param val the 64-bit value
2399 /// \return vector
2400 /// \par Wraps
2401 /// vec_splat
2402 /// \since Crypto++ 8.3
2403 template <unsigned int N>
2405 {
2406 #if defined(__VSX__) || defined(_ARCH_PWR8)
2407  return (uint32x4_p)vec_splat((uint64x2_p)val, N);
2408 #else
2409  enum {E=N&1};
2410  if (E == 0)
2411  {
2412  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7};
2413  return (uint32x4_p)vec_perm(val, val, m);
2414  }
2415  else // (E == 1)
2416  {
2417  const uint8x16_p m = {8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15};
2418  return (uint32x4_p)vec_perm(val, val, m);
2419  }
2420 #endif
2421 }
2422 
2423 #if defined(__VSX__) || defined(_ARCH_PWR8) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2424 /// \brief Broadcast 64-bit element to a vector
2425 /// \tparam the element number
2426 /// \param val the 64-bit value
2427 /// \return vector
2428 /// \since Crypto++ 8.3
2429 template <unsigned int N>
2431 {
2432  return vec_splat(val, N);
2433 }
2434 #endif
2435 
2436 //@}
2437 
2438 //////////////////////// Power8 Crypto ////////////////////////
2439 
2440 // __CRYPTO__ alone is not enough. Clang will define __CRYPTO__
2441 // when it is not available, like with Power7. Sigh...
2442 #if (defined(_ARCH_PWR8) && defined(__CRYPTO__)) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
2443 
2444 /// \name POLYNOMIAL MULTIPLICATION
2445 //@{
2446 
2447 /// \brief Polynomial multiplication
2448 /// \param a the first term
2449 /// \param b the second term
2450 /// \return vector product
2451 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2452 /// polynomial multiplication multiplies the high and low terms, and then
2453 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2454 /// al*bl</tt>. It is different behavior than Intel polynomial
2455 /// multiplication. To obtain a single product without the XOR, then set
2456 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2457 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2458 /// \par Wraps
2459 /// __vpmsumw, __builtin_altivec_crypto_vpmsumw and __builtin_crypto_vpmsumw.
2460 /// \since Crypto++ 8.1
2462 {
2463 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2464  return __vpmsumw (a, b);
2465 #elif defined(__clang__)
2466  return __builtin_altivec_crypto_vpmsumw (a, b);
2467 #else
2468  return __builtin_crypto_vpmsumw (a, b);
2469 #endif
2470 }
2471 
2472 /// \brief Polynomial multiplication
2473 /// \param a the first term
2474 /// \param b the second term
2475 /// \return vector product
2476 /// \details VecPolyMultiply() performs polynomial multiplication. POWER8
2477 /// polynomial multiplication multiplies the high and low terms, and then
2478 /// XOR's the high and low products. That is, the result is <tt>ah*bh XOR
2479 /// al*bl</tt>. It is different behavior than Intel polynomial
2480 /// multiplication. To obtain a single product without the XOR, then set
2481 /// one of the high or low terms to 0. For example, setting <tt>ah=0</tt>
2482 /// results in <tt>0*bh XOR al*bl = al*bl</tt>.
2483 /// \par Wraps
2484 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2485 /// \since Crypto++ 8.1
2487 {
2488 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2489  return __vpmsumd (a, b);
2490 #elif defined(__clang__)
2491  return __builtin_altivec_crypto_vpmsumd (a, b);
2492 #else
2493  return __builtin_crypto_vpmsumd (a, b);
2494 #endif
2495 }
2496 
2497 /// \brief Polynomial multiplication
2498 /// \param a the first term
2499 /// \param b the second term
2500 /// \return vector product
2501 /// \details VecIntelMultiply00() performs polynomial multiplication and presents
2502 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
2503 /// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
2504 /// are multiplied.
2505 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2506 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
2507 /// \par Wraps
2508 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2509 /// \since Crypto++ 8.0
2511 {
2512 #if defined(CRYPTOPP_BIG_ENDIAN)
2514 #else
2515  return VecPolyMultiply(VecGetHigh(a), VecGetHigh(b));
2516 #endif
2517 }
2518 
2519 /// \brief Polynomial multiplication
2520 /// \param a the first term
2521 /// \param b the second term
2522 /// \return vector product
2523 /// \details VecIntelMultiply01 performs() polynomial multiplication and presents
2524 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
2525 /// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
2526 /// 64-bits of <tt>b</tt> are multiplied.
2527 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2528 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
2529 /// \par Wraps
2530 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2531 /// \since Crypto++ 8.0
2533 {
2534 #if defined(CRYPTOPP_BIG_ENDIAN)
2535  return VecSwapWords(VecPolyMultiply(a, VecGetHigh(b)));
2536 #else
2537  return VecPolyMultiply(a, VecGetHigh(b));
2538 #endif
2539 }
2540 
2541 /// \brief Polynomial multiplication
2542 /// \param a the first term
2543 /// \param b the second term
2544 /// \return vector product
2545 /// \details VecIntelMultiply10() performs polynomial multiplication and presents
2546 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
2547 /// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
2548 /// 64-bits of <tt>b</tt> are multiplied.
2549 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2550 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
2551 /// \par Wraps
2552 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2553 /// \since Crypto++ 8.0
2555 {
2556 #if defined(CRYPTOPP_BIG_ENDIAN)
2557  return VecSwapWords(VecPolyMultiply(VecGetHigh(a), b));
2558 #else
2559  return VecPolyMultiply(VecGetHigh(a), b);
2560 #endif
2561 }
2562 
2563 /// \brief Polynomial multiplication
2564 /// \param a the first term
2565 /// \param b the second term
2566 /// \return vector product
2567 /// \details VecIntelMultiply11() performs polynomial multiplication and presents
2568 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
2569 /// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
2570 /// are multiplied.
2571 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
2572 /// is MSB and numbered 127, while the the rightmost bit is LSB and numbered 0.
2573 /// \par Wraps
2574 /// __vpmsumd, __builtin_altivec_crypto_vpmsumd and __builtin_crypto_vpmsumd.
2575 /// \since Crypto++ 8.0
2577 {
2578 #if defined(CRYPTOPP_BIG_ENDIAN)
2579  return VecSwapWords(VecPolyMultiply(VecGetLow(a), b));
2580 #else
2581  return VecPolyMultiply(VecGetLow(a), b);
2582 #endif
2583 }
2584 
2585 //@}
2586 
2587 /// \name AES ENCRYPTION
2588 //@{
2589 
2590 /// \brief One round of AES encryption
2591 /// \tparam T1 vector type
2592 /// \tparam T2 vector type
2593 /// \param state the state vector
2594 /// \param key the subkey vector
2595 /// \details VecEncrypt() performs one round of AES encryption of state
2596 /// using subkey key. The return vector is the same type as state.
2597 /// \details VecEncrypt() is available on POWER8 and above.
2598 /// \par Wraps
2599 /// __vcipher, __builtin_altivec_crypto_vcipher, __builtin_crypto_vcipher
2600 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2601 template <class T1, class T2>
2602 inline T1 VecEncrypt(const T1 state, const T2 key)
2603 {
2604 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2605  return (T1)__vcipher((uint8x16_p)state, (uint8x16_p)key);
2606 #elif defined(__clang__)
2607  return (T1)__builtin_altivec_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2608 #elif defined(__GNUC__)
2609  return (T1)__builtin_crypto_vcipher((uint64x2_p)state, (uint64x2_p)key);
2610 #else
2611  CRYPTOPP_ASSERT(0);
2612 #endif
2613 }
2614 
2615 /// \brief Final round of AES encryption
2616 /// \tparam T1 vector type
2617 /// \tparam T2 vector type
2618 /// \param state the state vector
2619 /// \param key the subkey vector
2620 /// \details VecEncryptLast() performs the final round of AES encryption
2621 /// of state using subkey key. The return vector is the same type as state.
2622 /// \details VecEncryptLast() is available on POWER8 and above.
2623 /// \par Wraps
2624 /// __vcipherlast, __builtin_altivec_crypto_vcipherlast, __builtin_crypto_vcipherlast
2625 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2626 template <class T1, class T2>
2627 inline T1 VecEncryptLast(const T1 state, const T2 key)
2628 {
2629 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2630  return (T1)__vcipherlast((uint8x16_p)state, (uint8x16_p)key);
2631 #elif defined(__clang__)
2632  return (T1)__builtin_altivec_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2633 #elif defined(__GNUC__)
2634  return (T1)__builtin_crypto_vcipherlast((uint64x2_p)state, (uint64x2_p)key);
2635 #else
2636  CRYPTOPP_ASSERT(0);
2637 #endif
2638 }
2639 
2640 /// \brief One round of AES decryption
2641 /// \tparam T1 vector type
2642 /// \tparam T2 vector type
2643 /// \param state the state vector
2644 /// \param key the subkey vector
2645 /// \details VecDecrypt() performs one round of AES decryption of state
2646 /// using subkey key. The return vector is the same type as state.
2647 /// \details VecDecrypt() is available on POWER8 and above.
2648 /// \par Wraps
2649 /// __vncipher, __builtin_altivec_crypto_vncipher, __builtin_crypto_vncipher
2650 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2651 template <class T1, class T2>
2652 inline T1 VecDecrypt(const T1 state, const T2 key)
2653 {
2654 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2655  return (T1)__vncipher((uint8x16_p)state, (uint8x16_p)key);
2656 #elif defined(__clang__)
2657  return (T1)__builtin_altivec_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2658 #elif defined(__GNUC__)
2659  return (T1)__builtin_crypto_vncipher((uint64x2_p)state, (uint64x2_p)key);
2660 #else
2661  CRYPTOPP_ASSERT(0);
2662 #endif
2663 }
2664 
2665 /// \brief Final round of AES decryption
2666 /// \tparam T1 vector type
2667 /// \tparam T2 vector type
2668 /// \param state the state vector
2669 /// \param key the subkey vector
2670 /// \details VecDecryptLast() performs the final round of AES decryption
2671 /// of state using subkey key. The return vector is the same type as state.
2672 /// \details VecDecryptLast() is available on POWER8 and above.
2673 /// \par Wraps
2674 /// __vncipherlast, __builtin_altivec_crypto_vncipherlast, __builtin_crypto_vncipherlast
2675 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2676 template <class T1, class T2>
2677 inline T1 VecDecryptLast(const T1 state, const T2 key)
2678 {
2679 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2680  return (T1)__vncipherlast((uint8x16_p)state, (uint8x16_p)key);
2681 #elif defined(__clang__)
2682  return (T1)__builtin_altivec_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2683 #elif defined(__GNUC__)
2684  return (T1)__builtin_crypto_vncipherlast((uint64x2_p)state, (uint64x2_p)key);
2685 #else
2686  CRYPTOPP_ASSERT(0);
2687 #endif
2688 }
2689 
2690 //@}
2691 
2692 /// \name SHA DIGESTS
2693 //@{
2694 
2695 /// \brief SHA256 Sigma functions
2696 /// \tparam func function
2697 /// \tparam fmask function mask
2698 /// \tparam T vector type
2699 /// \param data the block to transform
2700 /// \details VecSHA256() selects sigma0, sigma1, Sigma0, Sigma1 based on
2701 /// func and fmask. The return vector is the same type as data.
2702 /// \details VecSHA256() is available on POWER8 and above.
2703 /// \par Wraps
2704 /// __vshasigmaw, __builtin_altivec_crypto_vshasigmaw, __builtin_crypto_vshasigmaw
2705 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2706 template <int func, int fmask, class T>
2707 inline T VecSHA256(const T data)
2708 {
2709 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2710  return (T)__vshasigmaw((uint32x4_p)data, func, fmask);
2711 #elif defined(__clang__)
2712  return (T)__builtin_altivec_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2713 #elif defined(__GNUC__)
2714  return (T)__builtin_crypto_vshasigmaw((uint32x4_p)data, func, fmask);
2715 #else
2716  CRYPTOPP_ASSERT(0);
2717 #endif
2718 }
2719 
2720 /// \brief SHA512 Sigma functions
2721 /// \tparam func function
2722 /// \tparam fmask function mask
2723 /// \tparam T vector type
2724 /// \param data the block to transform
2725 /// \details VecSHA512() selects sigma0, sigma1, Sigma0, Sigma1 based on
2726 /// func and fmask. The return vector is the same type as data.
2727 /// \details VecSHA512() is available on POWER8 and above.
2728 /// \par Wraps
2729 /// __vshasigmad, __builtin_altivec_crypto_vshasigmad, __builtin_crypto_vshasigmad
2730 /// \since GCC and XLC since Crypto++ 6.0, LLVM Clang since Crypto++ 8.0
2731 template <int func, int fmask, class T>
2732 inline T VecSHA512(const T data)
2733 {
2734 #if defined(__ibmxl__) || (defined(_AIX) && defined(__xlC__))
2735  return (T)__vshasigmad((uint64x2_p)data, func, fmask);
2736 #elif defined(__clang__)
2737  return (T)__builtin_altivec_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2738 #elif defined(__GNUC__)
2739  return (T)__builtin_crypto_vshasigmad((uint64x2_p)data, func, fmask);
2740 #else
2741  CRYPTOPP_ASSERT(0);
2742 #endif
2743 }
2744 
2745 //@}
2746 
2747 #endif // __CRYPTO__
2748 
2749 #endif // _ALTIVEC_
2750 
2751 NAMESPACE_END
2752 
2753 #if CRYPTOPP_GCC_DIAGNOSTIC_AVAILABLE
2754 # pragma GCC diagnostic pop
2755 #endif
2756 
2757 #endif // CRYPTOPP_PPC_CRYPTO_H
VecAnd
T1 VecAnd(const T1 vec1, const T2 vec2)
AND two vectors.
Definition: ppc_simd.h:1372
VecSplatWord64
uint32x4_p VecSplatWord64(word64 val)
Broadcast 64-bit double word to a vector.
Definition: ppc_simd.h:2385
VecRotateLeft64
uint32x4_p VecRotateLeft64(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition: ppc_simd.h:2135
VecIntelMultiply11
uint64x2_p VecIntelMultiply11(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2576
NCONST_V8_CAST
#define NCONST_V8_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:159
VecXor64
T1 VecXor64(const T1 vec1, const T2 vec2)
XOR two vectors as if uint64x2_p.
Definition: ppc_simd.h:2374
VecRotateLeft
uint32x4_p VecRotateLeft(const uint32x4_p vec)
Rotate a vector left.
Definition: ppc_simd.h:1656
VecPolyMultiply
uint32x4_p VecPolyMultiply(const uint32x4_p &a, const uint32x4_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2461
uint64x2_p
__vector unsigned long long uint64x2_p
Vector of 64-bit elements.
Definition: ppc_simd.h:208
uint8x16_p
__vector unsigned char uint8x16_p
Vector of 8-bit elements.
Definition: ppc_simd.h:188
VecIntelMultiply00
uint64x2_p VecIntelMultiply00(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2510
VecStoreBE
void VecStoreBE(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:1227
VecSplatWord
uint32x4_p VecSplatWord(word32 val)
Broadcast 32-bit word to a vector.
Definition: ppc_simd.h:1826
VecSub
T1 VecSub(const T1 vec1, const T2 vec2)
Subtract two vectors.
Definition: ppc_simd.h:1452
VecGetHigh
T VecGetHigh(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1928
NCONST_V32_CAST
#define NCONST_V32_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:165
VecStore_ALTIVEC
void VecStore_ALTIVEC(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:813
VecLoadAligned
uint32x4_p VecLoadAligned(const byte src[16])
Loads a vector from an aligned byte array.
Definition: ppc_simd.h:556
VecSHA512
T VecSHA512(const T data)
SHA512 Sigma functions.
Definition: ppc_simd.h:2732
VecOr64
T1 VecOr64(const T1 vec1, const T2 vec2)
OR two vectors as if uint64x2_p.
Definition: ppc_simd.h:2355
CRYPTOPP_ASSERT
#define CRYPTOPP_ASSERT(exp)
Debugging and diagnostic assertion.
Definition: trap.h:68
VecStore
void VecStore(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:891
VecDecryptLast
T1 VecDecryptLast(const T1 state, const T2 key)
Final round of AES decryption.
Definition: ppc_simd.h:2677
VecAnd64
T1 VecAnd64(const T1 vec1, const T2 vec2)
AND two vectors as if uint64x2_p.
Definition: ppc_simd.h:2336
CONST_V32_CAST
#define CONST_V32_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:147
VecRotateRightOctet
T VecRotateRightOctet(const T vec)
Rotate a vector right.
Definition: ppc_simd.h:1635
uint32x4_p
__vector unsigned int uint32x4_p
Vector of 32-bit elements.
Definition: ppc_simd.h:198
VecZero
uint32x4_p VecZero()
The 0 vector.
Definition: ppc_simd.h:214
uint16x8_p
__vector unsigned short uint16x8_p
Vector of 16-bit elements.
Definition: ppc_simd.h:193
VecSplatElement64
uint32x4_p VecSplatElement64(const uint32x4_p val)
Broadcast 64-bit element to a vector as if uint64x2_p.
Definition: ppc_simd.h:2404
VecGetLow
T VecGetLow(const T val)
Extract a dword from a vector.
Definition: ppc_simd.h:1906
VecMergeLow
T VecMergeLow(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1801
VecShiftRightOctet
T VecShiftRightOctet(const T vec)
Shift a vector right.
Definition: ppc_simd.h:1571
misc.h
Utility functions for the Crypto++ library.
VecStoreAligned
void VecStoreAligned(const T data, byte dest[16])
Stores a vector to a byte array.
Definition: ppc_simd.h:1096
VecSwapWords
T VecSwapWords(const T vec)
Exchange high and low double words.
Definition: ppc_simd.h:1946
VecRotateRight64< 8 >
uint32x4_p VecRotateRight64< 8 >(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition: ppc_simd.h:2283
VecReverseBE
T VecReverseBE(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:279
VecMergeHigh
T VecMergeHigh(const T vec1, const T vec2)
Merge two vectors.
Definition: ppc_simd.h:1815
CONST_V8_CAST
#define CONST_V8_CAST(x)
Cast array to vector pointer.
Definition: ppc_simd.h:141
VecLoad_ALTIVEC
uint32x4_p VecLoad_ALTIVEC(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:304
VecLoad
uint32x4_p VecLoad(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:365
VecLoadBE
uint32x4_p VecLoadBE(const byte src[16])
Loads a vector from a byte array.
Definition: ppc_simd.h:738
VecAdd64
uint32x4_p VecAdd64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Add two vectors as if uint64x2_p.
Definition: ppc_simd.h:2007
VecSplatElement
uint32x4_p VecSplatElement(const uint32x4_p val)
Broadcast 32-bit element to a vector.
Definition: ppc_simd.h:1848
VecRotateLeft64< 8 >
uint32x4_p VecRotateLeft64< 8 >(const uint32x4_p vec)
Rotate a vector left as if uint64x2_p.
Definition: ppc_simd.h:2184
VecOr
T1 VecOr(const T1 vec1, const T2 vec2)
OR two vectors.
Definition: ppc_simd.h:1391
VecIntelMultiply10
uint64x2_p VecIntelMultiply10(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2554
VecRotateRight
uint32x4_p VecRotateRight(const uint32x4_p vec)
Rotate a vector right.
Definition: ppc_simd.h:1672
VecShiftLeft
uint32x4_p VecShiftLeft(const uint32x4_p vec)
Shift a vector left.
Definition: ppc_simd.h:1688
VecReverse
T VecReverse(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:238
VecXor
T1 VecXor(const T1 vec1, const T2 vec2)
XOR two vectors.
Definition: ppc_simd.h:1410
VecEqual
bool VecEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1968
VecRotateLeftOctet
T VecRotateLeftOctet(const T vec)
Rotate a vector left.
Definition: ppc_simd.h:1610
VecSub64
uint32x4_p VecSub64(const uint32x4_p &vec1, const uint32x4_p &vec2)
Subtract two vectors as if uint64x2_p.
Definition: ppc_simd.h:2070
VecSHA256
T VecSHA256(const T data)
SHA256 Sigma functions.
Definition: ppc_simd.h:2707
VecShiftRight
uint32x4_p VecShiftRight(const uint32x4_p vec)
Shift a vector right.
Definition: ppc_simd.h:1704
VecRotateRight64
uint32x4_p VecRotateRight64(const uint32x4_p vec)
Rotate a vector right as if uint64x2_p.
Definition: ppc_simd.h:2233
VecEncryptLast
T1 VecEncryptLast(const T1 state, const T2 key)
Final round of AES encryption.
Definition: ppc_simd.h:2627
CryptoPP
Crypto++ library namespace.
VecOne
uint32x4_p VecOne()
The 1 vector.
Definition: ppc_simd.h:223
config.h
Library configuration file.
VecPermute
T1 VecPermute(const T1 vec, const T2 mask)
Permutes a vector.
Definition: ppc_simd.h:1474
VecNotEqual
bool VecNotEqual(const T1 vec1, const T2 vec2)
Compare two vectors.
Definition: ppc_simd.h:1985
VecDecrypt
T1 VecDecrypt(const T1 state, const T2 key)
One round of AES decryption.
Definition: ppc_simd.h:2652
VecIntelMultiply01
uint64x2_p VecIntelMultiply01(const uint64x2_p &a, const uint64x2_p &b)
Polynomial multiplication.
Definition: ppc_simd.h:2532
VecAdd
T1 VecAdd(const T1 vec1, const T2 vec2)
Add two vectors.
Definition: ppc_simd.h:1434
VecShiftLeftOctet
T VecShiftLeftOctet(const T vec)
Shift a vector left.
Definition: ppc_simd.h:1524
VecEncrypt
T1 VecEncrypt(const T1 state, const T2 key)
One round of AES encryption.
Definition: ppc_simd.h:2602
VecReverseLE
T VecReverseLE(const T data)
Reverse bytes in a vector.
Definition: ppc_simd.h:259