#if (CRYPTOPP_SSSE3_AVAILABLE)
# include "adv_simd.h"
# include <pmmintrin.h>
# include <tmmintrin.h>
#endif

#if defined(__XOP__)
# include <ammintrin.h>
# if defined(__GNUC__)
#  include <x86intrin.h>
# endif
#endif

#if (CRYPTOPP_ARM_NEON_HEADER)
# include "adv_simd.h"
# include <arm_neon.h>
#endif

#if (CRYPTOPP_ARM_ACLE_HEADER)
# include <stdint.h>
# include <arm_acle.h>
#endif

#if (CRYPTOPP_ALTIVEC_AVAILABLE)
# include "adv_simd.h"
# include "ppc_simd.h"
#endif

// Squash MS LNK4221 and libtool warnings
extern const char SIMON128_SIMD_FNAME[] = __FILE__;

ANONYMOUS_NAMESPACE_BEGIN

using CryptoPP::word32;
using CryptoPP::word64;

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
// vld1q_dup_u64 is missing from Microsoft's ARM A-32 intrinsics
#if defined(_MSC_VER) && !defined(_M_ARM64)
inline uint64x2_t vld1q_dup_u64(const uint64_t* ptr)
{
    return vmovq_n_u64(*ptr);
}
#endif
template <class T>
inline T UnpackHigh64(const T& a, const T& b)
{
    const uint64x1_t x(vget_high_u64((uint64x2_t)a));
    const uint64x1_t y(vget_high_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}

template <class T>
inline T UnpackLow64(const T& a, const T& b)
{
    const uint64x1_t x(vget_low_u64((uint64x2_t)a));
    const uint64x1_t y(vget_low_u64((uint64x2_t)b));
    return (T)vcombine_u64(x, y);
}
template <unsigned int R>
inline uint64x2_t RotateLeft64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, R));
    const uint64x2_t b(vshrq_n_u64(val, 64 - R));
    return vorrq_u64(a, b);
}

template <unsigned int R>
inline uint64x2_t RotateRight64(const uint64x2_t& val)
{
    const uint64x2_t a(vshlq_n_u64(val, 64 - R));
    const uint64x2_t b(vshrq_n_u64(val, R));
    return vorrq_u64(a, b);
}
#if defined(__aarch32__) || defined(__aarch64__)
// Faster than two shifts and an OR
template <>
inline uint64x2_t RotateLeft64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 7,0,1,2, 3,4,5,6, 15,8,9,10, 11,12,13,14 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}

// Faster than two shifts and an OR
template <>
inline uint64x2_t RotateRight64<8>(const uint64x2_t& val)
{
    const uint8_t maskb[16] = { 1,2,3,4, 5,6,7,0, 9,10,11,12, 13,14,15,8 };
    const uint8x16_t mask = vld1q_u8(maskb);

    return vreinterpretq_u64_u8(
        vqtbl1q_u8(vreinterpretq_u8_u64(val), mask));
}
#endif
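// The RotateLeft64<8>/RotateRight64<8> specializations above implement a
// 64-bit rotate by 8 as a byte permutation: rotating each 64-bit lane by
// one byte simply moves every byte one position within its lane, which a
// single table lookup (vqtbl1q_u8) performs more cheaply than the generic
// shift/shift/OR sequence.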
inline uint64x2_t SIMON128_f(const uint64x2_t& val)
{
    return veorq_u64(RotateLeft64<2>(val),
        vandq_u64(RotateLeft64<1>(val), RotateLeft64<8>(val)));
}
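// SIMON128 round structure, as used by the helpers below: with
// f(x) = ROL64(x,2) ^ (ROL64(x,1) & ROL64(x,8)), one round computes
// y ^= f(x) ^ k and then swaps x and y. The loops apply two rounds per
// iteration so the swap is implicit; only an odd trailing round needs an
// explicit swap.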
inline void SIMON128_Enc_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys+rounds-1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
inline void SIMON128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk1);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk1);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk2);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk2);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}
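// The 6-block variants run three independent x/y register pairs through the
// same key sequence. The extra parallel work helps hide the latency of the
// rotate/XOR chains, and the block driver can hand these routines six blocks
// at a time when enough input is queued.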
inline void SIMON128_Dec_Block(uint64x2_t &block0, uint64x2_t &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);

    if (rounds & 1)
    {
        std::swap(x1, y1);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys+i+1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys+i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
}
inline void SIMON128_Dec_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
    uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4, uint64x2_t &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    uint64x2_t x1 = UnpackHigh64(block0, block1);
    uint64x2_t y1 = UnpackLow64(block0, block1);
    uint64x2_t x2 = UnpackHigh64(block2, block3);
    uint64x2_t y2 = UnpackLow64(block2, block3);
    uint64x2_t x3 = UnpackHigh64(block4, block5);
    uint64x2_t y3 = UnpackLow64(block4, block5);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
        const uint64x2_t rk = vld1q_dup_u64(subkeys + rounds - 1);

        y1 = veorq_u64(veorq_u64(y1, rk), SIMON128_f(x1));
        y2 = veorq_u64(veorq_u64(y2, rk), SIMON128_f(x2));
        y3 = veorq_u64(veorq_u64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const uint64x2_t rk1 = vld1q_dup_u64(subkeys + i + 1);
        x1 = veorq_u64(veorq_u64(x1, SIMON128_f(y1)), rk1);
        x2 = veorq_u64(veorq_u64(x2, SIMON128_f(y2)), rk1);
        x3 = veorq_u64(veorq_u64(x3, SIMON128_f(y3)), rk1);

        const uint64x2_t rk2 = vld1q_dup_u64(subkeys + i);
        y1 = veorq_u64(veorq_u64(y1, SIMON128_f(x1)), rk2);
        y2 = veorq_u64(veorq_u64(y2, SIMON128_f(x2)), rk2);
        y3 = veorq_u64(veorq_u64(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = UnpackLow64(y1, x1);
    block1 = UnpackHigh64(y1, x1);
    block2 = UnpackLow64(y2, x2);
    block3 = UnpackHigh64(y2, x2);
    block4 = UnpackLow64(y3, x3);
    block5 = UnpackHigh64(y3, x3);
}

#endif // CRYPTOPP_ARM_NEON_AVAILABLE
// ***************************** IA-32 ***************************** //

#if (CRYPTOPP_SSSE3_AVAILABLE)

#ifndef M128_CAST
# define M128_CAST(x) ((__m128i *)(void *)(x))
#endif
#ifndef CONST_M128_CAST
# define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
#endif

#ifndef DOUBLE_CAST
# define DOUBLE_CAST(x) ((double *)(void *)(x))
#endif
#ifndef CONST_DOUBLE_CAST
# define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
#endif
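// The cast helpers above route pointer conversions through void*, which
// keeps the compiler quiet when the word64 round-key table is fed to the
// __m128i and double based SSE load intrinsics below.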
inline void Swap128(__m128i& a, __m128i& b)
{
#if defined(__SUNPRO_CC) && (__SUNPRO_CC <= 0x5120)
    // __m128i is an unsigned long long[2]; older SunCC cannot consume
    // std::swap on it, so use the library's vec_swap instead.
    vec_swap(a, b);
#else
    std::swap(a, b);
#endif
}
template <unsigned int R>
inline __m128i RotateLeft64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, R), _mm_srli_epi64(val, 64-R));
#endif
}

template <unsigned int R>
inline __m128i RotateRight64(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-R);
#else
    return _mm_or_si128(
        _mm_slli_epi64(val, 64-R), _mm_srli_epi64(val, R));
#endif
}
// Faster than two shifts and an OR
template <>
__m128i RotateLeft64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 8);
#else
    const __m128i mask = _mm_set_epi8(14,13,12,11, 10,9,8,15, 6,5,4,3, 2,1,0,7);
    return _mm_shuffle_epi8(val, mask);
#endif
}

// Faster than two shifts and an OR
template <>
__m128i RotateRight64<8>(const __m128i& val)
{
#if defined(__XOP__)
    return _mm_roti_epi64(val, 64-8);
#else
    const __m128i mask = _mm_set_epi8(8,15,14,13, 12,11,10,9, 0,7,6,5, 4,3,2,1);
    return _mm_shuffle_epi8(val, mask);
#endif
}
inline __m128i SIMON128_f(const __m128i& v)
{
    return _mm_xor_si128(RotateLeft64<2>(v),
        _mm_and_si128(RotateLeft64<1>(v), RotateLeft64<8>(v)));
}
inline void SIMON128_Enc_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        // Round keys are pre-splatted for encryption
        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);

        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        Swap128(x1, y1);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void SIMON128_Enc_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1) - 1; i += 2)
    {
        // Round keys are pre-splatted for encryption
        const __m128i rk1 = _mm_load_si128(CONST_M128_CAST(subkeys+i*2));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk1);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk1);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk1);

        const __m128i rk2 = _mm_load_si128(CONST_M128_CAST(subkeys+(i+1)*2));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk2);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk2);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const __m128i rk = _mm_load_si128(CONST_M128_CAST(subkeys+(rounds-1)*2));

        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk);
        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}
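// Note the key-table layouts differ by direction: the encryption routines
// above index the subkeys at i*2 and use aligned 16-byte loads, i.e. each
// round key is stored pre-splatted across a full slot, while the decryption
// routines below broadcast a single word64 on the fly with _mm_loaddup_pd.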
inline void SIMON128_Dec_Block(__m128i &block0, __m128i &block1,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i+1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys+i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
}
inline void SIMON128_Dec_6_Blocks(__m128i &block0, __m128i &block1,
    __m128i &block2, __m128i &block3, __m128i &block4, __m128i &block5,
    const word64 *subkeys, unsigned int rounds)
{
    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    __m128i x1 = _mm_unpackhi_epi64(block0, block1);
    __m128i y1 = _mm_unpacklo_epi64(block0, block1);
    __m128i x2 = _mm_unpackhi_epi64(block2, block3);
    __m128i y2 = _mm_unpacklo_epi64(block2, block3);
    __m128i x3 = _mm_unpackhi_epi64(block4, block5);
    __m128i y3 = _mm_unpacklo_epi64(block4, block5);

    if (rounds & 1)
    {
        const __m128i rk = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + rounds - 1)));

        Swap128(x1, y1); Swap128(x2, y2); Swap128(x3, y3);
        y1 = _mm_xor_si128(_mm_xor_si128(y1, rk), SIMON128_f(x1));
        y2 = _mm_xor_si128(_mm_xor_si128(y2, rk), SIMON128_f(x2));
        y3 = _mm_xor_si128(_mm_xor_si128(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const __m128i rk1 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i + 1)));
        x1 = _mm_xor_si128(_mm_xor_si128(x1, SIMON128_f(y1)), rk1);
        x2 = _mm_xor_si128(_mm_xor_si128(x2, SIMON128_f(y2)), rk1);
        x3 = _mm_xor_si128(_mm_xor_si128(x3, SIMON128_f(y3)), rk1);

        const __m128i rk2 = _mm_castpd_si128(
            _mm_loaddup_pd(CONST_DOUBLE_CAST(subkeys + i)));
        y1 = _mm_xor_si128(_mm_xor_si128(y1, SIMON128_f(x1)), rk2);
        y2 = _mm_xor_si128(_mm_xor_si128(y2, SIMON128_f(x2)), rk2);
        y3 = _mm_xor_si128(_mm_xor_si128(y3, SIMON128_f(x3)), rk2);
    }

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = _mm_unpacklo_epi64(y1, x1);
    block1 = _mm_unpackhi_epi64(y1, x1);
    block2 = _mm_unpacklo_epi64(y2, x2);
    block3 = _mm_unpackhi_epi64(y2, x2);
    block4 = _mm_unpacklo_epi64(y3, x3);
    block5 = _mm_unpackhi_epi64(y3, x3);
}

#endif // CRYPTOPP_SSSE3_AVAILABLE
// ***************************** Altivec ***************************** //

#if (CRYPTOPP_ALTIVEC_AVAILABLE)

using CryptoPP::uint8x16_p;
using CryptoPP::uint32x4_p;
#if defined(_ARCH_PWR8)
using CryptoPP::uint64x2_p;
#endif

using CryptoPP::VecAnd64;
using CryptoPP::VecXor64;
using CryptoPP::VecLoad;
using CryptoPP::VecLoadAligned;
using CryptoPP::VecPermute;
using CryptoPP::VecRotateLeft64;
using CryptoPP::VecSplatElement64;

// Use the native 64-bit vector type on POWER8 and later; fall back to
// uint32x4_p with the 64-bit helper overloads on older Altivec targets.
#if defined(_ARCH_PWR8)
#define simon128_t uint64x2_p
#else
#define simon128_t uint32x4_p
#endif

inline simon128_t SIMON128_f(const simon128_t val)
{
    return (simon128_t)VecXor64(VecRotateLeft64<2>(val),
        VecAnd64(VecRotateLeft64<1>(val), VecRotateLeft64<8>(val)));
}
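// The Vec*64 helpers from ppc_simd.h give 64-bit lane semantics on both
// targets: on POWER8 and later simon128_t is the native uint64x2_p, while
// on 32-bit Altivec the same calls provide the 64-bit behaviour on top of
// uint32x4_p.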
inline void SIMON128_Enc_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    simon128_t x1 = (simon128_t)VecPermute(block, block, m1);
    simon128_t y1 = (simon128_t)VecPermute(block, block, m2);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        // Round keys are pre-splatted for encryption
        const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2);
        const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1);
        const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2);
        const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1);
        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2);
    }

    if (rounds & 1)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2);
        const simon128_t rk = (simon128_t)VecLoadAligned(ptr);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk);
        std::swap(x1, y1);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}
inline void SIMON128_Dec_Block(uint32x4_p &block, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    simon128_t x1 = (simon128_t)VecPermute(block, block, m1);
    simon128_t y1 = (simon128_t)VecPermute(block, block, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1);

        const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);

        y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
        const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1);
        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block = (uint32x4_p)VecPermute(x1, y1, m3);
}
inline void SIMON128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1);
    simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2);
    simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1);
    simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2);
    simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1);
    simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2);

    for (size_t i = 0; i < static_cast<size_t>(rounds & ~1)-1; i += 2)
    {
        // Round keys are pre-splatted for encryption
        const word32* ptr1 = reinterpret_cast<const word32*>(subkeys+i*2);
        const simon128_t rk1 = (simon128_t)VecLoadAligned(ptr1);
        const word32* ptr2 = reinterpret_cast<const word32*>(subkeys+(i+1)*2);
        const simon128_t rk2 = (simon128_t)VecLoadAligned(ptr2);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk1);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk1);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk1);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk2);
        x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk2);
        x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk2);
    }

    if (rounds & 1)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+(rounds-1)*2);
        const simon128_t rk = (simon128_t)VecLoadAligned(ptr);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk);
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}
inline void SIMON128_Dec_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
    uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
    uint32x4_p &block5, const word64 *subkeys, unsigned int rounds)
{
#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m1 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m2 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m1 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m2 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 A2][B1 B2] ... => [A1 B1][A2 B2] ...
    simon128_t x1 = (simon128_t)VecPermute(block0, block1, m1);
    simon128_t y1 = (simon128_t)VecPermute(block0, block1, m2);
    simon128_t x2 = (simon128_t)VecPermute(block2, block3, m1);
    simon128_t y2 = (simon128_t)VecPermute(block2, block3, m2);
    simon128_t x3 = (simon128_t)VecPermute(block4, block5, m1);
    simon128_t y3 = (simon128_t)VecPermute(block4, block5, m2);

    if (rounds & 1)
    {
        std::swap(x1, y1); std::swap(x2, y2); std::swap(x3, y3);

        const word32* ptr = reinterpret_cast<const word32*>(subkeys+rounds-1);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk = (simon128_t)VecSplatElement64<0>(tk);

        y1 = VecXor64(VecXor64(y1, rk), SIMON128_f(x1));
        y2 = VecXor64(VecXor64(y2, rk), SIMON128_f(x2));
        y3 = VecXor64(VecXor64(y3, rk), SIMON128_f(x3));
        rounds--;
    }

    for (int i = static_cast<int>(rounds-2); i >= 0; i -= 2)
    {
        const word32* ptr = reinterpret_cast<const word32*>(subkeys+i);
        const simon128_t tk = (simon128_t)VecLoad(ptr);
        const simon128_t rk1 = (simon128_t)VecSplatElement64<1>(tk);
        const simon128_t rk2 = (simon128_t)VecSplatElement64<0>(tk);

        x1 = VecXor64(VecXor64(x1, SIMON128_f(y1)), rk1);
        x2 = VecXor64(VecXor64(x2, SIMON128_f(y2)), rk1);
        x3 = VecXor64(VecXor64(x3, SIMON128_f(y3)), rk1);

        y1 = VecXor64(VecXor64(y1, SIMON128_f(x1)), rk2);
        y2 = VecXor64(VecXor64(y2, SIMON128_f(x2)), rk2);
        y3 = VecXor64(VecXor64(y3, SIMON128_f(x3)), rk2);
    }

#if (CRYPTOPP_BIG_ENDIAN)
    const uint8x16_p m3 = {31,30,29,28,27,26,25,24, 15,14,13,12,11,10,9,8};
    const uint8x16_p m4 = {23,22,21,20,19,18,17,16, 7,6,5,4,3,2,1,0};
#else
    const uint8x16_p m3 = {7,6,5,4,3,2,1,0, 23,22,21,20,19,18,17,16};
    const uint8x16_p m4 = {15,14,13,12,11,10,9,8, 31,30,29,28,27,26,25,24};
#endif

    // [A1 B1][A2 B2] ... => [A1 A2][B1 B2] ...
    block0 = (uint32x4_p)VecPermute(x1, y1, m3);
    block1 = (uint32x4_p)VecPermute(x1, y1, m4);
    block2 = (uint32x4_p)VecPermute(x2, y2, m3);
    block3 = (uint32x4_p)VecPermute(x2, y2, m4);
    block4 = (uint32x4_p)VecPermute(x3, y3, m3);
    block5 = (uint32x4_p)VecPermute(x3, y3, m4);
}

#endif // CRYPTOPP_ALTIVEC_AVAILABLE
ANONYMOUS_NAMESPACE_END

NAMESPACE_BEGIN(CryptoPP)

// *************************** ARM NEON ************************** //

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
size_t SIMON128_Enc_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Enc_Block, SIMON128_Enc_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}

size_t SIMON128_Dec_AdvancedProcessBlocks_NEON(const word64* subKeys, size_t rounds,
    const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(SIMON128_Dec_Block, SIMON128_Dec_6_Blocks,
        subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
}
#endif // CRYPTOPP_ARM_NEON_AVAILABLE
836 #if (CRYPTOPP_SSSE3_AVAILABLE)
837 size_t SIMON128_Enc_AdvancedProcessBlocks_SSSE3(
const word64* subKeys,
size_t rounds,
838 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
841 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
844 size_t SIMON128_Dec_AdvancedProcessBlocks_SSSE3(
const word64* subKeys,
size_t rounds,
845 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
848 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
850 #endif // CRYPTOPP_SSSE3_AVAILABLE
854 #if (CRYPTOPP_ALTIVEC_AVAILABLE)
855 size_t SIMON128_Enc_AdvancedProcessBlocks_ALTIVEC(
const word64* subKeys,
size_t rounds,
856 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
859 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
862 size_t SIMON128_Dec_AdvancedProcessBlocks_ALTIVEC(
const word64* subKeys,
size_t rounds,
863 const byte *inBlocks,
const byte *xorBlocks,
byte *outBlocks,
size_t length, word32 flags)
866 subKeys, rounds, inBlocks, xorBlocks, outBlocks, length, flags);
868 #endif // CRYPTOPP_ALTIVEC_AVAILABLE