// Compiler workarounds: old Apple Clang (<= 6.0) and old LLVM Clang (<= 3.5)
// get SSE4 intrinsics support withdrawn; some SSE2-capable toolchains
// (old Sun Studio, old MSVC, 32-bit MSVC) lack _mm_set_epi64x, so an
// emulation is provided below.
// NOTE(review): this chunk is a sparse extraction -- the union declaration
// and return statement of the emulated intrinsic are elided between the
// lines shown; confirm against the full source file.
22 #if (defined(CRYPTOPP_APPLE_CLANG_VERSION) && (CRYPTOPP_APPLE_CLANG_VERSION <= 60000)) || (defined(CRYPTOPP_LLVM_CLANG_VERSION) && (CRYPTOPP_LLVM_CLANG_VERSION <= 30500)) 23 # undef CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 28 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE && ((__SUNPRO_CC >= 0x5100 && __SUNPRO_CC < 0x5130) || (_MSC_VER >= 1200 && _MSC_VER < 1600) || (defined(_M_IX86) && _MSC_VER >= 1600)) 29 inline __m128i _mm_set_epi64x(
const word64 a,
const word64 b)
// a becomes the high 64-bit lane, b the low lane (matching the real intrinsic).
37 val.v64[0] = b; val.v64[1] = a;
// Forward declarations of the SIMD compressors live behind these feature
// tests (their contents are elided in this extraction). The SSE2 path is
// additionally guarded against one specific Sun Studio compiler version.
47 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 49 # if (__SUNPRO_CC != 0x5120) 54 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 59 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 64 #ifndef CRYPTOPP_DOXYGEN_PROCESSING 68 template<
// BLAKE2_IV: initialization vector, selected by word size.
// T_64bit=false -> BLAKE2s (word32), T_64bit=true -> BLAKE2b (word64).
bool T_64bit>
69 struct CRYPTOPP_NO_VTABLE BLAKE2_IV {};
// BLAKE2s IV: first 32 bits of the fractional parts of the square roots
// of the first eight primes (same constants as SHA-256; RFC 7693 §2.6).
73 struct CRYPTOPP_NO_VTABLE BLAKE2_IV<false>
75 CRYPTOPP_CONSTANT(IVSIZE = 8)
77 CRYPTOPP_ALIGN_DATA(16) static const word32 iv[8];
80 CRYPTOPP_ALIGN_DATA(16)
81 const word32 BLAKE2_IV<false>::iv[8] = {
82 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
83 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL
// BLAKE2b IV: 64-bit analogues of the same constants (as in SHA-512).
86 #define BLAKE2S_IV(n) BLAKE2_IV<false>::iv[n] 89 struct CRYPTOPP_NO_VTABLE BLAKE2_IV<true>
91 CRYPTOPP_CONSTANT(IVSIZE = 8)
93 CRYPTOPP_ALIGN_DATA(16) static const word64 iv[8];
96 CRYPTOPP_ALIGN_DATA(16)
97 const word64 BLAKE2_IV<true>::iv[8] = {
98 W64LIT(0x6a09e667f3bcc908), W64LIT(0xbb67ae8584caa73b),
99 W64LIT(0x3c6ef372fe94f82b), W64LIT(0xa54ff53a5f1d36f1),
100 W64LIT(0x510e527fade682d1), W64LIT(0x9b05688c2b3e6c1f),
101 W64LIT(0x1f83d9abfb41bd6b), W64LIT(0x5be0cd19137e2179)
104 #define BLAKE2B_IV(n) BLAKE2_IV<true>::iv[n] 108 template<
// BLAKE2_Sigma: per-round message-word permutation schedule, selected by
// word size (continues the template< opened on the preceding line).
bool T_64bit>
109 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma {};
// BLAKE2s schedule: 10 rounds, each row a permutation of 0..15
// (RFC 7693 §2.7).
112 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<false>
115 CRYPTOPP_ALIGN_DATA(16) static const byte sigma[10][16];
118 CRYPTOPP_ALIGN_DATA(16)
119 const byte BLAKE2_Sigma<false>::sigma[10][16] = {
120 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
121 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
122 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
123 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
124 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
125 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
126 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
127 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
128 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
129 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
// BLAKE2b schedule: 12 rounds; rows 10 and 11 repeat rows 0 and 1
// (RFC 7693 §2.7).
134 struct CRYPTOPP_NO_VTABLE BLAKE2_Sigma<true>
137 CRYPTOPP_ALIGN_DATA(16) static const byte sigma[12][16];
140 CRYPTOPP_ALIGN_DATA(16)
141 const byte BLAKE2_Sigma<true>::sigma[12][16] = {
142 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
143 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
144 { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
145 { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
146 { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
147 { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
148 { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
149 { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
150 { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
151 { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 },
152 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
153 { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
// One-time selection of the BLAKE2b compressor: prefer SSE4, then SSE2
// (skipped when __SUNPRO_CC == 0x5120 -- presumably a workaround for that
// specific Sun Studio release; confirm upstream), then NEON, else the
// portable C++ implementation.
// NOTE(review): the runtime CPU-feature checks between the preprocessor
// guards are elided in this extraction.
159 pfnCompress64 InitializeCompress64Fn()
161 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 163 return &BLAKE2_SSE4_Compress64;
166 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 167 # if (__SUNPRO_CC != 0x5120) 169 return &BLAKE2_SSE2_Compress64;
173 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 175 return &BLAKE2_NEON_Compress64;
178 return &BLAKE2_CXX_Compress64;
// One-time selection of the BLAKE2s compressor, same preference order as
// the 64-bit selector above: SSE4, SSE2, NEON, portable C++.
// NOTE(review): runtime CPU-feature checks elided in this extraction.
181 pfnCompress32 InitializeCompress32Fn()
183 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 185 return &BLAKE2_SSE4_Compress32;
188 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 190 return &BLAKE2_SSE2_Compress32;
193 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 195 return &BLAKE2_NEON_Compress32;
198 return &BLAKE2_CXX_Compress32;
// Parameter-block constructor (signature line elided in this extraction).
// This overload has no `rfu` field -- presumably the BLAKE2s parameter
// block; confirm against the full source. It records digest/key lengths,
// zeroes the tree fields, and copies salt and personalization with the
// unused tails zero-padded.
201 #endif // CRYPTOPP_DOXYGEN_PROCESSING 204 const byte* saltStr,
size_t saltLen,
205 const byte* personalizationStr,
size_t personalizationLen)
208 digestLength = (byte)digestLen;
209 keyLength = (byte)keyLen;
211 nodeDepth = innerLength = 0;
213 memset(leafLength, 0x00,
COUNTOF(leafLength));
214 memset(nodeOffset, 0x00,
COUNTOF(nodeOffset));
// Salt provided: zero-pad the tail past saltLen.
// NOTE(review): the memcpy of the salt itself (original line ~221) is
// elided in this extraction.
216 if (saltStr && saltLen)
219 const size_t rem =
COUNTOF(salt) - saltLen;
220 const size_t off =
COUNTOF(salt) - rem;
222 memset(salt+off, 0x00, rem);
// No salt: zero the whole field.
226 memset(salt, 0x00,
COUNTOF(salt));
// Personalization provided: copy then zero-pad the tail.
229 if (personalizationStr && personalizationLen)
231 memcpy_s(personalization,
COUNTOF(personalization), personalizationStr, personalizationLen);
232 const size_t rem =
COUNTOF(personalization) - personalizationLen;
233 const size_t off =
COUNTOF(personalization) - rem;
235 memset(personalization+off, 0x00, rem);
// No personalization: zero the whole field.
239 memset(personalization, 0x00,
COUNTOF(personalization));
// Parameter-block constructor (signature line elided in this extraction).
// This overload also clears an `rfu` (reserved-for-future-use) field --
// presumably the BLAKE2b parameter block; confirm against the full source.
// Same field handling as the other overload: record lengths, zero tree
// fields, copy salt/personalization with zero-padded tails.
244 const byte* saltStr,
size_t saltLen,
245 const byte* personalizationStr,
size_t personalizationLen)
248 digestLength = (byte)digestLen;
249 keyLength = (byte)keyLen;
251 nodeDepth = innerLength = 0;
253 memset(rfu, 0x00,
COUNTOF(rfu));
254 memset(leafLength, 0x00,
COUNTOF(leafLength));
255 memset(nodeOffset, 0x00,
COUNTOF(nodeOffset));
// Salt provided: zero-pad the tail past saltLen.
// NOTE(review): the memcpy of the salt itself (original line ~262) is
// elided in this extraction.
257 if (saltStr && saltLen)
260 const size_t rem =
COUNTOF(salt) - saltLen;
261 const size_t off =
COUNTOF(salt) - rem;
263 memset(salt+off, 0x00, rem);
// No salt: zero the whole field.
267 memset(salt, 0x00,
COUNTOF(salt));
// Personalization provided: copy then zero-pad the tail.
270 if (personalizationStr && personalizationLen)
272 memcpy_s(personalization,
COUNTOF(personalization), personalizationStr, personalizationLen);
273 const size_t rem =
COUNTOF(personalization) - personalizationLen;
274 const size_t off =
COUNTOF(personalization) - rem;
276 memset(personalization+off, 0x00, rem);
// No personalization: zero the whole field.
280 memset(personalization, 0x00,
COUNTOF(personalization));
284 template <
class W,
bool T_64bit>
// BLAKE2_Base<W,T_64bit>::UncheckedSetKey (signature elided in this
// extraction): stores the key zero-padded to BLOCKSIZE, then fills the
// parameter block from the supplied NameValuePairs.
290 memcpy_s(temp, BLOCKSIZE, key, length);
292 const size_t rem = BLOCKSIZE - length;
294 memset(temp+length, 0x00, rem);
// Coverity builds clear the entire parameter block to silence
// uninitialized-field findings.
303 #if defined(__COVERITY__) 305 ParameterBlock& block = *m_block.data();
306 memset(m_block.data(), 0x00,
sizeof(ParameterBlock));
// Normal builds clear only the leading fixed fields (32 bytes for
// BLAKE2b, 16 for BLAKE2s); salt/personalization are handled below.
309 ParameterBlock& block = *m_block.data();
310 memset(m_block.data(), 0x00, T_64bit ? 32 : 16);
313 block.keyLength = (byte)length;
314 block.digestLength = (byte)params.GetIntValueWithDefault(
Name::DigestSize(), DIGESTSIZE);
315 block.fanout = block.depth = 1;
// Salt from params: zero-pad the unused tail.
// NOTE(review): the params lookup and memcpy of the salt value (original
// lines ~317-321) are elided in this extraction.
322 const size_t off =
COUNTOF(block.salt) - rem;
324 memset(block.salt+off, 0x00, rem);
// No salt supplied: zero the whole field.
328 memset(block.salt, 0x00,
COUNTOF(block.salt));
// Personalization from params (`t` appears to be the retrieved value --
// its declaration is elided here): zero-pad the unused tail.
334 const size_t rem =
COUNTOF(block.personalization) - t.
size();
335 const size_t off =
COUNTOF(block.personalization) - rem;
337 memset(block.personalization+off, 0x00, rem);
// No personalization supplied: zero the whole field.
341 memset(block.personalization, 0x00,
COUNTOF(block.personalization));
// NOTE(review): two BLAKE2_Base member definitions are interleaved here
// with their bodies almost entirely elided by the extraction; only the
// digest-size sanity assert of the second is visible. Confirm both
// against the full source before relying on this span.
345 template <
class W,
bool T_64bit>
352 template <
class W,
bool T_64bit>
// Visible precondition: requested digest must not exceed the algorithm's
// maximum digest size.
355 assert(digestSize <= DIGESTSIZE);
361 template <
class W,
bool T_64bit>
// Keyed BLAKE2_Base constructor (leading parameters -- key, keyLength,
// salt, saltLength -- are elided in this extraction). Allocates one
// state and one parameter block, then sanity-checks every length against
// the algorithm limits.
363 const byte* personalization,
size_t personalizationLength,
bool treeMode,
unsigned int digestSize)
364 : m_state(1), m_block(1), m_digestSize(digestSize), m_treeMode(treeMode)
366 assert(keyLength <= MAX_KEYLENGTH);
367 assert(digestSize <= DIGESTSIZE);
368 assert(saltLength <= SALTSIZE);
369 assert(personalizationLength <= PERSONALIZATIONSIZE);
376 template <
class W,
bool T_64bit>
// Restart(): reset hashing to the stored parameter block with a zeroed
// two-word message counter (delegates to Restart(block, counter)).
379 static const W zero[2] = {0,0};
380 Restart(*m_block.data(), zero);
383 template <
class W,
bool T_64bit>
// Restart(block, counter): reinitialize the hash state from a parameter
// block and an optional 2xW counter (non-NULL counter resumes at a given
// message offset).
// If the caller passed a different block, refresh the stored block's
// digest/key lengths (the copy into m_block is elided in this extraction).
388 if (&block != m_block.data())
391 m_block.data()->digestLength = (byte)m_digestSize;
392 m_block.data()->keyLength = (byte)m_key.size();
395 State& state = *m_state.data();
396 state.t[0] = state.t[1] = 0, state.f[0] = state.f[1] = 0, state.length = 0;
397 state.t[0] = counter[0];
401 state.t[1] = counter[1];
// Seed h[0..7] with the IV; the XOR with the serialized parameter block
// (standard BLAKE2 initialization) is presumably on the elided lines
// around here -- confirm against the full source.
405 put(BLAKE2_IV<T_64bit>::iv[0])(BLAKE2_IV<T_64bit>::iv[1])(BLAKE2_IV<T_64bit>::iv[2])(BLAKE2_IV<T_64bit>::iv[3]);
406 put(BLAKE2_IV<T_64bit>::iv[4])(BLAKE2_IV<T_64bit>::iv[5])(BLAKE2_IV<T_64bit>::iv[6])(BLAKE2_IV<T_64bit>::iv[7]);
// Keyed mode: absorb the zero-padded key block first.
411 Update(m_key, m_key.size());
414 template <
class W,
bool T_64bit>
// Update(): buffer input and compress full blocks, but always retain at
// least one block's worth of data -- BLAKE2 must keep the final block
// uncompressed for finalization.
// NOTE(review): the IncrementCounter calls between compressions are on
// lines elided by this extraction.
417 State& state = *m_state.data();
418 if (state.length + length > BLOCKSIZE)
// Top up and flush the existing partial buffer first.
421 const size_t fill = BLOCKSIZE - state.length;
422 memcpy_s(&state.buffer[state.length], fill, input, fill);
425 Compress(state.buffer);
428 length -= fill, input += fill;
// Strictly greater-than: the last full block is left for Final.
431 while (length > BLOCKSIZE)
435 length -= BLOCKSIZE, input += BLOCKSIZE;
// Stash whatever remains (possibly exactly one full block).
442 assert(length <= BLOCKSIZE - state.length);
443 memcpy_s(&state.buffer[state.length], length, input, length);
444 state.length +=
static_cast<unsigned int>(length);
448 template <
class W,
bool T_64bit>
// TruncatedFinal(): validate the requested size, set the finalization
// flags, zero-pad and compress the last buffered block, then copy the
// first `size` bytes of the chained state to `hash`.
451 this->ThrowIfInvalidTruncatedSize(size);
// f[0] = ~0 marks the final block. f[1] is set as well under a condition
// whose line is elided here -- presumably the tree-mode last-node flag;
// confirm against the full source.
454 State& state = *m_state.data();
455 state.f[0] =
static_cast<W
>(-1);
459 state.f[1] =
static_cast<W
>(-1);
462 IncrementCounter(state.length);
464 memset(state.buffer + state.length, 0x00, BLOCKSIZE - state.length);
465 Compress(state.buffer);
// Output is taken directly from state.h; any little-endian conversion
// step is on lines elided by this extraction.
468 memcpy_s(hash, size, &state.h[0], size);
473 template <
class W,
bool T_64bit>
// IncrementCounter(): add `count` to the two-word (2xW) message-byte
// counter, carrying into t[1] when t[0] wraps.
476 State& state = *m_state.data();
477 state.t[0] +=
static_cast<W
>(count);
478 state.t[1] += !!(state.t[0] < count);
// Compress<word64>/Compress<word32> specializations (signatures elided):
// the best available compressor is selected once per process via a
// function-local static and cached thereafter.
485 static const pfnCompress64 s_pfn = InitializeCompress64Fn();
486 s_pfn(input, *m_state.data());
493 static const pfnCompress32 s_pfn = InitializeCompress32Fn();
494 s_pfn(input, *m_state.data());
// Portable BLAKE2b compressor. BLAKE2_G is the quarter-round mix with the
// 64-bit rotation amounts 32/24/16/63 (RFC 7693 §3.1); BLAKE2_ROUND
// applies it to the four columns then the four diagonals of the 4x4 work
// matrix v. The load of m[] from the input block follows.
502 #define BLAKE2_G(r,i,a,b,c,d) \ 504 a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+0]]; \ 505 d = rotrVariable<word64>(d ^ a, 32); \ 507 b = rotrVariable<word64>(b ^ c, 24); \ 508 a = a + b + m[BLAKE2_Sigma<true>::sigma[r][2*i+1]]; \ 509 d = rotrVariable<word64>(d ^ a, 16); \ 511 b = rotrVariable<word64>(b ^ c, 63); \ 514 #define BLAKE2_ROUND(r) \ 516 BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ 517 BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ 518 BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \ 519 BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \ 520 BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \ 521 BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \ 522 BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ 523 BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ 529 get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
// v[0..7] = chained state h; v[8..15] = IV with the counter t folded into
// v[12..13] and the finalization flags f into v[14..15].
532 get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
534 v[ 8] = BLAKE2B_IV(0);
535 v[ 9] = BLAKE2B_IV(1);
536 v[10] = BLAKE2B_IV(2);
537 v[11] = BLAKE2B_IV(3);
538 v[12] = state.t[0] ^ BLAKE2B_IV(4);
539 v[13] = state.t[1] ^ BLAKE2B_IV(5);
540 v[14] = state.f[0] ^ BLAKE2B_IV(6);
541 v[15] = state.f[1] ^ BLAKE2B_IV(7);
// The 12 BLAKE2_ROUND invocations (matching the 12-row sigma schedule)
// are elided in this extraction. Final feedback: h[i] ^= v[i] ^ v[i+8].
556 for(
unsigned int i = 0; i < 8; ++i)
// Portable BLAKE2s compressor. Identical structure to the 64-bit version
// but on word32 with rotation amounts 16/12/8/7 (RFC 7693 §3.1). The
// macros are redefined here for the 32-bit sigma table and word size.
565 #define BLAKE2_G(r,i,a,b,c,d) \ 567 a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+0]]; \ 568 d = rotrVariable<word32>(d ^ a, 16); \ 570 b = rotrVariable<word32>(b ^ c, 12); \ 571 a = a + b + m[BLAKE2_Sigma<false>::sigma[r][2*i+1]]; \ 572 d = rotrVariable<word32>(d ^ a, 8); \ 574 b = rotrVariable<word32>(b ^ c, 7); \ 577 #define BLAKE2_ROUND(r) \ 579 BLAKE2_G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \ 580 BLAKE2_G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \ 581 BLAKE2_G(r,2,v[ 2],v[ 6],v[10],v[14]); \ 582 BLAKE2_G(r,3,v[ 3],v[ 7],v[11],v[15]); \ 583 BLAKE2_G(r,4,v[ 0],v[ 5],v[10],v[15]); \ 584 BLAKE2_G(r,5,v[ 1],v[ 6],v[11],v[12]); \ 585 BLAKE2_G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \ 586 BLAKE2_G(r,7,v[ 3],v[ 4],v[ 9],v[14]); \ 592 get1(m[0])(m[1])(m[2])(m[3])(m[4])(m[5])(m[6])(m[7])(m[8])(m[9])(m[10])(m[11])(m[12])(m[13])(m[14])(m[15]);
// v[0..7] = chained state h; v[8..15] = IV with counter/flags folded in.
595 get2(v[0])(v[1])(v[2])(v[3])(v[4])(v[5])(v[6])(v[7]);
597 v[ 8] = BLAKE2S_IV(0);
598 v[ 9] = BLAKE2S_IV(1);
599 v[10] = BLAKE2S_IV(2);
600 v[11] = BLAKE2S_IV(3);
601 v[12] = state.t[0] ^ BLAKE2S_IV(4);
602 v[13] = state.t[1] ^ BLAKE2S_IV(5);
603 v[14] = state.f[0] ^ BLAKE2S_IV(6);
604 v[15] = state.f[1] ^ BLAKE2S_IV(7);
// The 10 BLAKE2_ROUND invocations (10-row sigma schedule) are elided in
// this extraction. Final feedback: h[i] ^= v[i] ^ v[i+8].
617 for(
unsigned int i = 0; i < 8; ++i)
// BLAKE2s compression with SSE2 intrinsics. The 4x4 word32 work matrix is
// held in four 128-bit registers (row1..row4). Each round mixes columns,
// rotates row2/row3/row4 with _mm_shuffle_epi32 to align the diagonals,
// mixes again, and rotates back. Rotations of the lanes are emulated as
// srli^slli pairs (16/12/8/7 bits); message words are gathered per round
// with _mm_set_epi32 following BLAKE2_Sigma<false>::sigma rows 0..9.
// NOTE(review): the function signature (original lines ~622-623) and the
// closing brace/#endif are elided in this extraction.
621 #if CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 624 word32 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
626 get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
628 __m128i row1,row2,row3,row4;
629 __m128i buf1,buf2,buf3,buf4;
// Load h into row1/row2 (also saved in ff0/ff1 for the final feedback,
// declared on elided lines); row3 = IV[0..3]; row4 = IV[4..7] XOR the
// counter words t[0..1] (flags f presumably follow t in memory -- the
// single load covers the whole lane; confirm State layout upstream).
632 row1 = ff0 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0]));
633 row2 = ff1 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4]));
634 row3 = _mm_setr_epi32(BLAKE2S_IV(0),BLAKE2S_IV(1),BLAKE2S_IV(2),BLAKE2S_IV(3));
635 row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4),BLAKE2S_IV(5),BLAKE2S_IV(6),BLAKE2S_IV(7)),_mm_loadu_si128((
const __m128i*)(
const void*)(&state.t[0])));
// Round 0 (sigma row 0)
636 buf1 = _mm_set_epi32(m6,m4,m2,m0);
637 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
638 row4 = _mm_xor_si128(row4,row1);
639 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
640 row3 = _mm_add_epi32(row3,row4);
641 row2 = _mm_xor_si128(row2,row3);
642 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
644 buf2 = _mm_set_epi32(m7,m5,m3,m1);
645 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
646 row4 = _mm_xor_si128(row4,row1);
647 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
648 row3 = _mm_add_epi32(row3,row4);
649 row2 = _mm_xor_si128(row2,row3);
650 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
652 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
653 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
654 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
656 buf3 = _mm_set_epi32(m14,m12,m10,m8);
657 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
658 row4 = _mm_xor_si128(row4,row1);
659 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
660 row3 = _mm_add_epi32(row3,row4);
661 row2 = _mm_xor_si128(row2,row3);
662 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
664 buf4 = _mm_set_epi32(m15,m13,m11,m9);
665 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
666 row4 = _mm_xor_si128(row4,row1);
667 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
668 row3 = _mm_add_epi32(row3,row4);
669 row2 = _mm_xor_si128(row2,row3);
670 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
672 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
673 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
674 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 1 (sigma row 1)
676 buf1 = _mm_set_epi32(m13,m9,m4,m14);
677 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
678 row4 = _mm_xor_si128(row4,row1);
679 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
680 row3 = _mm_add_epi32(row3,row4);
681 row2 = _mm_xor_si128(row2,row3);
682 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
684 buf2 = _mm_set_epi32(m6,m15,m8,m10);
685 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
686 row4 = _mm_xor_si128(row4,row1);
687 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
688 row3 = _mm_add_epi32(row3,row4);
689 row2 = _mm_xor_si128(row2,row3);
690 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
692 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
693 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
694 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
696 buf3 = _mm_set_epi32(m5,m11,m0,m1);
697 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
698 row4 = _mm_xor_si128(row4,row1);
699 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
700 row3 = _mm_add_epi32(row3,row4);
701 row2 = _mm_xor_si128(row2,row3);
702 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
704 buf4 = _mm_set_epi32(m3,m7,m2,m12);
705 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
706 row4 = _mm_xor_si128(row4,row1);
707 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
708 row3 = _mm_add_epi32(row3,row4);
709 row2 = _mm_xor_si128(row2,row3);
710 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
712 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
713 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
714 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 2 (sigma row 2)
716 buf1 = _mm_set_epi32(m15,m5,m12,m11);
717 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
718 row4 = _mm_xor_si128(row4,row1);
719 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
720 row3 = _mm_add_epi32(row3,row4);
721 row2 = _mm_xor_si128(row2,row3);
722 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
724 buf2 = _mm_set_epi32(m13,m2,m0,m8);
725 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
726 row4 = _mm_xor_si128(row4,row1);
727 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
728 row3 = _mm_add_epi32(row3,row4);
729 row2 = _mm_xor_si128(row2,row3);
730 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
732 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
733 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
734 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
736 buf3 = _mm_set_epi32(m9,m7,m3,m10);
737 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
738 row4 = _mm_xor_si128(row4,row1);
739 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
740 row3 = _mm_add_epi32(row3,row4);
741 row2 = _mm_xor_si128(row2,row3);
742 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
744 buf4 = _mm_set_epi32(m4,m1,m6,m14);
745 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
746 row4 = _mm_xor_si128(row4,row1);
747 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
748 row3 = _mm_add_epi32(row3,row4);
749 row2 = _mm_xor_si128(row2,row3);
750 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
752 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
753 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
754 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 3 (sigma row 3)
756 buf1 = _mm_set_epi32(m11,m13,m3,m7);
757 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
758 row4 = _mm_xor_si128(row4,row1);
759 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
760 row3 = _mm_add_epi32(row3,row4);
761 row2 = _mm_xor_si128(row2,row3);
762 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
764 buf2 = _mm_set_epi32(m14,m12,m1,m9);
765 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
766 row4 = _mm_xor_si128(row4,row1);
767 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
768 row3 = _mm_add_epi32(row3,row4);
769 row2 = _mm_xor_si128(row2,row3);
770 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
772 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
773 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
774 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
776 buf3 = _mm_set_epi32(m15,m4,m5,m2);
777 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
778 row4 = _mm_xor_si128(row4,row1);
779 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
780 row3 = _mm_add_epi32(row3,row4);
781 row2 = _mm_xor_si128(row2,row3);
782 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
784 buf4 = _mm_set_epi32(m8,m0,m10,m6);
785 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
786 row4 = _mm_xor_si128(row4,row1);
787 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
788 row3 = _mm_add_epi32(row3,row4);
789 row2 = _mm_xor_si128(row2,row3);
790 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
792 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
793 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
794 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 4 (sigma row 4)
796 buf1 = _mm_set_epi32(m10,m2,m5,m9);
797 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
798 row4 = _mm_xor_si128(row4,row1);
799 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
800 row3 = _mm_add_epi32(row3,row4);
801 row2 = _mm_xor_si128(row2,row3);
802 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
804 buf2 = _mm_set_epi32(m15,m4,m7,m0);
805 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
806 row4 = _mm_xor_si128(row4,row1);
807 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
808 row3 = _mm_add_epi32(row3,row4);
809 row2 = _mm_xor_si128(row2,row3);
810 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
812 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
813 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
814 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
816 buf3 = _mm_set_epi32(m3,m6,m11,m14);
817 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
818 row4 = _mm_xor_si128(row4,row1);
819 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
820 row3 = _mm_add_epi32(row3,row4);
821 row2 = _mm_xor_si128(row2,row3);
822 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
824 buf4 = _mm_set_epi32(m13,m8,m12,m1);
825 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
826 row4 = _mm_xor_si128(row4,row1);
827 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
828 row3 = _mm_add_epi32(row3,row4);
829 row2 = _mm_xor_si128(row2,row3);
830 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
832 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
833 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
834 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 5 (sigma row 5)
836 buf1 = _mm_set_epi32(m8,m0,m6,m2);
837 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
838 row4 = _mm_xor_si128(row4,row1);
839 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
840 row3 = _mm_add_epi32(row3,row4);
841 row2 = _mm_xor_si128(row2,row3);
842 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
844 buf2 = _mm_set_epi32(m3,m11,m10,m12);
845 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
846 row4 = _mm_xor_si128(row4,row1);
847 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
848 row3 = _mm_add_epi32(row3,row4);
849 row2 = _mm_xor_si128(row2,row3);
850 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
852 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
853 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
854 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
856 buf3 = _mm_set_epi32(m1,m15,m7,m4);
857 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
858 row4 = _mm_xor_si128(row4,row1);
859 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
860 row3 = _mm_add_epi32(row3,row4);
861 row2 = _mm_xor_si128(row2,row3);
862 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
864 buf4 = _mm_set_epi32(m9,m14,m5,m13);
865 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
866 row4 = _mm_xor_si128(row4,row1);
867 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
868 row3 = _mm_add_epi32(row3,row4);
869 row2 = _mm_xor_si128(row2,row3);
870 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
872 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
873 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
874 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 6 (sigma row 6)
876 buf1 = _mm_set_epi32(m4,m14,m1,m12);
877 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
878 row4 = _mm_xor_si128(row4,row1);
879 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
880 row3 = _mm_add_epi32(row3,row4);
881 row2 = _mm_xor_si128(row2,row3);
882 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
884 buf2 = _mm_set_epi32(m10,m13,m15,m5);
885 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
886 row4 = _mm_xor_si128(row4,row1);
887 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
888 row3 = _mm_add_epi32(row3,row4);
889 row2 = _mm_xor_si128(row2,row3);
890 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
892 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
893 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
894 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
896 buf3 = _mm_set_epi32(m8,m9,m6,m0);
897 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
898 row4 = _mm_xor_si128(row4,row1);
899 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
900 row3 = _mm_add_epi32(row3,row4);
901 row2 = _mm_xor_si128(row2,row3);
902 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
904 buf4 = _mm_set_epi32(m11,m2,m3,m7);
905 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
906 row4 = _mm_xor_si128(row4,row1);
907 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
908 row3 = _mm_add_epi32(row3,row4);
909 row2 = _mm_xor_si128(row2,row3);
910 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
912 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
913 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
914 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 7 (sigma row 7)
916 buf1 = _mm_set_epi32(m3,m12,m7,m13);
917 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
918 row4 = _mm_xor_si128(row4,row1);
919 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
920 row3 = _mm_add_epi32(row3,row4);
921 row2 = _mm_xor_si128(row2,row3);
922 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
924 buf2 = _mm_set_epi32(m9,m1,m14,m11);
925 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
926 row4 = _mm_xor_si128(row4,row1);
927 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
928 row3 = _mm_add_epi32(row3,row4);
929 row2 = _mm_xor_si128(row2,row3);
930 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
932 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
933 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
934 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
936 buf3 = _mm_set_epi32(m2,m8,m15,m5);
937 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
938 row4 = _mm_xor_si128(row4,row1);
939 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
940 row3 = _mm_add_epi32(row3,row4);
941 row2 = _mm_xor_si128(row2,row3);
942 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
944 buf4 = _mm_set_epi32(m10,m6,m4,m0);
945 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
946 row4 = _mm_xor_si128(row4,row1);
947 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
948 row3 = _mm_add_epi32(row3,row4);
949 row2 = _mm_xor_si128(row2,row3);
950 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
952 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
953 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
954 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 8 (sigma row 8)
956 buf1 = _mm_set_epi32(m0,m11,m14,m6);
957 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
958 row4 = _mm_xor_si128(row4,row1);
959 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
960 row3 = _mm_add_epi32(row3,row4);
961 row2 = _mm_xor_si128(row2,row3);
962 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
964 buf2 = _mm_set_epi32(m8,m3,m9,m15);
965 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
966 row4 = _mm_xor_si128(row4,row1);
967 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
968 row3 = _mm_add_epi32(row3,row4);
969 row2 = _mm_xor_si128(row2,row3);
970 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
972 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
973 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
974 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
976 buf3 = _mm_set_epi32(m10,m1,m13,m12);
977 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
978 row4 = _mm_xor_si128(row4,row1);
979 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
980 row3 = _mm_add_epi32(row3,row4);
981 row2 = _mm_xor_si128(row2,row3);
982 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
984 buf4 = _mm_set_epi32(m5,m4,m7,m2);
985 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
986 row4 = _mm_xor_si128(row4,row1);
987 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
988 row3 = _mm_add_epi32(row3,row4);
989 row2 = _mm_xor_si128(row2,row3);
990 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
992 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
993 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
994 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Round 9 (sigma row 9)
996 buf1 = _mm_set_epi32(m1,m7,m8,m10);
997 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf1),row2);
998 row4 = _mm_xor_si128(row4,row1);
999 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
1000 row3 = _mm_add_epi32(row3,row4);
1001 row2 = _mm_xor_si128(row2,row3);
1002 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
1004 buf2 = _mm_set_epi32(m5,m6,m4,m2);
1005 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf2),row2);
1006 row4 = _mm_xor_si128(row4,row1);
1007 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1008 row3 = _mm_add_epi32(row3,row4);
1009 row2 = _mm_xor_si128(row2,row3);
1010 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1012 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(2,1,0,3));
1013 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1014 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(0,3,2,1));
1016 buf3 = _mm_set_epi32(m13,m3,m9,m15);
1017 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf3),row2);
1018 row4 = _mm_xor_si128(row4,row1);
1019 row4 = _mm_xor_si128(_mm_srli_epi32(row4,16),_mm_slli_epi32(row4,16));
1020 row3 = _mm_add_epi32(row3,row4);
1021 row2 = _mm_xor_si128(row2,row3);
1022 row2 = _mm_xor_si128(_mm_srli_epi32(row2,12),_mm_slli_epi32(row2,20));
1024 buf4 = _mm_set_epi32(m0,m12,m14,m11);
1025 row1 = _mm_add_epi32(_mm_add_epi32(row1,buf4),row2);
1026 row4 = _mm_xor_si128(row4,row1);
1027 row4 = _mm_xor_si128(_mm_srli_epi32(row4,8),_mm_slli_epi32(row4,24));
1028 row3 = _mm_add_epi32(row3,row4);
1029 row2 = _mm_xor_si128(row2,row3);
1030 row2 = _mm_xor_si128(_mm_srli_epi32(row2,7),_mm_slli_epi32(row2,25));
1032 row4 = _mm_shuffle_epi32(row4,_MM_SHUFFLE(0,3,2,1));
1033 row3 = _mm_shuffle_epi32(row3,_MM_SHUFFLE(1,0,3,2));
1034 row2 = _mm_shuffle_epi32(row2,_MM_SHUFFLE(2,1,0,3));
// Feedback into the chained state: h[0..3] ^= row1 ^ row3,
// h[4..7] ^= row2 ^ row4 (equivalent to the scalar h[i] ^= v[i] ^ v[i+8]).
1036 _mm_storeu_si128((__m128i *)(
void*)(&state.h[0]),_mm_xor_si128(ff0,_mm_xor_si128(row1,row3)));
1037 _mm_storeu_si128((__m128i *)(
void*)(&state.h[4]),_mm_xor_si128(ff1,_mm_xor_si128(row2,row4)));
1040 # if (__SUNPRO_CC != 0x5120) 1043 word64 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15;
1045 get(m0)(m1)(m2)(m3)(m4)(m5)(m6)(m7)(m8)(m9)(m10)(m11)(m12)(m13)(m14)(m15);
1047 __m128i row1l, row1h, row2l, row2h;
1048 __m128i row3l, row3h, row4l, row4h;
1049 __m128i b0, b1, t0, t1;
1051 row1l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0]));
1052 row1h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[2]));
1053 row2l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4]));
1054 row2h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[6]));
1055 row3l = _mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(0)));
1056 row3h = _mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(2)));
1057 row4l = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.t[0])));
1058 row4h = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.f[0])));
1060 b0 = _mm_set_epi64x(m2, m0);
1061 b1 = _mm_set_epi64x(m6, m4);
1062 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1063 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1064 row4l = _mm_xor_si128(row4l, row1l);
1065 row4h = _mm_xor_si128(row4h, row1h);
1066 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1067 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1068 row3l = _mm_add_epi64(row3l, row4l);
1069 row3h = _mm_add_epi64(row3h, row4h);
1070 row2l = _mm_xor_si128(row2l, row3l);
1071 row2h = _mm_xor_si128(row2h, row3h);
1072 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l, 40));
1073 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h, 40));
1075 b0 = _mm_set_epi64x(m3, m1);
1076 b1 = _mm_set_epi64x(m7, m5);
1077 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1078 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1079 row4l = _mm_xor_si128(row4l, row1l);
1080 row4h = _mm_xor_si128(row4h, row1h);
1081 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1082 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1083 row3l = _mm_add_epi64(row3l, row4l);
1084 row3h = _mm_add_epi64(row3h, row4h);
1085 row2l = _mm_xor_si128(row2l, row3l);
1086 row2h = _mm_xor_si128(row2h, row3h);
1087 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1088 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1090 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1091 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1092 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1093 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1094 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1096 b0 = _mm_set_epi64x(m10, m8);
1097 b1 = _mm_set_epi64x(m14, m12);
1098 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1099 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1100 row4l = _mm_xor_si128(row4l, row1l);
1101 row4h = _mm_xor_si128(row4h, row1h);
1102 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1103 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1104 row3l = _mm_add_epi64(row3l, row4l);
1105 row3h = _mm_add_epi64(row3h, row4h);
1106 row2l = _mm_xor_si128(row2l, row3l);
1107 row2h = _mm_xor_si128(row2h, row3h);
1108 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1109 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1111 b0 = _mm_set_epi64x(m11, m9);
1112 b1 = _mm_set_epi64x(m15, m13);
1113 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1114 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1115 row4l = _mm_xor_si128(row4l, row1l);
1116 row4h = _mm_xor_si128(row4h, row1h);
1117 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1118 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1119 row3l = _mm_add_epi64(row3l, row4l);
1120 row3h = _mm_add_epi64(row3h, row4h);
1121 row2l = _mm_xor_si128(row2l, row3l);
1122 row2h = _mm_xor_si128(row2h, row3h);
1123 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1124 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1126 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1127 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1128 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1129 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1130 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1132 b0 = _mm_set_epi64x(m4, m14);
1133 b1 = _mm_set_epi64x(m13, m9);
1134 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1135 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1136 row4l = _mm_xor_si128(row4l, row1l);
1137 row4h = _mm_xor_si128(row4h, row1h);
1138 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1139 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1140 row3l = _mm_add_epi64(row3l, row4l);
1141 row3h = _mm_add_epi64(row3h, row4h);
1142 row2l = _mm_xor_si128(row2l, row3l);
1143 row2h = _mm_xor_si128(row2h, row3h);
1144 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1145 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1147 b0 = _mm_set_epi64x(m8, m10);
1148 b1 = _mm_set_epi64x(m6, m15);
1149 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1150 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1151 row4l = _mm_xor_si128(row4l, row1l);
1152 row4h = _mm_xor_si128(row4h, row1h);
1153 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1154 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1155 row3l = _mm_add_epi64(row3l, row4l);
1156 row3h = _mm_add_epi64(row3h, row4h);
1157 row2l = _mm_xor_si128(row2l, row3l);
1158 row2h = _mm_xor_si128(row2h, row3h);
1159 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1160 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1162 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1163 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1164 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1165 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1166 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1167 b0 = _mm_set_epi64x(m0, m1);
1168 b1 = _mm_set_epi64x(m5, m11);
1169 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1170 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1171 row4l = _mm_xor_si128(row4l, row1l);
1172 row4h = _mm_xor_si128(row4h, row1h);
1173 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1174 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1175 row3l = _mm_add_epi64(row3l, row4l);
1176 row3h = _mm_add_epi64(row3h, row4h);
1177 row2l = _mm_xor_si128(row2l, row3l);
1178 row2h = _mm_xor_si128(row2h, row3h);
1179 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1180 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1182 b0 = _mm_set_epi64x(m2, m12);
1183 b1 = _mm_set_epi64x(m3, m7);
1184 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1185 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1186 row4l = _mm_xor_si128(row4l, row1l);
1187 row4h = _mm_xor_si128(row4h, row1h);
1188 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1189 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1190 row3l = _mm_add_epi64(row3l, row4l);
1191 row3h = _mm_add_epi64(row3h, row4h);
1192 row2l = _mm_xor_si128(row2l, row3l);
1193 row2h = _mm_xor_si128(row2h, row3h);
1194 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1195 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1197 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1198 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1199 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1200 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1201 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1203 b0 = _mm_set_epi64x(m12, m11);
1204 b1 = _mm_set_epi64x(m15, m5);
1205 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1206 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1207 row4l = _mm_xor_si128(row4l, row1l);
1208 row4h = _mm_xor_si128(row4h, row1h);
1209 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1210 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1211 row3l = _mm_add_epi64(row3l, row4l);
1212 row3h = _mm_add_epi64(row3h, row4h);
1213 row2l = _mm_xor_si128(row2l, row3l);
1214 row2h = _mm_xor_si128(row2h, row3h);
1215 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1216 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1218 b0 = _mm_set_epi64x(m0, m8);
1219 b1 = _mm_set_epi64x(m13, m2);
1220 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1221 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1222 row4l = _mm_xor_si128(row4l, row1l);
1223 row4h = _mm_xor_si128(row4h, row1h);
1224 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1225 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1226 row3l = _mm_add_epi64(row3l, row4l);
1227 row3h = _mm_add_epi64(row3h, row4h);
1228 row2l = _mm_xor_si128(row2l, row3l);
1229 row2h = _mm_xor_si128(row2h, row3h);
1230 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1231 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1233 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1234 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1235 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1236 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1237 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1238 b0 = _mm_set_epi64x(m3, m10);
1239 b1 = _mm_set_epi64x(m9, m7);
1240 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1241 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1242 row4l = _mm_xor_si128(row4l, row1l);
1243 row4h = _mm_xor_si128(row4h, row1h);
1244 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1245 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1246 row3l = _mm_add_epi64(row3l, row4l);
1247 row3h = _mm_add_epi64(row3h, row4h);
1248 row2l = _mm_xor_si128(row2l, row3l);
1249 row2h = _mm_xor_si128(row2h, row3h);
1250 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1251 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1253 b0 = _mm_set_epi64x(m6, m14);
1254 b1 = _mm_set_epi64x(m4, m1);
1255 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1256 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1257 row4l = _mm_xor_si128(row4l, row1l);
1258 row4h = _mm_xor_si128(row4h, row1h);
1259 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1260 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1261 row3l = _mm_add_epi64(row3l, row4l);
1262 row3h = _mm_add_epi64(row3h, row4h);
1263 row2l = _mm_xor_si128(row2l, row3l);
1264 row2h = _mm_xor_si128(row2h, row3h);
1265 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1266 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1268 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1269 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1270 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1271 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1272 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1274 b0 = _mm_set_epi64x(m3, m7);
1275 b1 = _mm_set_epi64x(m11, m13);
1276 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1277 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1278 row4l = _mm_xor_si128(row4l, row1l);
1279 row4h = _mm_xor_si128(row4h, row1h);
1280 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1281 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1282 row3l = _mm_add_epi64(row3l, row4l);
1283 row3h = _mm_add_epi64(row3h, row4h);
1284 row2l = _mm_xor_si128(row2l, row3l);
1285 row2h = _mm_xor_si128(row2h, row3h);
1286 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1287 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1289 b0 = _mm_set_epi64x(m1, m9);
1290 b1 = _mm_set_epi64x(m14, m12);
1291 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1292 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1293 row4l = _mm_xor_si128(row4l, row1l);
1294 row4h = _mm_xor_si128(row4h, row1h);
1295 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1296 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1297 row3l = _mm_add_epi64(row3l, row4l);
1298 row3h = _mm_add_epi64(row3h, row4h);
1299 row2l = _mm_xor_si128(row2l, row3l);
1300 row2h = _mm_xor_si128(row2h, row3h);
1301 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1302 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1304 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1305 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1306 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1307 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1308 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1309 b0 = _mm_set_epi64x(m5, m2);
1310 b1 = _mm_set_epi64x(m15, m4);
1311 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1312 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1313 row4l = _mm_xor_si128(row4l, row1l);
1314 row4h = _mm_xor_si128(row4h, row1h);
1315 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1316 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1317 row3l = _mm_add_epi64(row3l, row4l);
1318 row3h = _mm_add_epi64(row3h, row4h);
1319 row2l = _mm_xor_si128(row2l, row3l);
1320 row2h = _mm_xor_si128(row2h, row3h);
1321 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1322 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1324 b0 = _mm_set_epi64x(m10, m6);
1325 b1 = _mm_set_epi64x(m8, m0);
1326 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1327 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1328 row4l = _mm_xor_si128(row4l, row1l);
1329 row4h = _mm_xor_si128(row4h, row1h);
1330 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1331 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1332 row3l = _mm_add_epi64(row3l, row4l);
1333 row3h = _mm_add_epi64(row3h, row4h);
1334 row2l = _mm_xor_si128(row2l, row3l);
1335 row2h = _mm_xor_si128(row2h, row3h);
1336 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1337 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1339 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1340 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1341 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1342 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1343 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1345 b0 = _mm_set_epi64x(m5, m9);
1346 b1 = _mm_set_epi64x(m10, m2);
1347 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1348 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1349 row4l = _mm_xor_si128(row4l, row1l);
1350 row4h = _mm_xor_si128(row4h, row1h);
1351 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1352 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1353 row3l = _mm_add_epi64(row3l, row4l);
1354 row3h = _mm_add_epi64(row3h, row4h);
1355 row2l = _mm_xor_si128(row2l, row3l);
1356 row2h = _mm_xor_si128(row2h, row3h);
1357 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1358 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1360 b0 = _mm_set_epi64x(m7, m0);
1361 b1 = _mm_set_epi64x(m15, m4);
1362 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1363 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1364 row4l = _mm_xor_si128(row4l, row1l);
1365 row4h = _mm_xor_si128(row4h, row1h);
1366 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1367 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1368 row3l = _mm_add_epi64(row3l, row4l);
1369 row3h = _mm_add_epi64(row3h, row4h);
1370 row2l = _mm_xor_si128(row2l, row3l);
1371 row2h = _mm_xor_si128(row2h, row3h);
1372 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1373 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1375 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1376 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1377 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1378 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1379 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1380 b0 = _mm_set_epi64x(m11, m14);
1381 b1 = _mm_set_epi64x(m3, m6);
1382 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1383 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1384 row4l = _mm_xor_si128(row4l, row1l);
1385 row4h = _mm_xor_si128(row4h, row1h);
1386 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1387 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1388 row3l = _mm_add_epi64(row3l, row4l);
1389 row3h = _mm_add_epi64(row3h, row4h);
1390 row2l = _mm_xor_si128(row2l, row3l);
1391 row2h = _mm_xor_si128(row2h, row3h);
1392 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1393 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1396 b0 = _mm_set_epi64x(m12, m1);
1397 b1 = _mm_set_epi64x(m13, m8);
1398 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1399 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1400 row4l = _mm_xor_si128(row4l, row1l);
1401 row4h = _mm_xor_si128(row4h, row1h);
1402 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1403 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1404 row3l = _mm_add_epi64(row3l, row4l);
1405 row3h = _mm_add_epi64(row3h, row4h);
1406 row2l = _mm_xor_si128(row2l, row3l);
1407 row2h = _mm_xor_si128(row2h, row3h);
1408 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1409 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1411 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1412 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1413 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1414 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1415 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1417 b0 = _mm_set_epi64x(m6, m2);
1418 b1 = _mm_set_epi64x(m8, m0);
1419 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1420 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1421 row4l = _mm_xor_si128(row4l, row1l);
1422 row4h = _mm_xor_si128(row4h, row1h);
1423 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1424 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1425 row3l = _mm_add_epi64(row3l, row4l);
1426 row3h = _mm_add_epi64(row3h, row4h);
1427 row2l = _mm_xor_si128(row2l, row3l);
1428 row2h = _mm_xor_si128(row2h, row3h);
1429 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1430 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1432 b0 = _mm_set_epi64x(m10, m12);
1433 b1 = _mm_set_epi64x(m3, m11);
1434 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1435 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1436 row4l = _mm_xor_si128(row4l, row1l);
1437 row4h = _mm_xor_si128(row4h, row1h);
1438 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1439 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1440 row3l = _mm_add_epi64(row3l, row4l);
1441 row3h = _mm_add_epi64(row3h, row4h);
1442 row2l = _mm_xor_si128(row2l, row3l);
1443 row2h = _mm_xor_si128(row2h, row3h);
1444 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1445 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1447 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1448 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1449 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1450 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1451 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1452 b0 = _mm_set_epi64x(m7, m4);
1453 b1 = _mm_set_epi64x(m1, m15);
1454 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1455 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1456 row4l = _mm_xor_si128(row4l, row1l);
1457 row4h = _mm_xor_si128(row4h, row1h);
1458 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1459 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1460 row3l = _mm_add_epi64(row3l, row4l);
1461 row3h = _mm_add_epi64(row3h, row4h);
1462 row2l = _mm_xor_si128(row2l, row3l);
1463 row2h = _mm_xor_si128(row2h, row3h);
1464 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1465 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1467 b0 = _mm_set_epi64x(m5, m13);
1468 b1 = _mm_set_epi64x(m9, m14);
1469 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1470 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1471 row4l = _mm_xor_si128(row4l, row1l);
1472 row4h = _mm_xor_si128(row4h, row1h);
1473 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1474 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1475 row3l = _mm_add_epi64(row3l, row4l);
1476 row3h = _mm_add_epi64(row3h, row4h);
1477 row2l = _mm_xor_si128(row2l, row3l);
1478 row2h = _mm_xor_si128(row2h, row3h);
1479 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1480 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1482 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1483 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1484 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1485 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1486 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1488 b0 = _mm_set_epi64x(m1, m12);
1489 b1 = _mm_set_epi64x(m4, m14);
1490 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1491 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1492 row4l = _mm_xor_si128(row4l, row1l);
1493 row4h = _mm_xor_si128(row4h, row1h);
1494 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1495 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1496 row3l = _mm_add_epi64(row3l, row4l);
1497 row3h = _mm_add_epi64(row3h, row4h);
1498 row2l = _mm_xor_si128(row2l, row3l);
1499 row2h = _mm_xor_si128(row2h, row3h);
1500 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1501 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1503 b0 = _mm_set_epi64x(m15, m5);
1504 b1 = _mm_set_epi64x(m10, m13);
1505 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1506 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1507 row4l = _mm_xor_si128(row4l, row1l);
1508 row4h = _mm_xor_si128(row4h, row1h);
1509 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1510 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1511 row3l = _mm_add_epi64(row3l, row4l);
1512 row3h = _mm_add_epi64(row3h, row4h);
1513 row2l = _mm_xor_si128(row2l, row3l);
1514 row2h = _mm_xor_si128(row2h, row3h);
1515 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1516 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1518 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1519 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1520 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1521 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1522 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1523 b0 = _mm_set_epi64x(m6, m0);
1524 b1 = _mm_set_epi64x(m8, m9);
1525 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1526 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1527 row4l = _mm_xor_si128(row4l, row1l);
1528 row4h = _mm_xor_si128(row4h, row1h);
1529 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1530 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1531 row3l = _mm_add_epi64(row3l, row4l);
1532 row3h = _mm_add_epi64(row3h, row4h);
1533 row2l = _mm_xor_si128(row2l, row3l);
1534 row2h = _mm_xor_si128(row2h, row3h);
1535 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1536 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1538 b0 = _mm_set_epi64x(m3, m7);
1539 b1 = _mm_set_epi64x(m11, m2);
1540 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1541 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1542 row4l = _mm_xor_si128(row4l, row1l);
1543 row4h = _mm_xor_si128(row4h, row1h);
1544 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1545 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1546 row3l = _mm_add_epi64(row3l, row4l);
1547 row3h = _mm_add_epi64(row3h, row4h);
1548 row2l = _mm_xor_si128(row2l, row3l);
1549 row2h = _mm_xor_si128(row2h, row3h);
1550 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1551 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1553 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1554 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1555 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1556 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1557 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1559 b0 = _mm_set_epi64x(m7, m13);
1560 b1 = _mm_set_epi64x(m3, m12);
1561 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1562 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1563 row4l = _mm_xor_si128(row4l, row1l);
1564 row4h = _mm_xor_si128(row4h, row1h);
1565 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1566 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1567 row3l = _mm_add_epi64(row3l, row4l);
1568 row3h = _mm_add_epi64(row3h, row4h);
1569 row2l = _mm_xor_si128(row2l, row3l);
1570 row2h = _mm_xor_si128(row2h, row3h);
1571 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1572 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1574 b0 = _mm_set_epi64x(m14, m11);
1575 b1 = _mm_set_epi64x(m9, m1);
1576 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1577 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1578 row4l = _mm_xor_si128(row4l, row1l);
1579 row4h = _mm_xor_si128(row4h, row1h);
1580 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1581 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1582 row3l = _mm_add_epi64(row3l, row4l);
1583 row3h = _mm_add_epi64(row3h, row4h);
1584 row2l = _mm_xor_si128(row2l, row3l);
1585 row2h = _mm_xor_si128(row2h, row3h);
1586 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1587 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1589 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1590 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1591 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1592 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1593 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1594 b0 = _mm_set_epi64x(m15, m5);
1595 b1 = _mm_set_epi64x(m2, m8);
1596 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1597 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1598 row4l = _mm_xor_si128(row4l, row1l);
1599 row4h = _mm_xor_si128(row4h, row1h);
1600 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1601 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1602 row3l = _mm_add_epi64(row3l, row4l);
1603 row3h = _mm_add_epi64(row3h, row4h);
1604 row2l = _mm_xor_si128(row2l, row3l);
1605 row2h = _mm_xor_si128(row2h, row3h);
1606 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1607 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1609 b0 = _mm_set_epi64x(m4, m0);
1610 b1 = _mm_set_epi64x(m10, m6);
1611 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1612 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1613 row4l = _mm_xor_si128(row4l, row1l);
1614 row4h = _mm_xor_si128(row4h, row1h);
1615 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1616 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1617 row3l = _mm_add_epi64(row3l, row4l);
1618 row3h = _mm_add_epi64(row3h, row4h);
1619 row2l = _mm_xor_si128(row2l, row3l);
1620 row2h = _mm_xor_si128(row2h, row3h);
1621 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1622 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1624 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1625 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1626 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1627 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1628 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1630 b0 = _mm_set_epi64x(m14, m6);
1631 b1 = _mm_set_epi64x(m0, m11);
1632 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1633 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1634 row4l = _mm_xor_si128(row4l, row1l);
1635 row4h = _mm_xor_si128(row4h, row1h);
1636 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1637 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1638 row3l = _mm_add_epi64(row3l, row4l);
1639 row3h = _mm_add_epi64(row3h, row4h);
1640 row2l = _mm_xor_si128(row2l, row3l);
1641 row2h = _mm_xor_si128(row2h, row3h);
1642 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1643 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1645 b0 = _mm_set_epi64x(m9, m15);
1646 b1 = _mm_set_epi64x(m8, m3);
1647 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1648 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1649 row4l = _mm_xor_si128(row4l, row1l);
1650 row4h = _mm_xor_si128(row4h, row1h);
1651 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1652 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1653 row3l = _mm_add_epi64(row3l, row4l);
1654 row3h = _mm_add_epi64(row3h, row4h);
1655 row2l = _mm_xor_si128(row2l, row3l);
1656 row2h = _mm_xor_si128(row2h, row3h);
1657 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1658 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1660 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1661 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1662 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1663 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1664 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1665 b0 = _mm_set_epi64x(m13, m12);
1666 b1 = _mm_set_epi64x(m10, m1);
1667 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1668 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1669 row4l = _mm_xor_si128(row4l, row1l);
1670 row4h = _mm_xor_si128(row4h, row1h);
1671 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1672 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1673 row3l = _mm_add_epi64(row3l, row4l);
1674 row3h = _mm_add_epi64(row3h, row4h);
1675 row2l = _mm_xor_si128(row2l, row3l);
1676 row2h = _mm_xor_si128(row2h, row3h);
1677 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1678 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1680 b0 = _mm_set_epi64x(m7, m2);
1681 b1 = _mm_set_epi64x(m5, m4);
1682 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1683 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1684 row4l = _mm_xor_si128(row4l, row1l);
1685 row4h = _mm_xor_si128(row4h, row1h);
1686 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1687 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1688 row3l = _mm_add_epi64(row3l, row4l);
1689 row3h = _mm_add_epi64(row3h, row4h);
1690 row2l = _mm_xor_si128(row2l, row3l);
1691 row2h = _mm_xor_si128(row2h, row3h);
1692 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1693 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1695 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1696 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1697 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1698 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1699 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1701 b0 = _mm_set_epi64x(m8, m10);
1702 b1 = _mm_set_epi64x(m1, m7);
1703 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1704 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1705 row4l = _mm_xor_si128(row4l, row1l);
1706 row4h = _mm_xor_si128(row4h, row1h);
1707 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1708 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1709 row3l = _mm_add_epi64(row3l, row4l);
1710 row3h = _mm_add_epi64(row3h, row4h);
1711 row2l = _mm_xor_si128(row2l, row3l);
1712 row2h = _mm_xor_si128(row2h, row3h);
1713 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1714 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1716 b0 = _mm_set_epi64x(m4, m2);
1717 b1 = _mm_set_epi64x(m5, m6);
1718 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1719 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1720 row4l = _mm_xor_si128(row4l, row1l);
1721 row4h = _mm_xor_si128(row4h, row1h);
1722 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1723 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1724 row3l = _mm_add_epi64(row3l, row4l);
1725 row3h = _mm_add_epi64(row3h, row4h);
1726 row2l = _mm_xor_si128(row2l, row3l);
1727 row2h = _mm_xor_si128(row2h, row3h);
1728 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1729 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1731 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1732 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1733 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1734 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1735 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1736 b0 = _mm_set_epi64x(m9, m15);
1737 b1 = _mm_set_epi64x(m13, m3);
1738 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1739 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1740 row4l = _mm_xor_si128(row4l, row1l);
1741 row4h = _mm_xor_si128(row4h, row1h);
1742 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1743 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1744 row3l = _mm_add_epi64(row3l, row4l);
1745 row3h = _mm_add_epi64(row3h, row4h);
1746 row2l = _mm_xor_si128(row2l, row3l);
1747 row2h = _mm_xor_si128(row2h, row3h);
1748 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1749 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1751 b0 = _mm_set_epi64x(m14, m11);
1752 b1 = _mm_set_epi64x(m0, m12);
1753 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1754 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1755 row4l = _mm_xor_si128(row4l, row1l);
1756 row4h = _mm_xor_si128(row4h, row1h);
1757 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1758 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1759 row3l = _mm_add_epi64(row3l, row4l);
1760 row3h = _mm_add_epi64(row3h, row4h);
1761 row2l = _mm_xor_si128(row2l, row3l);
1762 row2h = _mm_xor_si128(row2h, row3h);
1763 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1764 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1766 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1767 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1768 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1769 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1770 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1772 b0 = _mm_set_epi64x(m2, m0);
1773 b1 = _mm_set_epi64x(m6, m4);
1774 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1775 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1776 row4l = _mm_xor_si128(row4l, row1l);
1777 row4h = _mm_xor_si128(row4h, row1h);
1778 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1779 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1780 row3l = _mm_add_epi64(row3l, row4l);
1781 row3h = _mm_add_epi64(row3h, row4h);
1782 row2l = _mm_xor_si128(row2l, row3l);
1783 row2h = _mm_xor_si128(row2h, row3h);
1784 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1785 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1787 b0 = _mm_set_epi64x(m3, m1);
1788 b1 = _mm_set_epi64x(m7, m5);
1789 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1790 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1791 row4l = _mm_xor_si128(row4l, row1l);
1792 row4h = _mm_xor_si128(row4h, row1h);
1793 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1794 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1795 row3l = _mm_add_epi64(row3l, row4l);
1796 row3h = _mm_add_epi64(row3h, row4h);
1797 row2l = _mm_xor_si128(row2l, row3l);
1798 row2h = _mm_xor_si128(row2h, row3h);
1799 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1800 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1802 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1803 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1804 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1805 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1806 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1808 b0 = _mm_set_epi64x(m10, m8);
1809 b1 = _mm_set_epi64x(m14, m12);
1810 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1811 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1812 row4l = _mm_xor_si128(row4l, row1l);
1813 row4h = _mm_xor_si128(row4h, row1h);
1814 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1815 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1816 row3l = _mm_add_epi64(row3l, row4l);
1817 row3h = _mm_add_epi64(row3h, row4h);
1818 row2l = _mm_xor_si128(row2l, row3l);
1819 row2h = _mm_xor_si128(row2h, row3h);
1820 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1821 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1823 b0 = _mm_set_epi64x(m11, m9);
1824 b1 = _mm_set_epi64x(m15, m13);
1825 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1826 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1827 row4l = _mm_xor_si128(row4l, row1l);
1828 row4h = _mm_xor_si128(row4h, row1h);
1829 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1830 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1831 row3l = _mm_add_epi64(row3l, row4l);
1832 row3h = _mm_add_epi64(row3h, row4h);
1833 row2l = _mm_xor_si128(row2l, row3l);
1834 row2h = _mm_xor_si128(row2h, row3h);
1835 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1836 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1838 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1839 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1840 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1841 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1842 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1844 b0 = _mm_set_epi64x(m4, m14);
1845 b1 = _mm_set_epi64x(m13, m9);
1846 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1847 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1848 row4l = _mm_xor_si128(row4l, row1l);
1849 row4h = _mm_xor_si128(row4h, row1h);
1850 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1851 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1852 row3l = _mm_add_epi64(row3l, row4l);
1853 row3h = _mm_add_epi64(row3h, row4h);
1854 row2l = _mm_xor_si128(row2l, row3l);
1855 row2h = _mm_xor_si128(row2h, row3h);
1856 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1857 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1859 b0 = _mm_set_epi64x(m8, m10);
1860 b1 = _mm_set_epi64x(m6, m15);
1861 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1862 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1863 row4l = _mm_xor_si128(row4l, row1l);
1864 row4h = _mm_xor_si128(row4h, row1h);
1865 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1866 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1867 row3l = _mm_add_epi64(row3l, row4l);
1868 row3h = _mm_add_epi64(row3h, row4h);
1869 row2l = _mm_xor_si128(row2l, row3l);
1870 row2h = _mm_xor_si128(row2h, row3h);
1871 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1872 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1874 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
1875 row4l = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t0, t0));
1876 row4h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row4h, row4h));
1877 row2l = _mm_unpackhi_epi64(row2l, _mm_unpacklo_epi64(row2h, row2h));
1878 row2h = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(t1, t1));
1879 b0 = _mm_set_epi64x(m0, m1);
1880 b1 = _mm_set_epi64x(m5, m11);
1881 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1882 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1883 row4l = _mm_xor_si128(row4l, row1l);
1884 row4h = _mm_xor_si128(row4h, row1h);
1885 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,32),_mm_slli_epi64(row4l,32));
1886 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,32),_mm_slli_epi64(row4h,32));
1887 row3l = _mm_add_epi64(row3l, row4l);
1888 row3h = _mm_add_epi64(row3h, row4h);
1889 row2l = _mm_xor_si128(row2l, row3l);
1890 row2h = _mm_xor_si128(row2h, row3h);
1891 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,24),_mm_slli_epi64(row2l,40));
1892 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,24),_mm_slli_epi64(row2h,40));
1894 b0 = _mm_set_epi64x(m2, m12);
1895 b1 = _mm_set_epi64x(m3, m7);
1896 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
1897 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
1898 row4l = _mm_xor_si128(row4l, row1l);
1899 row4h = _mm_xor_si128(row4h, row1h);
1900 row4l = _mm_xor_si128(_mm_srli_epi64(row4l,16),_mm_slli_epi64(row4l,48));
1901 row4h = _mm_xor_si128(_mm_srli_epi64(row4h,16),_mm_slli_epi64(row4h,48));
1902 row3l = _mm_add_epi64(row3l, row4l);
1903 row3h = _mm_add_epi64(row3h, row4h);
1904 row2l = _mm_xor_si128(row2l, row3l);
1905 row2h = _mm_xor_si128(row2h, row3h);
1906 row2l = _mm_xor_si128(_mm_srli_epi64(row2l,63),_mm_slli_epi64(row2l,1));
1907 row2h = _mm_xor_si128(_mm_srli_epi64(row2h,63),_mm_slli_epi64(row2h,1));
1909 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
1910 row2l = _mm_unpackhi_epi64(row2h, _mm_unpacklo_epi64(row2l, row2l));
1911 row2h = _mm_unpackhi_epi64(t0, _mm_unpacklo_epi64(row2h, row2h));
1912 row4l = _mm_unpackhi_epi64(row4l, _mm_unpacklo_epi64(row4h, row4h));
1913 row4h = _mm_unpackhi_epi64(row4h, _mm_unpacklo_epi64(t1, t1));
1915 row1l = _mm_xor_si128(row3l, row1l);
1916 row1h = _mm_xor_si128(row3h, row1h);
1917 _mm_storeu_si128((__m128i *)(
void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0])), row1l));
1918 _mm_storeu_si128((__m128i *)(
void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[2])), row1h));
1920 row2l = _mm_xor_si128(row4l, row2l);
1921 row2h = _mm_xor_si128(row4h, row2h);
1922 _mm_storeu_si128((__m128i *)(
void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4])), row2l));
1923 _mm_storeu_si128((__m128i *)(
void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[6])), row2h));
1925 # endif // (__SUNPRO_CC != 0x5120) 1926 #endif // CRYPTOPP_BOOL_SSE2_INTRINSICS_AVAILABLE 1928 #if CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 1931 __m128i row1, row2, row3, row4;
1932 __m128i buf1, buf2, buf3, buf4;
1937 const __m128i r8 = _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
1938 const __m128i r16 = _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
1940 const __m128i m0 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 00));
1941 const __m128i m1 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 16));
1942 const __m128i m2 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 32));
1943 const __m128i m3 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 48));
1945 row1 = ff0 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0]));
1946 row2 = ff1 = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4]));
1947 row3 = _mm_setr_epi32(BLAKE2S_IV(0), BLAKE2S_IV(1), BLAKE2S_IV(2), BLAKE2S_IV(3));
1948 row4 = _mm_xor_si128(_mm_setr_epi32(BLAKE2S_IV(4), BLAKE2S_IV(5), BLAKE2S_IV(6), BLAKE2S_IV(7)), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.t[0])));
1949 buf1 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(2,0,2,0))));
1951 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1952 row4 = _mm_xor_si128(row4, row1);
1953 row4 = _mm_shuffle_epi8(row4,r16);
1954 row3 = _mm_add_epi32(row3, row4);
1955 row2 = _mm_xor_si128(row2, row3);
1956 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1958 buf2 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m0)), _mm_castsi128_ps((m1)), _MM_SHUFFLE(3,1,3,1))));
1960 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
1961 row4 = _mm_xor_si128(row4, row1);
1962 row4 = _mm_shuffle_epi8(row4,r8);
1963 row3 = _mm_add_epi32(row3, row4);
1964 row2 = _mm_xor_si128(row2, row3);
1965 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1967 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
1968 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1969 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
1971 buf3 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(2,0,2,0))));
1973 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
1974 row4 = _mm_xor_si128(row4, row1);
1975 row4 = _mm_shuffle_epi8(row4,r16);
1976 row3 = _mm_add_epi32(row3, row4);
1977 row2 = _mm_xor_si128(row2, row3);
1978 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
1980 buf4 = _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps((m2)), _mm_castsi128_ps((m3)), _MM_SHUFFLE(3,1,3,1))));
1982 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
1983 row4 = _mm_xor_si128(row4, row1);
1984 row4 = _mm_shuffle_epi8(row4,r8);
1985 row3 = _mm_add_epi32(row3, row4);
1986 row2 = _mm_xor_si128(row2, row3);
1987 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
1989 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
1990 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
1991 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
1993 t0 = _mm_blend_epi16(m1, m2, 0x0C);
1994 t1 = _mm_slli_si128(m3, 4);
1995 t2 = _mm_blend_epi16(t0, t1, 0xF0);
1996 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,0,3));
1998 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
1999 row4 = _mm_xor_si128(row4, row1);
2000 row4 = _mm_shuffle_epi8(row4,r16);
2001 row3 = _mm_add_epi32(row3, row4);
2002 row2 = _mm_xor_si128(row2, row3);
2003 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2005 t0 = _mm_shuffle_epi32(m2,_MM_SHUFFLE(0,0,2,0));
2006 t1 = _mm_blend_epi16(m1,m3,0xC0);
2007 t2 = _mm_blend_epi16(t0, t1, 0xF0);
2008 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2010 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2011 row4 = _mm_xor_si128(row4, row1);
2012 row4 = _mm_shuffle_epi8(row4,r8);
2013 row3 = _mm_add_epi32(row3, row4);
2014 row2 = _mm_xor_si128(row2, row3);
2015 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2017 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2018 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2019 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2021 t0 = _mm_slli_si128(m1, 4);
2022 t1 = _mm_blend_epi16(m2, t0, 0x30);
2023 t2 = _mm_blend_epi16(m0, t1, 0xF0);
2024 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2026 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2027 row4 = _mm_xor_si128(row4, row1);
2028 row4 = _mm_shuffle_epi8(row4,r16);
2029 row3 = _mm_add_epi32(row3, row4);
2030 row2 = _mm_xor_si128(row2, row3);
2031 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2033 t0 = _mm_unpackhi_epi32(m0,m1);
2034 t1 = _mm_slli_si128(m3, 4);
2035 t2 = _mm_blend_epi16(t0, t1, 0x0C);
2036 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,3,0,1));
2038 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2039 row4 = _mm_xor_si128(row4, row1);
2040 row4 = _mm_shuffle_epi8(row4,r8);
2041 row3 = _mm_add_epi32(row3, row4);
2042 row2 = _mm_xor_si128(row2, row3);
2043 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2045 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2046 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2047 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2049 t0 = _mm_unpackhi_epi32(m2,m3);
2050 t1 = _mm_blend_epi16(m3,m1,0x0C);
2051 t2 = _mm_blend_epi16(t0, t1, 0x0F);
2052 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2054 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2055 row4 = _mm_xor_si128(row4, row1);
2056 row4 = _mm_shuffle_epi8(row4,r16);
2057 row3 = _mm_add_epi32(row3, row4);
2058 row2 = _mm_xor_si128(row2, row3);
2059 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2061 t0 = _mm_unpacklo_epi32(m2,m0);
2062 t1 = _mm_blend_epi16(t0, m0, 0xF0);
2063 t2 = _mm_slli_si128(m3, 8);
2064 buf2 = _mm_blend_epi16(t1, t2, 0xC0);
2066 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2067 row4 = _mm_xor_si128(row4, row1);
2068 row4 = _mm_shuffle_epi8(row4,r8);
2069 row3 = _mm_add_epi32(row3, row4);
2070 row2 = _mm_xor_si128(row2, row3);
2071 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2073 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2074 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2075 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2077 t0 = _mm_blend_epi16(m0, m2, 0x3C);
2078 t1 = _mm_srli_si128(m1, 12);
2079 t2 = _mm_blend_epi16(t0,t1,0x03);
2080 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,3,2));
2082 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2083 row4 = _mm_xor_si128(row4, row1);
2084 row4 = _mm_shuffle_epi8(row4,r16);
2085 row3 = _mm_add_epi32(row3, row4);
2086 row2 = _mm_xor_si128(row2, row3);
2087 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2089 t0 = _mm_slli_si128(m3, 4);
2090 t1 = _mm_blend_epi16(m0, m1, 0x33);
2091 t2 = _mm_blend_epi16(t1, t0, 0xC0);
2092 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(0,1,2,3));
2094 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2095 row4 = _mm_xor_si128(row4, row1);
2096 row4 = _mm_shuffle_epi8(row4,r8);
2097 row3 = _mm_add_epi32(row3, row4);
2098 row2 = _mm_xor_si128(row2, row3);
2099 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2101 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2102 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2103 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2105 t0 = _mm_unpackhi_epi32(m0,m1);
2106 t1 = _mm_unpackhi_epi32(t0, m2);
2107 t2 = _mm_blend_epi16(t1, m3, 0x0C);
2108 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(3,1,0,2));
2110 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2111 row4 = _mm_xor_si128(row4, row1);
2112 row4 = _mm_shuffle_epi8(row4,r16);
2113 row3 = _mm_add_epi32(row3, row4);
2114 row2 = _mm_xor_si128(row2, row3);
2115 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2117 t0 = _mm_slli_si128(m2, 8);
2118 t1 = _mm_blend_epi16(m3,m0,0x0C);
2119 t2 = _mm_blend_epi16(t1, t0, 0xC0);
2120 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2122 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2123 row4 = _mm_xor_si128(row4, row1);
2124 row4 = _mm_shuffle_epi8(row4,r8);
2125 row3 = _mm_add_epi32(row3, row4);
2126 row2 = _mm_xor_si128(row2, row3);
2127 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2129 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2130 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2131 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2133 t0 = _mm_blend_epi16(m0,m1,0x0F);
2134 t1 = _mm_blend_epi16(t0, m3, 0xC0);
2135 buf3 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2137 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2138 row4 = _mm_xor_si128(row4, row1);
2139 row4 = _mm_shuffle_epi8(row4,r16);
2140 row3 = _mm_add_epi32(row3, row4);
2141 row2 = _mm_xor_si128(row2, row3);
2142 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2144 t0 = _mm_unpacklo_epi32(m0,m2);
2145 t1 = _mm_unpackhi_epi32(m1,m2);
2146 buf4 = _mm_unpacklo_epi64(t1,t0);
2148 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2149 row4 = _mm_xor_si128(row4, row1);
2150 row4 = _mm_shuffle_epi8(row4,r8);
2151 row3 = _mm_add_epi32(row3, row4);
2152 row2 = _mm_xor_si128(row2, row3);
2153 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2155 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2156 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2157 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2159 t0 = _mm_unpacklo_epi64(m1,m2);
2160 t1 = _mm_unpackhi_epi64(m0,m2);
2161 t2 = _mm_blend_epi16(t0,t1,0x33);
2162 buf1 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,0,1,3));
2164 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2165 row4 = _mm_xor_si128(row4, row1);
2166 row4 = _mm_shuffle_epi8(row4,r16);
2167 row3 = _mm_add_epi32(row3, row4);
2168 row2 = _mm_xor_si128(row2, row3);
2169 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2171 t0 = _mm_unpackhi_epi64(m1,m3);
2172 t1 = _mm_unpacklo_epi64(m0,m1);
2173 buf2 = _mm_blend_epi16(t0,t1,0x33);
2175 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2176 row4 = _mm_xor_si128(row4, row1);
2177 row4 = _mm_shuffle_epi8(row4,r8);
2178 row3 = _mm_add_epi32(row3, row4);
2179 row2 = _mm_xor_si128(row2, row3);
2180 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2182 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2183 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2184 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2186 t0 = _mm_unpackhi_epi64(m3,m1);
2187 t1 = _mm_unpackhi_epi64(m2,m0);
2188 buf3 = _mm_blend_epi16(t1,t0,0x33);
2190 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2191 row4 = _mm_xor_si128(row4, row1);
2192 row4 = _mm_shuffle_epi8(row4,r16);
2193 row3 = _mm_add_epi32(row3, row4);
2194 row2 = _mm_xor_si128(row2, row3);
2195 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2197 t0 = _mm_blend_epi16(m0,m2,0x03);
2198 t1 = _mm_slli_si128(t0, 8);
2199 t2 = _mm_blend_epi16(t1,m3,0x0F);
2200 buf4 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,0,3));
2202 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2203 row4 = _mm_xor_si128(row4, row1);
2204 row4 = _mm_shuffle_epi8(row4,r8);
2205 row3 = _mm_add_epi32(row3, row4);
2206 row2 = _mm_xor_si128(row2, row3);
2207 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2209 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2210 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2211 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2213 t0 = _mm_unpackhi_epi32(m0,m1);
2214 t1 = _mm_unpacklo_epi32(m0,m2);
2215 buf1 = _mm_unpacklo_epi64(t0,t1);
2217 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2218 row4 = _mm_xor_si128(row4, row1);
2219 row4 = _mm_shuffle_epi8(row4,r16);
2220 row3 = _mm_add_epi32(row3, row4);
2221 row2 = _mm_xor_si128(row2, row3);
2222 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2224 t0 = _mm_srli_si128(m2, 4);
2225 t1 = _mm_blend_epi16(m0,m3,0x03);
2226 buf2 = _mm_blend_epi16(t1,t0,0x3C);
2228 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2229 row4 = _mm_xor_si128(row4, row1);
2230 row4 = _mm_shuffle_epi8(row4,r8);
2231 row3 = _mm_add_epi32(row3, row4);
2232 row2 = _mm_xor_si128(row2, row3);
2233 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2235 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2236 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2237 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2239 t0 = _mm_blend_epi16(m1,m0,0x0C);
2240 t1 = _mm_srli_si128(m3, 4);
2241 t2 = _mm_blend_epi16(t0,t1,0x30);
2242 buf3 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,2,3,0));
2244 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2245 row4 = _mm_xor_si128(row4, row1);
2246 row4 = _mm_shuffle_epi8(row4,r16);
2247 row3 = _mm_add_epi32(row3, row4);
2248 row2 = _mm_xor_si128(row2, row3);
2249 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2251 t0 = _mm_unpacklo_epi64(m1,m2);
2252 t1= _mm_shuffle_epi32(m3, _MM_SHUFFLE(0,2,0,1));
2253 buf4 = _mm_blend_epi16(t0,t1,0x33);
2255 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2256 row4 = _mm_xor_si128(row4, row1);
2257 row4 = _mm_shuffle_epi8(row4,r8);
2258 row3 = _mm_add_epi32(row3, row4);
2259 row2 = _mm_xor_si128(row2, row3);
2260 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2262 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2263 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2264 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2266 t0 = _mm_slli_si128(m1, 12);
2267 t1 = _mm_blend_epi16(m0,m3,0x33);
2268 buf1 = _mm_blend_epi16(t1,t0,0xC0);
2270 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2271 row4 = _mm_xor_si128(row4, row1);
2272 row4 = _mm_shuffle_epi8(row4,r16);
2273 row3 = _mm_add_epi32(row3, row4);
2274 row2 = _mm_xor_si128(row2, row3);
2275 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2277 t0 = _mm_blend_epi16(m3,m2,0x30);
2278 t1 = _mm_srli_si128(m1, 4);
2279 t2 = _mm_blend_epi16(t0,t1,0x03);
2280 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2,1,3,0));
2282 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2283 row4 = _mm_xor_si128(row4, row1);
2284 row4 = _mm_shuffle_epi8(row4,r8);
2285 row3 = _mm_add_epi32(row3, row4);
2286 row2 = _mm_xor_si128(row2, row3);
2287 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2289 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2290 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2291 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2293 t0 = _mm_unpacklo_epi64(m0,m2);
2294 t1 = _mm_srli_si128(m1, 4);
2295 buf3 = _mm_shuffle_epi32(_mm_blend_epi16(t0,t1,0x0C), _MM_SHUFFLE(2,3,1,0));
2297 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2298 row4 = _mm_xor_si128(row4, row1);
2299 row4 = _mm_shuffle_epi8(row4,r16);
2300 row3 = _mm_add_epi32(row3, row4);
2301 row2 = _mm_xor_si128(row2, row3);
2302 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2304 t0 = _mm_unpackhi_epi32(m1,m2);
2305 t1 = _mm_unpackhi_epi64(m0,t0);
2306 buf4 = _mm_shuffle_epi32(t1, _MM_SHUFFLE(3,0,1,2));
2308 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2309 row4 = _mm_xor_si128(row4, row1);
2310 row4 = _mm_shuffle_epi8(row4,r8);
2311 row3 = _mm_add_epi32(row3, row4);
2312 row2 = _mm_xor_si128(row2, row3);
2313 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2315 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2316 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2317 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2319 t0 = _mm_unpackhi_epi32(m0,m1);
2320 t1 = _mm_blend_epi16(t0,m3,0x0F);
2321 buf1 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(2,0,3,1));
2323 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2324 row4 = _mm_xor_si128(row4, row1);
2325 row4 = _mm_shuffle_epi8(row4,r16);
2326 row3 = _mm_add_epi32(row3, row4);
2327 row2 = _mm_xor_si128(row2, row3);
2328 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2330 t0 = _mm_blend_epi16(m2,m3,0x30);
2331 t1 = _mm_srli_si128(m0,4);
2332 t2 = _mm_blend_epi16(t0,t1,0x03);
2333 buf2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(1,0,2,3));
2335 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2336 row4 = _mm_xor_si128(row4, row1);
2337 row4 = _mm_shuffle_epi8(row4,r8);
2338 row3 = _mm_add_epi32(row3, row4);
2339 row2 = _mm_xor_si128(row2, row3);
2340 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2342 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2343 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2344 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2346 t0 = _mm_unpackhi_epi64(m0,m3);
2347 t1 = _mm_unpacklo_epi64(m1,m2);
2348 t2 = _mm_blend_epi16(t0,t1,0x3C);
2349 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,2,3,1));
2351 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2352 row4 = _mm_xor_si128(row4, row1);
2353 row4 = _mm_shuffle_epi8(row4,r16);
2354 row3 = _mm_add_epi32(row3, row4);
2355 row2 = _mm_xor_si128(row2, row3);
2356 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2358 t0 = _mm_unpacklo_epi32(m0,m1);
2359 t1 = _mm_unpackhi_epi32(m1,m2);
2360 buf4 = _mm_unpacklo_epi64(t0,t1);
2362 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2363 row4 = _mm_xor_si128(row4, row1);
2364 row4 = _mm_shuffle_epi8(row4,r8);
2365 row3 = _mm_add_epi32(row3, row4);
2366 row2 = _mm_xor_si128(row2, row3);
2367 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2369 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2370 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2371 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2373 t0 = _mm_unpackhi_epi32(m1,m3);
2374 t1 = _mm_unpacklo_epi64(t0,m0);
2375 t2 = _mm_blend_epi16(t1,m2,0xC0);
2376 buf1 = _mm_shufflehi_epi16(t2,_MM_SHUFFLE(1,0,3,2));
2378 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2379 row4 = _mm_xor_si128(row4, row1);
2380 row4 = _mm_shuffle_epi8(row4,r16);
2381 row3 = _mm_add_epi32(row3, row4);
2382 row2 = _mm_xor_si128(row2, row3);
2383 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2385 t0 = _mm_unpackhi_epi32(m0,m3);
2386 t1 = _mm_blend_epi16(m2,t0,0xF0);
2387 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(0,2,1,3));
2389 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2390 row4 = _mm_xor_si128(row4, row1);
2391 row4 = _mm_shuffle_epi8(row4,r8);
2392 row3 = _mm_add_epi32(row3, row4);
2393 row2 = _mm_xor_si128(row2, row3);
2394 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2396 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2397 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2398 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2400 t0 = _mm_blend_epi16(m2,m0,0x0C);
2401 t1 = _mm_slli_si128(t0,4);
2402 buf3 = _mm_blend_epi16(t1,m3,0x0F);
2404 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2405 row4 = _mm_xor_si128(row4, row1);
2406 row4 = _mm_shuffle_epi8(row4,r16);
2407 row3 = _mm_add_epi32(row3, row4);
2408 row2 = _mm_xor_si128(row2, row3);
2409 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2411 t0 = _mm_blend_epi16(m1,m0,0x30);
2412 buf4 = _mm_shuffle_epi32(t0,_MM_SHUFFLE(1,0,3,2));
2414 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2415 row4 = _mm_xor_si128(row4, row1);
2416 row4 = _mm_shuffle_epi8(row4,r8);
2417 row3 = _mm_add_epi32(row3, row4);
2418 row2 = _mm_xor_si128(row2, row3);
2419 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2421 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2422 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2423 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2425 t0 = _mm_blend_epi16(m0,m2,0x03);
2426 t1 = _mm_blend_epi16(m1,m2,0x30);
2427 t2 = _mm_blend_epi16(t1,t0,0x0F);
2428 buf1 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(1,3,0,2));
2430 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2);
2431 row4 = _mm_xor_si128(row4, row1);
2432 row4 = _mm_shuffle_epi8(row4,r16);
2433 row3 = _mm_add_epi32(row3, row4);
2434 row2 = _mm_xor_si128(row2, row3);
2435 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2437 t0 = _mm_slli_si128(m0,4);
2438 t1 = _mm_blend_epi16(m1,t0,0xC0);
2439 buf2 = _mm_shuffle_epi32(t1,_MM_SHUFFLE(1,2,0,3));
2441 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf2), row2);
2442 row4 = _mm_xor_si128(row4, row1);
2443 row4 = _mm_shuffle_epi8(row4,r8);
2444 row3 = _mm_add_epi32(row3, row4);
2445 row2 = _mm_xor_si128(row2, row3);
2446 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2448 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3));
2449 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2450 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1));
2452 t0 = _mm_unpackhi_epi32(m0,m3);
2453 t1 = _mm_unpacklo_epi32(m2,m3);
2454 t2 = _mm_unpackhi_epi64(t0,t1);
2455 buf3 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(3,0,2,1));
2457 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf3), row2);
2458 row4 = _mm_xor_si128(row4, row1);
2459 row4 = _mm_shuffle_epi8(row4,r16);
2460 row3 = _mm_add_epi32(row3, row4);
2461 row2 = _mm_xor_si128(row2, row3);
2462 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12),_mm_slli_epi32(row2, 20));
2464 t0 = _mm_blend_epi16(m3,m2,0xC0);
2465 t1 = _mm_unpacklo_epi32(m0,m3);
2466 t2 = _mm_blend_epi16(t0,t1,0x0F);
2467 buf4 = _mm_shuffle_epi32(t2,_MM_SHUFFLE(0,1,2,3));
2469 row1 = _mm_add_epi32(_mm_add_epi32(row1, buf4), row2);
2470 row4 = _mm_xor_si128(row4, row1);
2471 row4 = _mm_shuffle_epi8(row4,r8);
2472 row3 = _mm_add_epi32(row3, row4);
2473 row2 = _mm_xor_si128(row2, row3);
2474 row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7),_mm_slli_epi32(row2, 25));
2476 row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1));
2477 row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2));
2478 row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));
2480 _mm_storeu_si128((__m128i *)(
void*)(&state.h[0]), _mm_xor_si128(ff0, _mm_xor_si128(row1, row3)));
2481 _mm_storeu_si128((__m128i *)(
void*)(&state.h[4]), _mm_xor_si128(ff1, _mm_xor_si128(row2, row4)));
2486 __m128i row1l, row1h;
2487 __m128i row2l, row2h;
2488 __m128i row3l, row3h;
2489 __m128i row4l, row4h;
2490 __m128i b0, b1, t0, t1;
2492 const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
2493 const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
2495 const __m128i m0 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 00));
2496 const __m128i m1 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 16));
2497 const __m128i m2 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 32));
2498 const __m128i m3 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 48));
2499 const __m128i m4 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 64));
2500 const __m128i m5 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 80));
2501 const __m128i m6 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 96));
2502 const __m128i m7 = _mm_loadu_si128((
const __m128i*)(
const void*)(input + 112));
2504 row1l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0]));
2505 row1h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[2]));
2506 row2l = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4]));
2507 row2h = _mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[6]));
2508 row3l = _mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(0)));
2509 row3h = _mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(2)));
2510 row4l = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(4))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.t[0])));
2511 row4h = _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&BLAKE2B_IV(6))), _mm_loadu_si128((
const __m128i*)(
const void*)(&state.f[0])));
2513 b0 = _mm_unpacklo_epi64(m0, m1);
2514 b1 = _mm_unpacklo_epi64(m2, m3);
2515 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2516 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2517 row4l = _mm_xor_si128(row4l, row1l);
2518 row4h = _mm_xor_si128(row4h, row1h);
2519 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2520 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2521 row3l = _mm_add_epi64(row3l, row4l);
2522 row3h = _mm_add_epi64(row3h, row4h);
2523 row2l = _mm_xor_si128(row2l, row3l);
2524 row2h = _mm_xor_si128(row2h, row3h);
2525 row2l = _mm_shuffle_epi8(row2l, r24);
2526 row2h = _mm_shuffle_epi8(row2h, r24);
2528 b0 = _mm_unpackhi_epi64(m0, m1);
2529 b1 = _mm_unpackhi_epi64(m2, m3);
2531 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2532 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2533 row4l = _mm_xor_si128(row4l, row1l);
2534 row4h = _mm_xor_si128(row4h, row1h);
2535 row4l = _mm_shuffle_epi8(row4l, r16);
2536 row4h = _mm_shuffle_epi8(row4h, r16);
2537 row3l = _mm_add_epi64(row3l, row4l);
2538 row3h = _mm_add_epi64(row3h, row4h);
2539 row2l = _mm_xor_si128(row2l, row3l);
2540 row2h = _mm_xor_si128(row2h, row3h);
2541 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2542 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2544 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2545 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2546 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2547 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2548 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2549 row4l = t1, row4h = t0;
2551 b0 = _mm_unpacklo_epi64(m4, m5);
2552 b1 = _mm_unpacklo_epi64(m6, m7);
2554 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2555 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2556 row4l = _mm_xor_si128(row4l, row1l);
2557 row4h = _mm_xor_si128(row4h, row1h);
2558 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2559 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2560 row3l = _mm_add_epi64(row3l, row4l);
2561 row3h = _mm_add_epi64(row3h, row4h);
2562 row2l = _mm_xor_si128(row2l, row3l);
2563 row2h = _mm_xor_si128(row2h, row3h);
2564 row2l = _mm_shuffle_epi8(row2l, r24);
2565 row2h = _mm_shuffle_epi8(row2h, r24);
2567 b0 = _mm_unpackhi_epi64(m4, m5);
2568 b1 = _mm_unpackhi_epi64(m6, m7);
2570 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2571 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2572 row4l = _mm_xor_si128(row4l, row1l);
2573 row4h = _mm_xor_si128(row4h, row1h);
2574 row4l = _mm_shuffle_epi8(row4l, r16);
2575 row4h = _mm_shuffle_epi8(row4h, r16);
2576 row3l = _mm_add_epi64(row3l, row4l);
2577 row3h = _mm_add_epi64(row3h, row4h);
2578 row2l = _mm_xor_si128(row2l, row3l);
2579 row2h = _mm_xor_si128(row2h, row3h);
2580 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2581 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2583 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2584 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2585 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2586 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2587 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2588 row4l = t1, row4h = t0;
2590 b0 = _mm_unpacklo_epi64(m7, m2);
2591 b1 = _mm_unpackhi_epi64(m4, m6);
2593 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2594 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2595 row4l = _mm_xor_si128(row4l, row1l);
2596 row4h = _mm_xor_si128(row4h, row1h);
2597 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2598 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2599 row3l = _mm_add_epi64(row3l, row4l);
2600 row3h = _mm_add_epi64(row3h, row4h);
2601 row2l = _mm_xor_si128(row2l, row3l);
2602 row2h = _mm_xor_si128(row2h, row3h);
2603 row2l = _mm_shuffle_epi8(row2l, r24);
2604 row2h = _mm_shuffle_epi8(row2h, r24);
2606 b0 = _mm_unpacklo_epi64(m5, m4);
2607 b1 = _mm_alignr_epi8(m3, m7, 8);
2609 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2610 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2611 row4l = _mm_xor_si128(row4l, row1l);
2612 row4h = _mm_xor_si128(row4h, row1h);
2613 row4l = _mm_shuffle_epi8(row4l, r16);
2614 row4h = _mm_shuffle_epi8(row4h, r16);
2615 row3l = _mm_add_epi64(row3l, row4l);
2616 row3h = _mm_add_epi64(row3h, row4h);
2617 row2l = _mm_xor_si128(row2l, row3l);
2618 row2h = _mm_xor_si128(row2h, row3h);
2619 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2620 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2622 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2623 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2624 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2625 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2626 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2627 row4l = t1, row4h = t0;
2629 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
2630 b1 = _mm_unpackhi_epi64(m5, m2);
2632 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2633 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2634 row4l = _mm_xor_si128(row4l, row1l);
2635 row4h = _mm_xor_si128(row4h, row1h);
2636 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2637 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2638 row3l = _mm_add_epi64(row3l, row4l);
2639 row3h = _mm_add_epi64(row3h, row4h);
2640 row2l = _mm_xor_si128(row2l, row3l);
2641 row2h = _mm_xor_si128(row2h, row3h);
2642 row2l = _mm_shuffle_epi8(row2l, r24);
2643 row2h = _mm_shuffle_epi8(row2h, r24);
2645 b0 = _mm_unpacklo_epi64(m6, m1);
2646 b1 = _mm_unpackhi_epi64(m3, m1);
2648 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2649 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2650 row4l = _mm_xor_si128(row4l, row1l);
2651 row4h = _mm_xor_si128(row4h, row1h);
2652 row4l = _mm_shuffle_epi8(row4l, r16);
2653 row4h = _mm_shuffle_epi8(row4h, r16);
2654 row3l = _mm_add_epi64(row3l, row4l);
2655 row3h = _mm_add_epi64(row3h, row4h);
2656 row2l = _mm_xor_si128(row2l, row3l);
2657 row2h = _mm_xor_si128(row2h, row3h);
2658 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2659 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2661 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2662 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2663 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2664 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2665 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2666 row4l = t1, row4h = t0;
2668 b0 = _mm_alignr_epi8(m6, m5, 8);
2669 b1 = _mm_unpackhi_epi64(m2, m7);
2671 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2672 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2673 row4l = _mm_xor_si128(row4l, row1l);
2674 row4h = _mm_xor_si128(row4h, row1h);
2675 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2676 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2677 row3l = _mm_add_epi64(row3l, row4l);
2678 row3h = _mm_add_epi64(row3h, row4h);
2679 row2l = _mm_xor_si128(row2l, row3l);
2680 row2h = _mm_xor_si128(row2h, row3h);
2681 row2l = _mm_shuffle_epi8(row2l, r24);
2682 row2h = _mm_shuffle_epi8(row2h, r24);
2684 b0 = _mm_unpacklo_epi64(m4, m0);
2685 b1 = _mm_blend_epi16(m1, m6, 0xF0);
2687 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2688 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2689 row4l = _mm_xor_si128(row4l, row1l);
2690 row4h = _mm_xor_si128(row4h, row1h);
2691 row4l = _mm_shuffle_epi8(row4l, r16);
2692 row4h = _mm_shuffle_epi8(row4h, r16);
2693 row3l = _mm_add_epi64(row3l, row4l);
2694 row3h = _mm_add_epi64(row3h, row4h);
2695 row2l = _mm_xor_si128(row2l, row3l);
2696 row2h = _mm_xor_si128(row2h, row3h);
2697 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2698 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2700 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2701 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2702 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2703 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2704 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2705 row4l = t1, row4h = t0;
2707 b0 = _mm_blend_epi16(m5, m1, 0xF0);
2708 b1 = _mm_unpackhi_epi64(m3, m4);
2710 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2711 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2712 row4l = _mm_xor_si128(row4l, row1l);
2713 row4h = _mm_xor_si128(row4h, row1h);
2714 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2715 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2716 row3l = _mm_add_epi64(row3l, row4l);
2717 row3h = _mm_add_epi64(row3h, row4h);
2718 row2l = _mm_xor_si128(row2l, row3l);
2719 row2h = _mm_xor_si128(row2h, row3h);
2720 row2l = _mm_shuffle_epi8(row2l, r24);
2721 row2h = _mm_shuffle_epi8(row2h, r24);
2723 b0 = _mm_unpacklo_epi64(m7, m3);
2724 b1 = _mm_alignr_epi8(m2, m0, 8);
2726 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2727 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2728 row4l = _mm_xor_si128(row4l, row1l);
2729 row4h = _mm_xor_si128(row4h, row1h);
2730 row4l = _mm_shuffle_epi8(row4l, r16);
2731 row4h = _mm_shuffle_epi8(row4h, r16);
2732 row3l = _mm_add_epi64(row3l, row4l);
2733 row3h = _mm_add_epi64(row3h, row4h);
2734 row2l = _mm_xor_si128(row2l, row3l);
2735 row2h = _mm_xor_si128(row2h, row3h);
2736 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2737 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2739 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2740 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2741 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2742 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2743 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2744 row4l = t1, row4h = t0;
2746 b0 = _mm_unpackhi_epi64(m3, m1);
2747 b1 = _mm_unpackhi_epi64(m6, m5);
2749 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2750 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2751 row4l = _mm_xor_si128(row4l, row1l);
2752 row4h = _mm_xor_si128(row4h, row1h);
2753 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2754 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2755 row3l = _mm_add_epi64(row3l, row4l);
2756 row3h = _mm_add_epi64(row3h, row4h);
2757 row2l = _mm_xor_si128(row2l, row3l);
2758 row2h = _mm_xor_si128(row2h, row3h);
2759 row2l = _mm_shuffle_epi8(row2l, r24);
2760 row2h = _mm_shuffle_epi8(row2h, r24);
2762 b0 = _mm_unpackhi_epi64(m4, m0);
2763 b1 = _mm_unpacklo_epi64(m6, m7);
2765 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2766 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2767 row4l = _mm_xor_si128(row4l, row1l);
2768 row4h = _mm_xor_si128(row4h, row1h);
2769 row4l = _mm_shuffle_epi8(row4l, r16);
2770 row4h = _mm_shuffle_epi8(row4h, r16);
2771 row3l = _mm_add_epi64(row3l, row4l);
2772 row3h = _mm_add_epi64(row3h, row4h);
2773 row2l = _mm_xor_si128(row2l, row3l);
2774 row2h = _mm_xor_si128(row2h, row3h);
2775 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2776 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2778 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2779 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2780 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2781 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2782 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2783 row4l = t1, row4h = t0;
2785 b0 = _mm_blend_epi16(m1, m2, 0xF0);
2786 b1 = _mm_blend_epi16(m2, m7, 0xF0);
2788 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2789 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2790 row4l = _mm_xor_si128(row4l, row1l);
2791 row4h = _mm_xor_si128(row4h, row1h);
2792 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2793 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2794 row3l = _mm_add_epi64(row3l, row4l);
2795 row3h = _mm_add_epi64(row3h, row4h);
2796 row2l = _mm_xor_si128(row2l, row3l);
2797 row2h = _mm_xor_si128(row2h, row3h);
2798 row2l = _mm_shuffle_epi8(row2l, r24);
2799 row2h = _mm_shuffle_epi8(row2h, r24);
2801 b0 = _mm_unpacklo_epi64(m3, m5);
2802 b1 = _mm_unpacklo_epi64(m0, m4);
2804 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2805 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2806 row4l = _mm_xor_si128(row4l, row1l);
2807 row4h = _mm_xor_si128(row4h, row1h);
2808 row4l = _mm_shuffle_epi8(row4l, r16);
2809 row4h = _mm_shuffle_epi8(row4h, r16);
2810 row3l = _mm_add_epi64(row3l, row4l);
2811 row3h = _mm_add_epi64(row3h, row4h);
2812 row2l = _mm_xor_si128(row2l, row3l);
2813 row2h = _mm_xor_si128(row2h, row3h);
2814 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2815 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2817 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2818 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2819 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2820 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2821 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2822 row4l = t1, row4h = t0;
2824 b0 = _mm_unpackhi_epi64(m4, m2);
2825 b1 = _mm_unpacklo_epi64(m1, m5);
2827 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2828 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2829 row4l = _mm_xor_si128(row4l, row1l);
2830 row4h = _mm_xor_si128(row4h, row1h);
2831 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2832 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2833 row3l = _mm_add_epi64(row3l, row4l);
2834 row3h = _mm_add_epi64(row3h, row4h);
2835 row2l = _mm_xor_si128(row2l, row3l);
2836 row2h = _mm_xor_si128(row2h, row3h);
2837 row2l = _mm_shuffle_epi8(row2l, r24);
2838 row2h = _mm_shuffle_epi8(row2h, r24);
2840 b0 = _mm_blend_epi16(m0, m3, 0xF0);
2841 b1 = _mm_blend_epi16(m2, m7, 0xF0);
2843 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2844 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2845 row4l = _mm_xor_si128(row4l, row1l);
2846 row4h = _mm_xor_si128(row4h, row1h);
2847 row4l = _mm_shuffle_epi8(row4l, r16);
2848 row4h = _mm_shuffle_epi8(row4h, r16);
2849 row3l = _mm_add_epi64(row3l, row4l);
2850 row3h = _mm_add_epi64(row3h, row4h);
2851 row2l = _mm_xor_si128(row2l, row3l);
2852 row2h = _mm_xor_si128(row2h, row3h);
2853 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2854 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2856 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2857 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2858 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2859 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2860 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2861 row4l = t1, row4h = t0;
2863 b0 = _mm_blend_epi16(m7, m5, 0xF0);
2864 b1 = _mm_blend_epi16(m3, m1, 0xF0);
2866 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2867 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2868 row4l = _mm_xor_si128(row4l, row1l);
2869 row4h = _mm_xor_si128(row4h, row1h);
2870 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2871 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2872 row3l = _mm_add_epi64(row3l, row4l);
2873 row3h = _mm_add_epi64(row3h, row4h);
2874 row2l = _mm_xor_si128(row2l, row3l);
2875 row2h = _mm_xor_si128(row2h, row3h);
2876 row2l = _mm_shuffle_epi8(row2l, r24);
2877 row2h = _mm_shuffle_epi8(row2h, r24);
2879 b0 = _mm_alignr_epi8(m6, m0, 8);
2880 b1 = _mm_blend_epi16(m4, m6, 0xF0);
2882 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2883 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2884 row4l = _mm_xor_si128(row4l, row1l);
2885 row4h = _mm_xor_si128(row4h, row1h);
2886 row4l = _mm_shuffle_epi8(row4l, r16);
2887 row4h = _mm_shuffle_epi8(row4h, r16);
2888 row3l = _mm_add_epi64(row3l, row4l);
2889 row3h = _mm_add_epi64(row3h, row4h);
2890 row2l = _mm_xor_si128(row2l, row3l);
2891 row2h = _mm_xor_si128(row2h, row3h);
2892 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2893 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2895 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2896 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2897 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2898 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2899 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2900 row4l = t1, row4h = t0;
2902 b0 = _mm_unpacklo_epi64(m1, m3);
2903 b1 = _mm_unpacklo_epi64(m0, m4);
2905 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2906 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2907 row4l = _mm_xor_si128(row4l, row1l);
2908 row4h = _mm_xor_si128(row4h, row1h);
2909 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2910 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2911 row3l = _mm_add_epi64(row3l, row4l);
2912 row3h = _mm_add_epi64(row3h, row4h);
2913 row2l = _mm_xor_si128(row2l, row3l);
2914 row2h = _mm_xor_si128(row2h, row3h);
2915 row2l = _mm_shuffle_epi8(row2l, r24);
2916 row2h = _mm_shuffle_epi8(row2h, r24);
2918 b0 = _mm_unpacklo_epi64(m6, m5);
2919 b1 = _mm_unpackhi_epi64(m5, m1);
2921 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2922 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2923 row4l = _mm_xor_si128(row4l, row1l);
2924 row4h = _mm_xor_si128(row4h, row1h);
2925 row4l = _mm_shuffle_epi8(row4l, r16);
2926 row4h = _mm_shuffle_epi8(row4h, r16);
2927 row3l = _mm_add_epi64(row3l, row4l);
2928 row3h = _mm_add_epi64(row3h, row4h);
2929 row2l = _mm_xor_si128(row2l, row3l);
2930 row2h = _mm_xor_si128(row2h, row3h);
2931 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2932 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2934 t0 = _mm_alignr_epi8(row2h, row2l, 8);
2935 t1 = _mm_alignr_epi8(row2l, row2h, 8);
2936 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2937 t0 = _mm_alignr_epi8(row4h, row4l, 8);
2938 t1 = _mm_alignr_epi8(row4l, row4h, 8);
2939 row4l = t1, row4h = t0;
2941 b0 = _mm_blend_epi16(m2, m3, 0xF0);
2942 b1 = _mm_unpackhi_epi64(m7, m0);
2944 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2945 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2946 row4l = _mm_xor_si128(row4l, row1l);
2947 row4h = _mm_xor_si128(row4h, row1h);
2948 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2949 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2950 row3l = _mm_add_epi64(row3l, row4l);
2951 row3h = _mm_add_epi64(row3h, row4h);
2952 row2l = _mm_xor_si128(row2l, row3l);
2953 row2h = _mm_xor_si128(row2h, row3h);
2954 row2l = _mm_shuffle_epi8(row2l, r24);
2955 row2h = _mm_shuffle_epi8(row2h, r24);
2957 b0 = _mm_unpackhi_epi64(m6, m2);
2958 b1 = _mm_blend_epi16(m7, m4, 0xF0);
2960 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2961 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2962 row4l = _mm_xor_si128(row4l, row1l);
2963 row4h = _mm_xor_si128(row4h, row1h);
2964 row4l = _mm_shuffle_epi8(row4l, r16);
2965 row4h = _mm_shuffle_epi8(row4h, r16);
2966 row3l = _mm_add_epi64(row3l, row4l);
2967 row3h = _mm_add_epi64(row3h, row4h);
2968 row2l = _mm_xor_si128(row2l, row3l);
2969 row2h = _mm_xor_si128(row2h, row3h);
2970 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
2971 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
2973 t0 = _mm_alignr_epi8(row2l, row2h, 8);
2974 t1 = _mm_alignr_epi8(row2h, row2l, 8);
2975 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
2976 t0 = _mm_alignr_epi8(row4l, row4h, 8);
2977 t1 = _mm_alignr_epi8(row4h, row4l, 8);
2978 row4l = t1, row4h = t0;
2980 b0 = _mm_blend_epi16(m6, m0, 0xF0);
2981 b1 = _mm_unpacklo_epi64(m7, m2);
2983 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
2984 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
2985 row4l = _mm_xor_si128(row4l, row1l);
2986 row4h = _mm_xor_si128(row4h, row1h);
2987 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
2988 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
2989 row3l = _mm_add_epi64(row3l, row4l);
2990 row3h = _mm_add_epi64(row3h, row4h);
2991 row2l = _mm_xor_si128(row2l, row3l);
2992 row2h = _mm_xor_si128(row2h, row3h);
2993 row2l = _mm_shuffle_epi8(row2l, r24);
2994 row2h = _mm_shuffle_epi8(row2h, r24);
2996 b0 = _mm_unpackhi_epi64(m2, m7);
2997 b1 = _mm_alignr_epi8(m5, m6, 8);
2999 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3000 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3001 row4l = _mm_xor_si128(row4l, row1l);
3002 row4h = _mm_xor_si128(row4h, row1h);
3003 row4l = _mm_shuffle_epi8(row4l, r16);
3004 row4h = _mm_shuffle_epi8(row4h, r16);
3005 row3l = _mm_add_epi64(row3l, row4l);
3006 row3h = _mm_add_epi64(row3h, row4h);
3007 row2l = _mm_xor_si128(row2l, row3l);
3008 row2h = _mm_xor_si128(row2h, row3h);
3009 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3010 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3012 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3013 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3014 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3015 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3016 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3017 row4l = t1, row4h = t0;
3019 b0 = _mm_unpacklo_epi64(m0, m3);
3020 b1 = _mm_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));
3022 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3023 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3024 row4l = _mm_xor_si128(row4l, row1l);
3025 row4h = _mm_xor_si128(row4h, row1h);
3026 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3027 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3028 row3l = _mm_add_epi64(row3l, row4l);
3029 row3h = _mm_add_epi64(row3h, row4h);
3030 row2l = _mm_xor_si128(row2l, row3l);
3031 row2h = _mm_xor_si128(row2h, row3h);
3032 row2l = _mm_shuffle_epi8(row2l, r24);
3033 row2h = _mm_shuffle_epi8(row2h, r24);
3035 b0 = _mm_unpackhi_epi64(m3, m1);
3036 b1 = _mm_blend_epi16(m1, m5, 0xF0);
3038 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3039 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3040 row4l = _mm_xor_si128(row4l, row1l);
3041 row4h = _mm_xor_si128(row4h, row1h);
3042 row4l = _mm_shuffle_epi8(row4l, r16);
3043 row4h = _mm_shuffle_epi8(row4h, r16);
3044 row3l = _mm_add_epi64(row3l, row4l);
3045 row3h = _mm_add_epi64(row3h, row4h);
3046 row2l = _mm_xor_si128(row2l, row3l);
3047 row2h = _mm_xor_si128(row2h, row3h);
3048 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3049 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3051 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3052 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3053 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3054 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3055 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3056 row4l = t1, row4h = t0;
3058 b0 = _mm_unpackhi_epi64(m6, m3);
3059 b1 = _mm_blend_epi16(m6, m1, 0xF0);
3061 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3062 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3063 row4l = _mm_xor_si128(row4l, row1l);
3064 row4h = _mm_xor_si128(row4h, row1h);
3065 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3066 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3067 row3l = _mm_add_epi64(row3l, row4l);
3068 row3h = _mm_add_epi64(row3h, row4h);
3069 row2l = _mm_xor_si128(row2l, row3l);
3070 row2h = _mm_xor_si128(row2h, row3h);
3071 row2l = _mm_shuffle_epi8(row2l, r24);
3072 row2h = _mm_shuffle_epi8(row2h, r24);
3074 b0 = _mm_alignr_epi8(m7, m5, 8);
3075 b1 = _mm_unpackhi_epi64(m0, m4);
3077 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3078 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3079 row4l = _mm_xor_si128(row4l, row1l);
3080 row4h = _mm_xor_si128(row4h, row1h);
3081 row4l = _mm_shuffle_epi8(row4l, r16);
3082 row4h = _mm_shuffle_epi8(row4h, r16);
3083 row3l = _mm_add_epi64(row3l, row4l);
3084 row3h = _mm_add_epi64(row3h, row4h);
3085 row2l = _mm_xor_si128(row2l, row3l);
3086 row2h = _mm_xor_si128(row2h, row3h);
3087 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3088 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3090 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3091 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3092 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3093 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3094 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3095 row4l = t1, row4h = t0;
3097 b0 = _mm_unpackhi_epi64(m2, m7);
3098 b1 = _mm_unpacklo_epi64(m4, m1);
3100 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3101 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3102 row4l = _mm_xor_si128(row4l, row1l);
3103 row4h = _mm_xor_si128(row4h, row1h);
3104 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3105 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3106 row3l = _mm_add_epi64(row3l, row4l);
3107 row3h = _mm_add_epi64(row3h, row4h);
3108 row2l = _mm_xor_si128(row2l, row3l);
3109 row2h = _mm_xor_si128(row2h, row3h);
3110 row2l = _mm_shuffle_epi8(row2l, r24);
3111 row2h = _mm_shuffle_epi8(row2h, r24);
3113 b0 = _mm_unpacklo_epi64(m0, m2);
3114 b1 = _mm_unpacklo_epi64(m3, m5);
3116 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3117 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3118 row4l = _mm_xor_si128(row4l, row1l);
3119 row4h = _mm_xor_si128(row4h, row1h);
3120 row4l = _mm_shuffle_epi8(row4l, r16);
3121 row4h = _mm_shuffle_epi8(row4h, r16);
3122 row3l = _mm_add_epi64(row3l, row4l);
3123 row3h = _mm_add_epi64(row3h, row4h);
3124 row2l = _mm_xor_si128(row2l, row3l);
3125 row2h = _mm_xor_si128(row2h, row3h);
3126 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3127 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3129 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3130 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3131 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3132 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3133 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3134 row4l = t1, row4h = t0;
3136 b0 = _mm_unpacklo_epi64(m3, m7);
3137 b1 = _mm_alignr_epi8(m0, m5, 8);
3139 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3140 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3141 row4l = _mm_xor_si128(row4l, row1l);
3142 row4h = _mm_xor_si128(row4h, row1h);
3143 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3144 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3145 row3l = _mm_add_epi64(row3l, row4l);
3146 row3h = _mm_add_epi64(row3h, row4h);
3147 row2l = _mm_xor_si128(row2l, row3l);
3148 row2h = _mm_xor_si128(row2h, row3h);
3149 row2l = _mm_shuffle_epi8(row2l, r24);
3150 row2h = _mm_shuffle_epi8(row2h, r24);
3152 b0 = _mm_unpackhi_epi64(m7, m4);
3153 b1 = _mm_alignr_epi8(m4, m1, 8);
3155 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3156 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3157 row4l = _mm_xor_si128(row4l, row1l);
3158 row4h = _mm_xor_si128(row4h, row1h);
3159 row4l = _mm_shuffle_epi8(row4l, r16);
3160 row4h = _mm_shuffle_epi8(row4h, r16);
3161 row3l = _mm_add_epi64(row3l, row4l);
3162 row3h = _mm_add_epi64(row3h, row4h);
3163 row2l = _mm_xor_si128(row2l, row3l);
3164 row2h = _mm_xor_si128(row2h, row3h);
3165 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3166 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3168 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3169 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3170 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3171 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3172 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3173 row4l = t1, row4h = t0;
3176 b1 = _mm_alignr_epi8(m5, m0, 8);
3178 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3179 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3180 row4l = _mm_xor_si128(row4l, row1l);
3181 row4h = _mm_xor_si128(row4h, row1h);
3182 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3183 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3184 row3l = _mm_add_epi64(row3l, row4l);
3185 row3h = _mm_add_epi64(row3h, row4h);
3186 row2l = _mm_xor_si128(row2l, row3l);
3187 row2h = _mm_xor_si128(row2h, row3h);
3188 row2l = _mm_shuffle_epi8(row2l, r24);
3189 row2h = _mm_shuffle_epi8(row2h, r24);
3191 b0 = _mm_blend_epi16(m1, m3, 0xF0);
3194 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3195 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3196 row4l = _mm_xor_si128(row4l, row1l);
3197 row4h = _mm_xor_si128(row4h, row1h);
3198 row4l = _mm_shuffle_epi8(row4l, r16);
3199 row4h = _mm_shuffle_epi8(row4h, r16);
3200 row3l = _mm_add_epi64(row3l, row4l);
3201 row3h = _mm_add_epi64(row3h, row4h);
3202 row2l = _mm_xor_si128(row2l, row3l);
3203 row2h = _mm_xor_si128(row2h, row3h);
3204 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3205 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3207 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3208 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3209 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3210 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3211 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3212 row4l = t1, row4h = t0;
3214 b0 = _mm_unpacklo_epi64(m5, m4);
3215 b1 = _mm_unpackhi_epi64(m3, m0);
3217 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3218 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3219 row4l = _mm_xor_si128(row4l, row1l);
3220 row4h = _mm_xor_si128(row4h, row1h);
3221 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3222 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3223 row3l = _mm_add_epi64(row3l, row4l);
3224 row3h = _mm_add_epi64(row3h, row4h);
3225 row2l = _mm_xor_si128(row2l, row3l);
3226 row2h = _mm_xor_si128(row2h, row3h);
3227 row2l = _mm_shuffle_epi8(row2l, r24);
3228 row2h = _mm_shuffle_epi8(row2h, r24);
3230 b0 = _mm_unpacklo_epi64(m1, m2);
3231 b1 = _mm_blend_epi16(m3, m2, 0xF0);
3233 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3234 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3235 row4l = _mm_xor_si128(row4l, row1l);
3236 row4h = _mm_xor_si128(row4h, row1h);
3237 row4l = _mm_shuffle_epi8(row4l, r16);
3238 row4h = _mm_shuffle_epi8(row4h, r16);
3239 row3l = _mm_add_epi64(row3l, row4l);
3240 row3h = _mm_add_epi64(row3h, row4h);
3241 row2l = _mm_xor_si128(row2l, row3l);
3242 row2h = _mm_xor_si128(row2h, row3h);
3243 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3244 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3246 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3247 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3248 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3249 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3250 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3251 row4l = t1, row4h = t0;
3253 b0 = _mm_unpackhi_epi64(m7, m4);
3254 b1 = _mm_unpackhi_epi64(m1, m6);
3256 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3257 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3258 row4l = _mm_xor_si128(row4l, row1l);
3259 row4h = _mm_xor_si128(row4h, row1h);
3260 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3261 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3262 row3l = _mm_add_epi64(row3l, row4l);
3263 row3h = _mm_add_epi64(row3h, row4h);
3264 row2l = _mm_xor_si128(row2l, row3l);
3265 row2h = _mm_xor_si128(row2h, row3h);
3266 row2l = _mm_shuffle_epi8(row2l, r24);
3267 row2h = _mm_shuffle_epi8(row2h, r24);
3269 b0 = _mm_alignr_epi8(m7, m5, 8);
3270 b1 = _mm_unpacklo_epi64(m6, m0);
3272 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3273 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3274 row4l = _mm_xor_si128(row4l, row1l);
3275 row4h = _mm_xor_si128(row4h, row1h);
3276 row4l = _mm_shuffle_epi8(row4l, r16);
3277 row4h = _mm_shuffle_epi8(row4h, r16);
3278 row3l = _mm_add_epi64(row3l, row4l);
3279 row3h = _mm_add_epi64(row3h, row4h);
3280 row2l = _mm_xor_si128(row2l, row3l);
3281 row2h = _mm_xor_si128(row2h, row3h);
3282 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3283 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3285 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3286 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3287 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3288 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3289 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3290 row4l = t1, row4h = t0;
3292 b0 = _mm_unpacklo_epi64(m0, m1);
3293 b1 = _mm_unpacklo_epi64(m2, m3);
3295 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3296 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3297 row4l = _mm_xor_si128(row4l, row1l);
3298 row4h = _mm_xor_si128(row4h, row1h);
3299 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3300 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3301 row3l = _mm_add_epi64(row3l, row4l);
3302 row3h = _mm_add_epi64(row3h, row4h);
3303 row2l = _mm_xor_si128(row2l, row3l);
3304 row2h = _mm_xor_si128(row2h, row3h);
3305 row2l = _mm_shuffle_epi8(row2l, r24);
3306 row2h = _mm_shuffle_epi8(row2h, r24);
3308 b0 = _mm_unpackhi_epi64(m0, m1);
3309 b1 = _mm_unpackhi_epi64(m2, m3);
3311 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3312 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3313 row4l = _mm_xor_si128(row4l, row1l);
3314 row4h = _mm_xor_si128(row4h, row1h);
3315 row4l = _mm_shuffle_epi8(row4l, r16);
3316 row4h = _mm_shuffle_epi8(row4h, r16);
3317 row3l = _mm_add_epi64(row3l, row4l);
3318 row3h = _mm_add_epi64(row3h, row4h);
3319 row2l = _mm_xor_si128(row2l, row3l);
3320 row2h = _mm_xor_si128(row2h, row3h);
3321 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3322 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3324 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3325 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3326 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3327 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3328 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3329 row4l = t1, row4h = t0;
3331 b0 = _mm_unpacklo_epi64(m4, m5);
3332 b1 = _mm_unpacklo_epi64(m6, m7);
3334 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3335 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3336 row4l = _mm_xor_si128(row4l, row1l);
3337 row4h = _mm_xor_si128(row4h, row1h);
3338 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3339 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3340 row3l = _mm_add_epi64(row3l, row4l);
3341 row3h = _mm_add_epi64(row3h, row4h);
3342 row2l = _mm_xor_si128(row2l, row3l);
3343 row2h = _mm_xor_si128(row2h, row3h);
3344 row2l = _mm_shuffle_epi8(row2l, r24);
3345 row2h = _mm_shuffle_epi8(row2h, r24);
3347 b0 = _mm_unpackhi_epi64(m4, m5);
3348 b1 = _mm_unpackhi_epi64(m6, m7);
3350 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3351 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3352 row4l = _mm_xor_si128(row4l, row1l);
3353 row4h = _mm_xor_si128(row4h, row1h);
3354 row4l = _mm_shuffle_epi8(row4l, r16);
3355 row4h = _mm_shuffle_epi8(row4h, r16);
3356 row3l = _mm_add_epi64(row3l, row4l);
3357 row3h = _mm_add_epi64(row3h, row4h);
3358 row2l = _mm_xor_si128(row2l, row3l);
3359 row2h = _mm_xor_si128(row2h, row3h);
3360 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3361 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3363 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3364 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3365 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3366 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3367 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3368 row4l = t1, row4h = t0;
3370 b0 = _mm_unpacklo_epi64(m7, m2);
3371 b1 = _mm_unpackhi_epi64(m4, m6);
3373 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3374 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3375 row4l = _mm_xor_si128(row4l, row1l);
3376 row4h = _mm_xor_si128(row4h, row1h);
3377 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3378 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3379 row3l = _mm_add_epi64(row3l, row4l);
3380 row3h = _mm_add_epi64(row3h, row4h);
3381 row2l = _mm_xor_si128(row2l, row3l);
3382 row2h = _mm_xor_si128(row2h, row3h);
3383 row2l = _mm_shuffle_epi8(row2l, r24);
3384 row2h = _mm_shuffle_epi8(row2h, r24);
3386 b0 = _mm_unpacklo_epi64(m5, m4);
3387 b1 = _mm_alignr_epi8(m3, m7, 8);
3389 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3390 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3391 row4l = _mm_xor_si128(row4l, row1l);
3392 row4h = _mm_xor_si128(row4h, row1h);
3393 row4l = _mm_shuffle_epi8(row4l, r16);
3394 row4h = _mm_shuffle_epi8(row4h, r16);
3395 row3l = _mm_add_epi64(row3l, row4l);
3396 row3h = _mm_add_epi64(row3h, row4h);
3397 row2l = _mm_xor_si128(row2l, row3l);
3398 row2h = _mm_xor_si128(row2h, row3h);
3399 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3400 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3402 t0 = _mm_alignr_epi8(row2h, row2l, 8);
3403 t1 = _mm_alignr_epi8(row2l, row2h, 8);
3404 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3405 t0 = _mm_alignr_epi8(row4h, row4l, 8);
3406 t1 = _mm_alignr_epi8(row4l, row4h, 8);
3407 row4l = t1, row4h = t0;
3409 b0 = _mm_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));
3410 b1 = _mm_unpackhi_epi64(m5, m2);
3412 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3413 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3414 row4l = _mm_xor_si128(row4l, row1l);
3415 row4h = _mm_xor_si128(row4h, row1h);
3416 row4l = _mm_shuffle_epi32(row4l, _MM_SHUFFLE(2,3,0,1));
3417 row4h = _mm_shuffle_epi32(row4h, _MM_SHUFFLE(2,3,0,1));
3418 row3l = _mm_add_epi64(row3l, row4l);
3419 row3h = _mm_add_epi64(row3h, row4h);
3420 row2l = _mm_xor_si128(row2l, row3l);
3421 row2h = _mm_xor_si128(row2h, row3h);
3422 row2l = _mm_shuffle_epi8(row2l, r24);
3423 row2h = _mm_shuffle_epi8(row2h, r24);
3425 b0 = _mm_unpacklo_epi64(m6, m1);
3426 b1 = _mm_unpackhi_epi64(m3, m1);
3428 row1l = _mm_add_epi64(_mm_add_epi64(row1l, b0), row2l);
3429 row1h = _mm_add_epi64(_mm_add_epi64(row1h, b1), row2h);
3430 row4l = _mm_xor_si128(row4l, row1l);
3431 row4h = _mm_xor_si128(row4h, row1h);
3432 row4l = _mm_shuffle_epi8(row4l, r16);
3433 row4h = _mm_shuffle_epi8(row4h, r16);
3434 row3l = _mm_add_epi64(row3l, row4l);
3435 row3h = _mm_add_epi64(row3h, row4h);
3436 row2l = _mm_xor_si128(row2l, row3l);
3437 row2h = _mm_xor_si128(row2h, row3h);
3438 row2l = _mm_xor_si128(_mm_srli_epi64(row2l, 63), _mm_add_epi64(row2l, row2l));
3439 row2h = _mm_xor_si128(_mm_srli_epi64(row2h, 63), _mm_add_epi64(row2h, row2h));
3441 t0 = _mm_alignr_epi8(row2l, row2h, 8);
3442 t1 = _mm_alignr_epi8(row2h, row2l, 8);
3443 row2l = t0, row2h = t1, t0 = row3l, row3l = row3h, row3h = t0;
3444 t0 = _mm_alignr_epi8(row4l, row4h, 8);
3445 t1 = _mm_alignr_epi8(row4h, row4l, 8);
3446 row4l = t1, row4h = t0;
3448 row1l = _mm_xor_si128(row3l, row1l);
3449 row1h = _mm_xor_si128(row3h, row1h);
3450 _mm_storeu_si128((__m128i *)(
void*)(&state.h[0]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[0])), row1l));
3451 _mm_storeu_si128((__m128i *)(
void*)(&state.h[2]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[2])), row1h));
3453 row2l = _mm_xor_si128(row4l, row2l);
3454 row2h = _mm_xor_si128(row4h, row2h);
3455 _mm_storeu_si128((__m128i *)(
void*)(&state.h[4]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[4])), row2l));
3456 _mm_storeu_si128((__m128i *)(
void*)(&state.h[6]), _mm_xor_si128(_mm_loadu_si128((
const __m128i*)(
const void*)(&state.h[6])), row2h));
3458 #endif // CRYPTOPP_BOOL_SSE4_INTRINSICS_AVAILABLE 3460 #if CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE 3463 #define vld1q_u32_rev(x, a,b,c,d) d[1]=c[0],d[2]=b[0],d[3]=a[0]; x = vld1q_u32(d); 3468 static const int LANE_H64 = 1;
3469 static const int LANE_L64 = 0;
// NOTE(review): this region is an extraction artifact — each statement still
// carries its original source-line number prefix and some statements are
// wrapped mid-expression. Only comments were added in this pass; every code
// token is unchanged. The function's signature and closing brace fell into
// extraction gaps, so the surrounding interface could not be verified here.
//
// ARM NEON path of the 32-bit BLAKE2 (BLAKE2s) compression function:
// ten rounds of the G mixing function over a 4x4 state of 32-bit words,
// one row per uint32x4_t. The rotation constants visible below (16, 12,
// 8, 7) are the BLAKE2s G rotations, implemented as shr^shl pairs.
//
// The 16-byte vector loads of state.h/state.t require alignment:
3474 assert(
IsAlignedOn(&state.h[0],GetAlignmentOf<uint32x4_t>()));
3475 assert(
IsAlignedOn(&state.h[4],GetAlignmentOf<uint32x4_t>()));
3476 assert(
IsAlignedOn(&state.t[0],GetAlignmentOf<uint32x4_t>()));
// Message words m0..m15: each is a 4-lane scratch array but only lane 0 is
// filled from the input block; the vld1q_u32_rev macro (defined above this
// function) gathers four of those lane-0 values into one vector, with its
// arguments listed in reverse lane order.
3478 CRYPTOPP_ALIGN_DATA(16) uint32_t m0[4], m1[4], m2[4], m3[4], m4[4], m5[4], m6[4], m7[4];
3479 CRYPTOPP_ALIGN_DATA(16) uint32_t m8[4], m9[4], m10[4], m11[4], m12[4], m13[4], m14[4], m15[4];
3482 get(m0[0])(m1[0])(m2[0])(m3[0])(m4[0])(m5[0])(m6[0])(m7[0])(m8[0])(m9[0])(m10[0])(m11[0])(m12[0])(m13[0])(m14[0])(m15[0]);
3484 uint32x4_t row1,row2,row3,row4;
3485 uint32x4_t buf1,buf2,buf3,buf4;
// Load the working state: rows 1-2 from the chaining value h (also saved in
// ff0/ff1 for the final feed-forward; their declarations sit in an
// extraction gap above), row 3 from the BLAKE2s IV, and row 4 from the IV
// XORed with the four words starting at state.t[0] (presumably the counter
// and finalization flags — the t/f layout is not visible here; confirm).
3488 row1 = ff0 = vld1q_u32((
const uint32_t*)&state.h[0]);
3489 row2 = ff1 = vld1q_u32((
const uint32_t*)&state.h[4]);
3490 row3 = vld1q_u32((
const uint32_t*)&BLAKE2S_IV(0));
3491 row4 = veorq_u32(vld1q_u32((
const uint32_t*)&BLAKE2S_IV(4)), vld1q_u32((
const uint32_t*)&state.t[0]));
// ---- Round 0 ---- (message word selection per this round's permutation)
3494 vld1q_u32_rev(buf1, m6,m4,m2,m0);
3496 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3497 row4 = veorq_u32(row4,row1);
// shr(n) ^ shl(32-n) == rotate right by n; here n = 16.
3498 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3499 row3 = vaddq_u32(row3,row4);
3500 row2 = veorq_u32(row2,row3);
3501 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3504 vld1q_u32_rev(buf2, m7,m5,m3,m1);
3506 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3507 row4 = veorq_u32(row4,row1);
3508 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3509 row3 = vaddq_u32(row3,row4);
3510 row2 = veorq_u32(row2,row3);
3511 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
// Rotate the lanes of row2/row3/row4 by 1/2/3 positions so the next
// half-round mixes the state's diagonals instead of its columns
// (vcombine of swapped halves is a 2-lane rotation).
3513 row4 = vextq_u32(row4,row4,3);
3514 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3515 row2 = vextq_u32(row2,row2,1);
3518 vld1q_u32_rev(buf3, m14,m12,m10,m8);
3520 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3521 row4 = veorq_u32(row4,row1);
3522 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3523 row3 = vaddq_u32(row3,row4);
3524 row2 = veorq_u32(row2,row3);
3525 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3528 vld1q_u32_rev(buf4, m15,m13,m11,m9);
3530 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3531 row4 = veorq_u32(row4,row1);
3532 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3533 row3 = vaddq_u32(row3,row4);
3534 row2 = veorq_u32(row2,row3);
3535 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
// Inverse lane rotation restores column alignment for the next round.
3537 row4 = vextq_u32(row4,row4,1);
3538 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3539 row2 = vextq_u32(row2,row2,3);
// ---- Round 1 ----
3542 vld1q_u32_rev(buf1, m13,m9,m4,m14);
3544 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3545 row4 = veorq_u32(row4,row1);
3546 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3547 row3 = vaddq_u32(row3,row4);
3548 row2 = veorq_u32(row2,row3);
3549 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3552 vld1q_u32_rev(buf2, m6,m15,m8,m10);
3554 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3555 row4 = veorq_u32(row4,row1);
3556 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3557 row3 = vaddq_u32(row3,row4);
3558 row2 = veorq_u32(row2,row3);
3559 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3561 row4 = vextq_u32(row4,row4,3);
3562 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3563 row2 = vextq_u32(row2,row2,1);
3566 vld1q_u32_rev(buf3, m5,m11,m0,m1);
3568 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3569 row4 = veorq_u32(row4,row1);
3570 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3571 row3 = vaddq_u32(row3,row4);
3572 row2 = veorq_u32(row2,row3);
3573 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3576 vld1q_u32_rev(buf4, m3,m7,m2,m12);
3578 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3579 row4 = veorq_u32(row4,row1);
3580 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3581 row3 = vaddq_u32(row3,row4);
3582 row2 = veorq_u32(row2,row3);
3583 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3585 row4 = vextq_u32(row4,row4,1);
3586 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3587 row2 = vextq_u32(row2,row2,3);
// ---- Round 2 ----
3590 vld1q_u32_rev(buf1, m15,m5,m12,m11);
3592 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3593 row4 = veorq_u32(row4,row1);
3594 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3595 row3 = vaddq_u32(row3,row4);
3596 row2 = veorq_u32(row2,row3);
3597 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3600 vld1q_u32_rev(buf2, m13,m2,m0,m8);
3602 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3603 row4 = veorq_u32(row4,row1);
3604 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3605 row3 = vaddq_u32(row3,row4);
3606 row2 = veorq_u32(row2,row3);
3607 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3609 row4 = vextq_u32(row4,row4,3);
3610 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3611 row2 = vextq_u32(row2,row2,1);
3614 vld1q_u32_rev(buf3, m9,m7,m3,m10);
3616 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3617 row4 = veorq_u32(row4,row1);
3618 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3619 row3 = vaddq_u32(row3,row4);
3620 row2 = veorq_u32(row2,row3);
3621 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3624 vld1q_u32_rev(buf4, m4,m1,m6,m14);
3626 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3627 row4 = veorq_u32(row4,row1);
3628 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3629 row3 = vaddq_u32(row3,row4);
3630 row2 = veorq_u32(row2,row3);
3631 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3633 row4 = vextq_u32(row4,row4,1);
3634 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3635 row2 = vextq_u32(row2,row2,3);
// ---- Round 3 ----
3638 vld1q_u32_rev(buf1, m11,m13,m3,m7);
3640 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3641 row4 = veorq_u32(row4,row1);
3642 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3643 row3 = vaddq_u32(row3,row4);
3644 row2 = veorq_u32(row2,row3);
3645 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3648 vld1q_u32_rev(buf2, m14,m12,m1,m9);
3650 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3651 row4 = veorq_u32(row4,row1);
3652 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3653 row3 = vaddq_u32(row3,row4);
3654 row2 = veorq_u32(row2,row3);
3655 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3657 row4 = vextq_u32(row4,row4,3);
3658 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3659 row2 = vextq_u32(row2,row2,1);
3662 vld1q_u32_rev(buf3, m15,m4,m5,m2);
3664 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3665 row4 = veorq_u32(row4,row1);
3666 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3667 row3 = vaddq_u32(row3,row4);
3668 row2 = veorq_u32(row2,row3);
3669 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3672 vld1q_u32_rev(buf4, m8,m0,m10,m6);
3674 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3675 row4 = veorq_u32(row4,row1);
3676 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3677 row3 = vaddq_u32(row3,row4);
3678 row2 = veorq_u32(row2,row3);
3679 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3681 row4 = vextq_u32(row4,row4,1);
3682 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3683 row2 = vextq_u32(row2,row2,3);
// ---- Round 4 ----
3686 vld1q_u32_rev(buf1, m10,m2,m5,m9);
3688 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3689 row4 = veorq_u32(row4,row1);
3690 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3691 row3 = vaddq_u32(row3,row4);
3692 row2 = veorq_u32(row2,row3);
3693 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3696 vld1q_u32_rev(buf2, m15,m4,m7,m0);
3698 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3699 row4 = veorq_u32(row4,row1);
3700 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3701 row3 = vaddq_u32(row3,row4);
3702 row2 = veorq_u32(row2,row3);
3703 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3705 row4 = vextq_u32(row4,row4,3);
3706 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3707 row2 = vextq_u32(row2,row2,1);
3710 vld1q_u32_rev(buf3, m3,m6,m11,m14);
3712 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3713 row4 = veorq_u32(row4,row1);
3714 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3715 row3 = vaddq_u32(row3,row4);
3716 row2 = veorq_u32(row2,row3);
3717 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3720 vld1q_u32_rev(buf4, m13,m8,m12,m1);
3722 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3723 row4 = veorq_u32(row4,row1);
3724 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3725 row3 = vaddq_u32(row3,row4);
3726 row2 = veorq_u32(row2,row3);
3727 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3729 row4 = vextq_u32(row4,row4,1);
3730 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3731 row2 = vextq_u32(row2,row2,3);
// ---- Round 5 ----
3734 vld1q_u32_rev(buf1, m8,m0,m6,m2);
3736 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3737 row4 = veorq_u32(row4,row1);
3738 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3739 row3 = vaddq_u32(row3,row4);
3740 row2 = veorq_u32(row2,row3);
3741 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3744 vld1q_u32_rev(buf2, m3,m11,m10,m12);
3746 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3747 row4 = veorq_u32(row4,row1);
3748 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3749 row3 = vaddq_u32(row3,row4);
3750 row2 = veorq_u32(row2,row3);
3751 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3753 row4 = vextq_u32(row4,row4,3);
3754 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3755 row2 = vextq_u32(row2,row2,1);
3758 vld1q_u32_rev(buf3, m1,m15,m7,m4);
3760 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3761 row4 = veorq_u32(row4,row1);
3762 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3763 row3 = vaddq_u32(row3,row4);
3764 row2 = veorq_u32(row2,row3);
3765 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3768 vld1q_u32_rev(buf4, m9,m14,m5,m13);
3770 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3771 row4 = veorq_u32(row4,row1);
3772 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3773 row3 = vaddq_u32(row3,row4);
3774 row2 = veorq_u32(row2,row3);
3775 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3777 row4 = vextq_u32(row4,row4,1);
3778 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3779 row2 = vextq_u32(row2,row2,3);
// ---- Round 6 ----
3782 vld1q_u32_rev(buf1, m4,m14,m1,m12);
3784 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3785 row4 = veorq_u32(row4,row1);
3786 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3787 row3 = vaddq_u32(row3,row4);
3788 row2 = veorq_u32(row2,row3);
3789 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3792 vld1q_u32_rev(buf2, m10,m13,m15,m5);
3794 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3795 row4 = veorq_u32(row4,row1);
3796 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3797 row3 = vaddq_u32(row3,row4);
3798 row2 = veorq_u32(row2,row3);
3799 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3801 row4 = vextq_u32(row4,row4,3);
3802 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3803 row2 = vextq_u32(row2,row2,1);
3806 vld1q_u32_rev(buf3, m8,m9,m6,m0);
3808 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3809 row4 = veorq_u32(row4,row1);
3810 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3811 row3 = vaddq_u32(row3,row4);
3812 row2 = veorq_u32(row2,row3);
3813 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3816 vld1q_u32_rev(buf4, m11,m2,m3,m7);
3818 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3819 row4 = veorq_u32(row4,row1);
3820 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3821 row3 = vaddq_u32(row3,row4);
3822 row2 = veorq_u32(row2,row3);
3823 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3825 row4 = vextq_u32(row4,row4,1);
3826 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3827 row2 = vextq_u32(row2,row2,3);
// ---- Round 7 ----
3830 vld1q_u32_rev(buf1, m3,m12,m7,m13);
3832 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3833 row4 = veorq_u32(row4,row1);
3834 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3835 row3 = vaddq_u32(row3,row4);
3836 row2 = veorq_u32(row2,row3);
3837 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3840 vld1q_u32_rev(buf2, m9,m1,m14,m11);
3842 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3843 row4 = veorq_u32(row4,row1);
3844 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3845 row3 = vaddq_u32(row3,row4);
3846 row2 = veorq_u32(row2,row3);
3847 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3849 row4 = vextq_u32(row4,row4,3);
3850 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3851 row2 = vextq_u32(row2,row2,1);
3854 vld1q_u32_rev(buf3, m2,m8,m15,m5);
3856 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3857 row4 = veorq_u32(row4,row1);
3858 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3859 row3 = vaddq_u32(row3,row4);
3860 row2 = veorq_u32(row2,row3);
3861 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3864 vld1q_u32_rev(buf4, m10,m6,m4,m0);
3866 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3867 row4 = veorq_u32(row4,row1);
3868 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3869 row3 = vaddq_u32(row3,row4);
3870 row2 = veorq_u32(row2,row3);
3871 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3873 row4 = vextq_u32(row4,row4,1);
3874 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3875 row2 = vextq_u32(row2,row2,3);
// ---- Round 8 ----
3878 vld1q_u32_rev(buf1, m0,m11,m14,m6);
3880 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3881 row4 = veorq_u32(row4,row1);
3882 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3883 row3 = vaddq_u32(row3,row4);
3884 row2 = veorq_u32(row2,row3);
3885 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3888 vld1q_u32_rev(buf2, m8,m3,m9,m15);
3890 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3891 row4 = veorq_u32(row4,row1);
3892 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3893 row3 = vaddq_u32(row3,row4);
3894 row2 = veorq_u32(row2,row3);
3895 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3897 row4 = vextq_u32(row4,row4,3);
3898 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3899 row2 = vextq_u32(row2,row2,1);
3902 vld1q_u32_rev(buf3, m10,m1,m13,m12);
3904 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3905 row4 = veorq_u32(row4,row1);
3906 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3907 row3 = vaddq_u32(row3,row4);
3908 row2 = veorq_u32(row2,row3);
3909 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3912 vld1q_u32_rev(buf4, m5,m4,m7,m2);
3914 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3915 row4 = veorq_u32(row4,row1);
3916 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3917 row3 = vaddq_u32(row3,row4);
3918 row2 = veorq_u32(row2,row3);
3919 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3921 row4 = vextq_u32(row4,row4,1);
3922 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3923 row2 = vextq_u32(row2,row2,3);
// ---- Round 9 ----
3926 vld1q_u32_rev(buf1, m1,m7,m8,m10);
3928 row1 = vaddq_u32(vaddq_u32(row1,buf1),row2);
3929 row4 = veorq_u32(row4,row1);
3930 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3931 row3 = vaddq_u32(row3,row4);
3932 row2 = veorq_u32(row2,row3);
3933 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3936 vld1q_u32_rev(buf2, m5,m6,m4,m2);
3938 row1 = vaddq_u32(vaddq_u32(row1,buf2),row2);
3939 row4 = veorq_u32(row4,row1);
3940 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3941 row3 = vaddq_u32(row3,row4);
3942 row2 = veorq_u32(row2,row3);
3943 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3945 row4 = vextq_u32(row4,row4,3);
3946 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3947 row2 = vextq_u32(row2,row2,1);
3950 vld1q_u32_rev(buf3, m13,m3,m9,m15);
3952 row1 = vaddq_u32(vaddq_u32(row1,buf3),row2);
3953 row4 = veorq_u32(row4,row1);
3954 row4 = veorq_u32(vshrq_n_u32(row4,16),vshlq_n_u32(row4,16));
3955 row3 = vaddq_u32(row3,row4);
3956 row2 = veorq_u32(row2,row3);
3957 row2 = veorq_u32(vshrq_n_u32(row2,12),vshlq_n_u32(row2,20));
3960 vld1q_u32_rev(buf4, m0,m12,m14,m11);
3962 row1 = vaddq_u32(vaddq_u32(row1,buf4),row2);
3963 row4 = veorq_u32(row4,row1);
3964 row4 = veorq_u32(vshrq_n_u32(row4,8),vshlq_n_u32(row4,24));
3965 row3 = vaddq_u32(row3,row4);
3966 row2 = veorq_u32(row2,row3);
3967 row2 = veorq_u32(vshrq_n_u32(row2,7),vshlq_n_u32(row2,25));
3969 row4 = vextq_u32(row4,row4,1);
3970 row3 = vcombine_u32(vget_high_u32(row3),vget_low_u32(row3));
3971 row2 = vextq_u32(row2,row2,3);
// Feed-forward: fold the final state back into the chaining value,
// h[i] ^= row1^row3 (low half) and row2^row4 (high half).
3973 vst1q_u32((uint32_t*)&state.h[0],veorq_u32(ff0,veorq_u32(row1,row3)));
3974 vst1q_u32((uint32_t*)&state.h[4],veorq_u32(ff1,veorq_u32(row2,row4)));
3980 assert(
IsAlignedOn(&state.h[0],GetAlignmentOf<uint64x2_t>()));
3981 assert(
IsAlignedOn(&state.h[4],GetAlignmentOf<uint64x2_t>()));
3982 assert(
IsAlignedOn(&state.t[0],GetAlignmentOf<uint64x2_t>()));
3984 uint64x2_t m0m1,m2m3,m4m5,m6m7,m8m9,m10m11,m12m13,m14m15;
3986 m0m1 = vreinterpretq_u64_u8(vld1q_u8(input+ 0));
3987 m2m3 = vreinterpretq_u64_u8(vld1q_u8(input+ 16));
3988 m4m5 = vreinterpretq_u64_u8(vld1q_u8(input+ 32));
3989 m6m7 = vreinterpretq_u64_u8(vld1q_u8(input+ 48));
3990 m8m9 = vreinterpretq_u64_u8(vld1q_u8(input+ 64));
3991 m10m11 = vreinterpretq_u64_u8(vld1q_u8(input+ 80));
3992 m12m13 = vreinterpretq_u64_u8(vld1q_u8(input+ 96));
3993 m14m15 = vreinterpretq_u64_u8(vld1q_u8(input+112));
3995 uint64x2_t row1l, row1h, row2l, row2h;
3996 uint64x2_t row3l, row3h, row4l, row4h;
3997 uint64x2_t b0 = {0,0}, b1 = {0,0}, t0, t1;
3999 row1l = vld1q_u64((
const uint64_t *)&state.h[0]);
4000 row1h = vld1q_u64((
const uint64_t *)&state.h[2]);
4001 row2l = vld1q_u64((
const uint64_t *)&state.h[4]);
4002 row2h = vld1q_u64((
const uint64_t *)&state.h[6]);
4003 row3l = vld1q_u64((
const uint64_t *)&BLAKE2B_IV(0));
4004 row3h = vld1q_u64((
const uint64_t *)&BLAKE2B_IV(2));
4005 row4l = veorq_u64(vld1q_u64((
const uint64_t *)&BLAKE2B_IV(4)), vld1q_u64((
const uint64_t*)&state.t[0]));
4006 row4h = veorq_u64(vld1q_u64((
const uint64_t *)&BLAKE2B_IV(6)), vld1q_u64((
const uint64_t*)&state.f[0]));
4008 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4009 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4010 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4011 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4012 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4013 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4014 row4l = veorq_u64(row4l, row1l);
4015 row4h = veorq_u64(row4h, row1h);
4016 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4017 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4018 row3l = vaddq_u64(row3l, row4l);
4019 row3h = vaddq_u64(row3h, row4h);
4020 row2l = veorq_u64(row2l, row3l);
4021 row2h = veorq_u64(row2h, row3h);
4022 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4023 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4025 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4026 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4027 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4028 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4029 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4030 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4031 row4l = veorq_u64(row4l, row1l);
4032 row4h = veorq_u64(row4h, row1h);
4033 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4034 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4035 row3l = vaddq_u64(row3l, row4l);
4036 row3h = vaddq_u64(row3h, row4h);
4037 row2l = veorq_u64(row2l, row3l);
4038 row2h = veorq_u64(row2h, row3h);
4039 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4040 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4042 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4043 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4044 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4045 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4046 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4047 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4048 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4049 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4050 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4052 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4053 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4054 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4055 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4056 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4057 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4058 row4l = veorq_u64(row4l, row1l);
4059 row4h = veorq_u64(row4h, row1h);
4060 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4061 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4062 row3l = vaddq_u64(row3l, row4l);
4063 row3h = vaddq_u64(row3h, row4h);
4064 row2l = veorq_u64(row2l, row3l);
4065 row2h = veorq_u64(row2h, row3h);
4066 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4067 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4069 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4070 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4071 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4072 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4073 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4074 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4075 row4l = veorq_u64(row4l, row1l);
4076 row4h = veorq_u64(row4h, row1h);
4077 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4078 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4079 row3l = vaddq_u64(row3l, row4l);
4080 row3h = vaddq_u64(row3h, row4h);
4081 row2l = veorq_u64(row2l, row3l);
4082 row2h = veorq_u64(row2h, row3h);
4083 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4084 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4086 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4087 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4088 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4089 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4090 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4091 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4092 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4093 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4094 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4096 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4097 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4098 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4099 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4100 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4101 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4102 row4l = veorq_u64(row4l, row1l);
4103 row4h = veorq_u64(row4h, row1h);
4104 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4105 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4106 row3l = vaddq_u64(row3l, row4l);
4107 row3h = vaddq_u64(row3h, row4h);
4108 row2l = veorq_u64(row2l, row3l);
4109 row2h = veorq_u64(row2h, row3h);
4110 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4111 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4113 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4114 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4115 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4116 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4117 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4118 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4119 row4l = veorq_u64(row4l, row1l);
4120 row4h = veorq_u64(row4h, row1h);
4121 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4122 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4123 row3l = vaddq_u64(row3l, row4l);
4124 row3h = vaddq_u64(row3h, row4h);
4125 row2l = veorq_u64(row2l, row3l);
4126 row2h = veorq_u64(row2h, row3h);
4127 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4128 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4130 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4131 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4132 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4133 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4134 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4135 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4136 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4137 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4138 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4140 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4141 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4142 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4143 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4144 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4145 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4146 row4l = veorq_u64(row4l, row1l);
4147 row4h = veorq_u64(row4h, row1h);
4148 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4149 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4150 row3l = vaddq_u64(row3l, row4l);
4151 row3h = vaddq_u64(row3h, row4h);
4152 row2l = veorq_u64(row2l, row3l);
4153 row2h = veorq_u64(row2h, row3h);
4154 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4155 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4157 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4158 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4159 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4160 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4161 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4162 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4163 row4l = veorq_u64(row4l, row1l);
4164 row4h = veorq_u64(row4h, row1h);
4165 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4166 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4167 row3l = vaddq_u64(row3l, row4l);
4168 row3h = vaddq_u64(row3h, row4h);
4169 row2l = veorq_u64(row2l, row3l);
4170 row2h = veorq_u64(row2h, row3h);
4171 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4172 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4174 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4175 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4176 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4177 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4178 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4179 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4180 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4181 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4182 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4184 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4185 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4186 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4187 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4188 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4189 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4190 row4l = veorq_u64(row4l, row1l);
4191 row4h = veorq_u64(row4h, row1h);
4192 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4193 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4194 row3l = vaddq_u64(row3l, row4l);
4195 row3h = vaddq_u64(row3h, row4h);
4196 row2l = veorq_u64(row2l, row3l);
4197 row2h = veorq_u64(row2h, row3h);
4198 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4199 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4201 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4202 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
4203 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4204 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4205 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4206 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4207 row4l = veorq_u64(row4l, row1l);
4208 row4h = veorq_u64(row4h, row1h);
4209 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4210 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4211 row3l = vaddq_u64(row3l, row4l);
4212 row3h = vaddq_u64(row3h, row4h);
4213 row2l = veorq_u64(row2l, row3l);
4214 row2h = veorq_u64(row2h, row3h);
4215 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4216 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4218 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4219 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4220 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4221 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4222 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4223 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4224 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4225 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4226 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4228 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4229 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4230 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4231 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4232 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4233 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4234 row4l = veorq_u64(row4l, row1l);
4235 row4h = veorq_u64(row4h, row1h);
4236 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4237 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4238 row3l = vaddq_u64(row3l, row4l);
4239 row3h = vaddq_u64(row3h, row4h);
4240 row2l = veorq_u64(row2l, row3l);
4241 row2h = veorq_u64(row2h, row3h);
4242 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4243 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4245 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4246 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4247 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4248 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4249 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4250 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4251 row4l = veorq_u64(row4l, row1l);
4252 row4h = veorq_u64(row4h, row1h);
4253 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4254 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4255 row3l = vaddq_u64(row3l, row4l);
4256 row3h = vaddq_u64(row3h, row4h);
4257 row2l = veorq_u64(row2l, row3l);
4258 row2h = veorq_u64(row2h, row3h);
4259 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4260 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4262 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4263 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4264 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4265 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4266 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4267 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4268 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4269 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4270 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4272 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4273 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4274 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4275 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4276 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4277 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4278 row4l = veorq_u64(row4l, row1l);
4279 row4h = veorq_u64(row4h, row1h);
4280 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4281 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4282 row3l = vaddq_u64(row3l, row4l);
4283 row3h = vaddq_u64(row3h, row4h);
4284 row2l = veorq_u64(row2l, row3l);
4285 row2h = veorq_u64(row2h, row3h);
4286 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4287 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4289 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4290 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4291 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4292 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4293 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4294 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4295 row4l = veorq_u64(row4l, row1l);
4296 row4h = veorq_u64(row4h, row1h);
4297 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4298 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4299 row3l = vaddq_u64(row3l, row4l);
4300 row3h = vaddq_u64(row3h, row4h);
4301 row2l = veorq_u64(row2l, row3l);
4302 row2h = veorq_u64(row2h, row3h);
4303 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4304 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4306 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4307 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4308 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4309 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4310 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4311 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4312 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4313 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4314 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4316 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4317 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4318 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4319 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4320 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4321 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4322 row4l = veorq_u64(row4l, row1l);
4323 row4h = veorq_u64(row4h, row1h);
4324 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4325 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4326 row3l = vaddq_u64(row3l, row4l);
4327 row3h = vaddq_u64(row3h, row4h);
4328 row2l = veorq_u64(row2l, row3l);
4329 row2h = veorq_u64(row2h, row3h);
4330 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4331 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4333 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4334 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4335 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4336 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4337 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4338 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4339 row4l = veorq_u64(row4l, row1l);
4340 row4h = veorq_u64(row4h, row1h);
4341 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4342 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4343 row3l = vaddq_u64(row3l, row4l);
4344 row3h = vaddq_u64(row3h, row4h);
4345 row2l = veorq_u64(row2l, row3l);
4346 row2h = veorq_u64(row2h, row3h);
4347 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4348 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4350 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4351 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4352 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4353 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4354 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4355 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4356 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4357 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4358 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4360 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4361 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4362 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4363 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4364 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4365 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4366 row4l = veorq_u64(row4l, row1l);
4367 row4h = veorq_u64(row4h, row1h);
4368 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4369 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4370 row3l = vaddq_u64(row3l, row4l);
4371 row3h = vaddq_u64(row3h, row4h);
4372 row2l = veorq_u64(row2l, row3l);
4373 row2h = veorq_u64(row2h, row3h);
4374 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4375 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4377 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4378 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4379 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4380 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4381 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4382 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4383 row4l = veorq_u64(row4l, row1l);
4384 row4h = veorq_u64(row4h, row1h);
4385 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4386 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4387 row3l = vaddq_u64(row3l, row4l);
4388 row3h = vaddq_u64(row3h, row4h);
4389 row2l = veorq_u64(row2l, row3l);
4390 row2h = veorq_u64(row2h, row3h);
4391 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4392 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4394 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4395 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4396 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4397 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4398 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4399 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4400 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4401 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4402 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4404 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4405 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4406 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4407 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4408 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4409 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4410 row4l = veorq_u64(row4l, row1l);
4411 row4h = veorq_u64(row4h, row1h);
4412 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4413 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4414 row3l = vaddq_u64(row3l, row4l);
4415 row3h = vaddq_u64(row3h, row4h);
4416 row2l = veorq_u64(row2l, row3l);
4417 row2h = veorq_u64(row2h, row3h);
4418 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4419 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4421 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4422 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_H64);
4423 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4424 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4425 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4426 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4427 row4l = veorq_u64(row4l, row1l);
4428 row4h = veorq_u64(row4h, row1h);
4429 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4430 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4431 row3l = vaddq_u64(row3l, row4l);
4432 row3h = vaddq_u64(row3h, row4h);
4433 row2l = veorq_u64(row2l, row3l);
4434 row2h = veorq_u64(row2h, row3h);
4435 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4436 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4438 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4439 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4440 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4441 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4442 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4443 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4444 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4445 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4446 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4448 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4449 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4450 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_L64);
4451 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4452 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4453 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4454 row4l = veorq_u64(row4l, row1l);
4455 row4h = veorq_u64(row4h, row1h);
4456 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4457 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4458 row3l = vaddq_u64(row3l, row4l);
4459 row3h = vaddq_u64(row3h, row4h);
4460 row2l = veorq_u64(row2l, row3l);
4461 row2h = veorq_u64(row2h, row3h);
4462 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4463 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4465 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4466 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4467 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4468 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4469 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4470 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4471 row4l = veorq_u64(row4l, row1l);
4472 row4h = veorq_u64(row4h, row1h);
4473 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4474 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4475 row3l = vaddq_u64(row3l, row4l);
4476 row3h = vaddq_u64(row3h, row4h);
4477 row2l = veorq_u64(row2l, row3l);
4478 row2h = veorq_u64(row2h, row3h);
4479 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4480 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4482 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4483 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4484 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4485 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4486 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4487 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4488 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4489 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4490 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4492 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_L64);
4493 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4494 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4495 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4496 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4497 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4498 row4l = veorq_u64(row4l, row1l);
4499 row4h = veorq_u64(row4h, row1h);
4500 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4501 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4502 row3l = vaddq_u64(row3l, row4l);
4503 row3h = vaddq_u64(row3h, row4h);
4504 row2l = veorq_u64(row2l, row3l);
4505 row2h = veorq_u64(row2h, row3h);
4506 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4507 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4509 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4510 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_H64);
4511 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4512 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4513 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4514 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4515 row4l = veorq_u64(row4l, row1l);
4516 row4h = veorq_u64(row4h, row1h);
4517 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4518 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4519 row3l = vaddq_u64(row3l, row4l);
4520 row3h = vaddq_u64(row3h, row4h);
4521 row2l = veorq_u64(row2l, row3l);
4522 row2h = veorq_u64(row2h, row3h);
4523 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4524 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4526 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4527 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4528 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4529 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4530 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4531 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4532 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4533 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4534 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4536 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4537 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_H64);
4538 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_L64);
4539 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_H64);
4540 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4541 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4542 row4l = veorq_u64(row4l, row1l);
4543 row4h = veorq_u64(row4h, row1h);
4544 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4545 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4546 row3l = vaddq_u64(row3l, row4l);
4547 row3h = vaddq_u64(row3h, row4h);
4548 row2l = veorq_u64(row2l, row3l);
4549 row2h = veorq_u64(row2h, row3h);
4550 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4551 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4553 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4554 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4555 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4556 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4557 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4558 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4559 row4l = veorq_u64(row4l, row1l);
4560 row4h = veorq_u64(row4h, row1h);
4561 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4562 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4563 row3l = vaddq_u64(row3l, row4l);
4564 row3h = vaddq_u64(row3h, row4h);
4565 row2l = veorq_u64(row2l, row3l);
4566 row2h = veorq_u64(row2h, row3h);
4567 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4568 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4570 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4571 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4572 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4573 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4574 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4575 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4576 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4577 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4578 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4580 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4581 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_H64);
4582 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4583 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4584 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4585 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4586 row4l = veorq_u64(row4l, row1l);
4587 row4h = veorq_u64(row4h, row1h);
4588 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4589 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4590 row3l = vaddq_u64(row3l, row4l);
4591 row3h = vaddq_u64(row3h, row4h);
4592 row2l = veorq_u64(row2l, row3l);
4593 row2h = veorq_u64(row2h, row3h);
4594 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4595 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4597 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_L64);
4598 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4599 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_L64);
4600 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_H64);
4601 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4602 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4603 row4l = veorq_u64(row4l, row1l);
4604 row4h = veorq_u64(row4h, row1h);
4605 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4606 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4607 row3l = vaddq_u64(row3l, row4l);
4608 row3h = vaddq_u64(row3h, row4h);
4609 row2l = veorq_u64(row2l, row3l);
4610 row2h = veorq_u64(row2h, row3h);
4611 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4612 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4614 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4615 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4616 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4617 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4618 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4619 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4620 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4621 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4622 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4624 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_L64);
4625 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4626 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4627 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
4628 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4629 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4630 row4l = veorq_u64(row4l, row1l);
4631 row4h = veorq_u64(row4h, row1h);
4632 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4633 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4634 row3l = vaddq_u64(row3l, row4l);
4635 row3h = vaddq_u64(row3h, row4h);
4636 row2l = veorq_u64(row2l, row3l);
4637 row2h = veorq_u64(row2h, row3h);
4638 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4639 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4641 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4642 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4643 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4644 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_H64);
4645 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4646 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4647 row4l = veorq_u64(row4l, row1l);
4648 row4h = veorq_u64(row4h, row1h);
4649 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4650 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4651 row3l = vaddq_u64(row3l, row4l);
4652 row3h = vaddq_u64(row3h, row4h);
4653 row2l = veorq_u64(row2l, row3l);
4654 row2h = veorq_u64(row2h, row3h);
4655 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4656 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4658 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4659 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4660 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4661 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4662 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4663 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4664 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4665 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4666 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4668 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b0,LANE_L64);
4669 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_H64);
4670 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_L64);
4671 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b1,LANE_H64);
4672 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4673 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4674 row4l = veorq_u64(row4l, row1l);
4675 row4h = veorq_u64(row4h, row1h);
4676 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4677 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4678 row3l = vaddq_u64(row3l, row4l);
4679 row3h = vaddq_u64(row3h, row4h);
4680 row2l = veorq_u64(row2l, row3l);
4681 row2h = veorq_u64(row2h, row3h);
4682 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4683 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4685 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4686 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4687 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4688 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4689 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4690 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4691 row4l = veorq_u64(row4l, row1l);
4692 row4h = veorq_u64(row4h, row1h);
4693 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4694 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4695 row3l = vaddq_u64(row3l, row4l);
4696 row3h = vaddq_u64(row3h, row4h);
4697 row2l = veorq_u64(row2l, row3l);
4698 row2h = veorq_u64(row2h, row3h);
4699 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4700 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4702 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4703 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4704 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4705 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4706 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4707 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4708 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4709 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4710 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4712 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b0,LANE_L64);
4713 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4714 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
4715 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4716 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4717 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4718 row4l = veorq_u64(row4l, row1l);
4719 row4h = veorq_u64(row4h, row1h);
4720 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4721 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4722 row3l = vaddq_u64(row3l, row4l);
4723 row3h = vaddq_u64(row3h, row4h);
4724 row2l = veorq_u64(row2l, row3l);
4725 row2h = veorq_u64(row2h, row3h);
4726 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4727 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4729 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4730 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4731 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4732 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b1,LANE_H64);
4733 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4734 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4735 row4l = veorq_u64(row4l, row1l);
4736 row4h = veorq_u64(row4h, row1h);
4737 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4738 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4739 row3l = vaddq_u64(row3l, row4l);
4740 row3h = vaddq_u64(row3h, row4h);
4741 row2l = veorq_u64(row2l, row3l);
4742 row2h = veorq_u64(row2h, row3h);
4743 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4744 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4746 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4747 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4748 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4749 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4750 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4751 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4752 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4753 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4754 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4756 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
4757 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b0,LANE_H64);
4758 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_L64);
4759 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b1,LANE_H64);
4760 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4761 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4762 row4l = veorq_u64(row4l, row1l);
4763 row4h = veorq_u64(row4h, row1h);
4764 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4765 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4766 row3l = vaddq_u64(row3l, row4l);
4767 row3h = vaddq_u64(row3h, row4h);
4768 row2l = veorq_u64(row2l, row3l);
4769 row2h = veorq_u64(row2h, row3h);
4770 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4771 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4773 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4774 b0 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b0,LANE_H64);
4775 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4776 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4777 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4778 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4779 row4l = veorq_u64(row4l, row1l);
4780 row4h = veorq_u64(row4h, row1h);
4781 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4782 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4783 row3l = vaddq_u64(row3l, row4l);
4784 row3h = vaddq_u64(row3h, row4h);
4785 row2l = veorq_u64(row2l, row3l);
4786 row2h = veorq_u64(row2h, row3h);
4787 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4788 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4790 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4791 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4792 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4793 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4794 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4795 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4796 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4797 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4798 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4800 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4801 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4802 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
4803 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b1,LANE_H64);
4804 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4805 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4806 row4l = veorq_u64(row4l, row1l);
4807 row4h = veorq_u64(row4h, row1h);
4808 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4809 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4810 row3l = vaddq_u64(row3l, row4l);
4811 row3h = vaddq_u64(row3h, row4h);
4812 row2l = veorq_u64(row2l, row3l);
4813 row2h = veorq_u64(row2h, row3h);
4814 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4815 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4817 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_L64);
4818 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4819 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_L64);
4820 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
4821 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4822 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4823 row4l = veorq_u64(row4l, row1l);
4824 row4h = veorq_u64(row4h, row1h);
4825 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4826 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4827 row3l = vaddq_u64(row3l, row4l);
4828 row3h = vaddq_u64(row3h, row4h);
4829 row2l = veorq_u64(row2l, row3l);
4830 row2h = veorq_u64(row2h, row3h);
4831 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4832 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4834 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4835 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4836 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4837 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4838 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4839 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4840 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4841 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4842 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4844 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b0,LANE_L64);
4845 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_H64);
4846 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_L64);
4847 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4848 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4849 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4850 row4l = veorq_u64(row4l, row1l);
4851 row4h = veorq_u64(row4h, row1h);
4852 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4853 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4854 row3l = vaddq_u64(row3l, row4l);
4855 row3h = vaddq_u64(row3h, row4h);
4856 row2l = veorq_u64(row2l, row3l);
4857 row2h = veorq_u64(row2h, row3h);
4858 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4859 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4861 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_L64);
4862 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_H64);
4863 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4864 b1 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b1,LANE_H64);
4865 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4866 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4867 row4l = veorq_u64(row4l, row1l);
4868 row4h = veorq_u64(row4h, row1h);
4869 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4870 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4871 row3l = vaddq_u64(row3l, row4l);
4872 row3h = vaddq_u64(row3h, row4h);
4873 row2l = veorq_u64(row2l, row3l);
4874 row2h = veorq_u64(row2h, row3h);
4875 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4876 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4878 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4879 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4880 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4881 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4882 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4883 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4884 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4885 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4886 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4888 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_L64);
4889 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
4890 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b1,LANE_L64);
4891 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4892 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4893 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4894 row4l = veorq_u64(row4l, row1l);
4895 row4h = veorq_u64(row4h, row1h);
4896 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4897 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4898 row3l = vaddq_u64(row3l, row4l);
4899 row3h = vaddq_u64(row3h, row4h);
4900 row2l = veorq_u64(row2l, row3l);
4901 row2h = veorq_u64(row2h, row3h);
4902 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4903 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4905 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
4906 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b0,LANE_H64);
4907 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_L64);
4908 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_H64);
4909 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4910 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4911 row4l = veorq_u64(row4l, row1l);
4912 row4h = veorq_u64(row4h, row1h);
4913 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4914 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4915 row3l = vaddq_u64(row3l, row4l);
4916 row3h = vaddq_u64(row3h, row4h);
4917 row2l = veorq_u64(row2l, row3l);
4918 row2h = veorq_u64(row2h, row3h);
4919 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4920 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4922 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
4923 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
4924 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
4925 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
4926 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
4927 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
4928 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
4929 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
4930 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
4932 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_L64);
4933 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_H64);
4934 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b1,LANE_L64);
4935 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b1,LANE_H64);
4936 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4937 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4938 row4l = veorq_u64(row4l, row1l);
4939 row4h = veorq_u64(row4h, row1h);
4940 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4941 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4942 row3l = vaddq_u64(row3l, row4l);
4943 row3h = vaddq_u64(row3h, row4h);
4944 row2l = veorq_u64(row2l, row3l);
4945 row2h = veorq_u64(row2h, row3h);
4946 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4947 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4949 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b0,LANE_L64);
4950 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b0,LANE_H64);
4951 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_L64);
4952 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_H64);
4953 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4954 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4955 row4l = veorq_u64(row4l, row1l);
4956 row4h = veorq_u64(row4h, row1h);
4957 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
4958 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
4959 row3l = vaddq_u64(row3l, row4l);
4960 row3h = vaddq_u64(row3h, row4h);
4961 row2l = veorq_u64(row2l, row3l);
4962 row2h = veorq_u64(row2h, row3h);
4963 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
4964 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
4966 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
4967 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
4968 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
4969 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
4970 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
4971 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
4972 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
4973 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
4974 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
4976 b0 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_L64),b0,LANE_L64);
4977 b0 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_L64),b0,LANE_H64);
4978 b1 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_H64),b1,LANE_L64);
4979 b1 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_H64),b1,LANE_H64);
4980 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4981 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4982 row4l = veorq_u64(row4l, row1l);
4983 row4h = veorq_u64(row4h, row1h);
4984 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
4985 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
4986 row3l = vaddq_u64(row3l, row4l);
4987 row3h = vaddq_u64(row3h, row4h);
4988 row2l = veorq_u64(row2l, row3l);
4989 row2h = veorq_u64(row2h, row3h);
4990 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
4991 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
4993 b0 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_L64),b0,LANE_L64);
4994 b0 = vsetq_lane_u64(vgetq_lane_u64(m8m9,LANE_L64),b0,LANE_H64);
4995 b1 = vsetq_lane_u64(vgetq_lane_u64(m14m15,LANE_H64),b1,LANE_L64);
4996 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_L64),b1,LANE_H64);
4997 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
4998 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
4999 row4l = veorq_u64(row4l, row1l);
5000 row4h = veorq_u64(row4h, row1h);
5001 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5002 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5003 row3l = vaddq_u64(row3l, row4l);
5004 row3h = vaddq_u64(row3h, row4h);
5005 row2l = veorq_u64(row2l, row3l);
5006 row2h = veorq_u64(row2h, row3h);
5007 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5008 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5010 t0 = row4l, t1 = row2l, row4l = row3l, row3l = row3h, row3h = row4l;
5011 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4l,LANE_L64);
5012 row4l = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_L64),row4l,LANE_H64);
5013 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4h,LANE_H64);
5014 row4h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row4h,LANE_L64);
5015 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_H64),row2l,LANE_L64);
5016 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2l,LANE_H64);
5017 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2h,LANE_L64);
5018 row2h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row2h,LANE_H64);
5020 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_H64),b0,LANE_L64);
5021 b0 = vsetq_lane_u64(vgetq_lane_u64(m0m1,LANE_L64),b0,LANE_H64);
5022 b1 = vsetq_lane_u64(vgetq_lane_u64(m10m11,LANE_H64),b1,LANE_L64);
5023 b1 = vsetq_lane_u64(vgetq_lane_u64(m4m5,LANE_H64),b1,LANE_H64);
5024 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5025 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5026 row4l = veorq_u64(row4l, row1l);
5027 row4h = veorq_u64(row4h, row1h);
5028 row4l = veorq_u64(vshrq_n_u64(row4l,32),vshlq_n_u64(row4l,32));
5029 row4h = veorq_u64(vshrq_n_u64(row4h,32),vshlq_n_u64(row4h,32));
5030 row3l = vaddq_u64(row3l, row4l);
5031 row3h = vaddq_u64(row3h, row4h);
5032 row2l = veorq_u64(row2l, row3l);
5033 row2h = veorq_u64(row2h, row3h);
5034 row2l = veorq_u64(vshrq_n_u64(row2l,24),vshlq_n_u64(row2l,40));
5035 row2h = veorq_u64(vshrq_n_u64(row2h,24),vshlq_n_u64(row2h,40));
5037 b0 = vsetq_lane_u64(vgetq_lane_u64(m12m13,LANE_L64),b0,LANE_L64);
5038 b0 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_L64),b0,LANE_H64);
5039 b1 = vsetq_lane_u64(vgetq_lane_u64(m6m7,LANE_H64),b1,LANE_L64);
5040 b1 = vsetq_lane_u64(vgetq_lane_u64(m2m3,LANE_H64),b1,LANE_H64);
5041 row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l);
5042 row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h);
5043 row4l = veorq_u64(row4l, row1l);
5044 row4h = veorq_u64(row4h, row1h);
5045 row4l = veorq_u64(vshrq_n_u64(row4l,16),vshlq_n_u64(row4l,48));
5046 row4h = veorq_u64(vshrq_n_u64(row4h,16),vshlq_n_u64(row4h,48));
5047 row3l = vaddq_u64(row3l, row4l);
5048 row3h = vaddq_u64(row3h, row4h);
5049 row2l = veorq_u64(row2l, row3l);
5050 row2h = veorq_u64(row2h, row3h);
5051 row2l = veorq_u64(vshrq_n_u64(row2l,63),vshlq_n_u64(row2l,1));
5052 row2h = veorq_u64(vshrq_n_u64(row2h,63),vshlq_n_u64(row2h,1));
5054 t0 = row3l, row3l = row3h, row3h = t0, t0 = row2l, t1 = row4l;
5055 row2l = vsetq_lane_u64(vgetq_lane_u64(row2l,LANE_L64),row2l,LANE_H64);
5056 row2l = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_H64),row2l,LANE_L64);
5057 row2h = vsetq_lane_u64(vgetq_lane_u64(row2h,LANE_L64),row2h,LANE_H64);
5058 row2h = vsetq_lane_u64(vgetq_lane_u64(t0,LANE_H64),row2h,LANE_L64);
5059 row4l = vsetq_lane_u64(vgetq_lane_u64(row4l,LANE_H64),row4l,LANE_L64);
5060 row4l = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_L64),row4l,LANE_H64);
5061 row4h = vsetq_lane_u64(vgetq_lane_u64(row4h,LANE_H64),row4h,LANE_L64);
5062 row4h = vsetq_lane_u64(vgetq_lane_u64(t1,LANE_L64),row4h,LANE_H64);
5064 row1l = veorq_u64(row3l, row1l);
5065 row1h = veorq_u64(row3h, row1h);
5066 vst1q_u64((uint64_t*)&state.h[0], veorq_u64(vld1q_u64((
const uint64_t*)&state.h[0]), row1l));
5067 vst1q_u64((uint64_t*)&state.h[2], veorq_u64(vld1q_u64((
const uint64_t*)&state.h[2]), row1h));
5069 row2l = veorq_u64(row4l, row2l);
5070 row2h = veorq_u64(row4h, row2h);
5071 vst1q_u64((uint64_t*)&state.h[4], veorq_u64(vld1q_u64((
const uint64_t*)&state.h[4]), row2l));
5072 vst1q_u64((uint64_t*)&state.h[6], veorq_u64(vld1q_u64((
const uint64_t*)&state.h[6]), row2h));
5074 #endif // CRYPTOPP_BOOL_NEON_INTRINSICS_AVAILABLE
Used to pass byte array input as part of a NameValuePairs object.
void Restart()
Restart the hash.
Standard names for retrieving values by name when working with NameValuePairs.
const char * DigestSize()
int, in bytes
Classes for working with NameValuePairs.
bool HasSSE4()
Determines SSE4 availability.
BLAKE2 hash implementation.
void TruncatedFinal(byte *hash, size_t size)
Computes the hash of the current message.
Abstract base classes that provide a uniform interface to this library.
void memcpy_s(void *dest, size_t sizeInBytes, const void *src, size_t count)
Bounds checking replacement for memcpy()
size_t size() const
Length of the memory block.
Library configuration file.
const byte * begin() const
Pointer to the first byte in the memory block.
bool IsAlignedOn(const void *ptr, unsigned int alignment)
Determines whether ptr is aligned to a minimum value.
AlgorithmParameters MakeParameters(const char *name, const T &value, bool throwIfNotUsed=true)
Create an object that implements NameValuePairs.
T ConditionalByteReverse(ByteOrder order, T value)
Reverses bytes in a value depending upon endianess.
const char * Salt()
ConstByteArrayParameter.
const NameValuePairs & g_nullNameValuePairs
An empty set of name-value pairs.
#define COUNTOF(arr)
Counts elements in an array.
SecBlock using AllocatorWithCleanup<byte, true> typedef.
Classes for BLAKE2b and BLAKE2s message digests and keyed message digests.
const char * Personalization()
ConstByteArrayParameter.
Functions for CPU features and intrinsics.
BLAKE2 state information.
bool HasSSE2()
Determines SSE2 availability.
Access a block of memory.
Access a block of memory.
Crypto++ library namespace.
void Update(const byte *input, size_t length)
Updates a hash with additional input.