/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * ROT8 / ROT16 are pshufb byte-shuffle masks that rotate every 32-bit lane
 * left by 8 and 16 bits respectively. CTRINC holds the 32-bit values
 * 0, 1, 2, 3 (lowest lane first) that are added to the block counter word
 * (x12) so that four consecutive blocks can be processed at once.
 */
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.  The loop subtracts 2 per double round and
 * only exits when the count reaches exactly zero, so the count must be a
 * positive even number.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

/*
 * chacha_block_xor_ssse3 - XOR up to one 64-byte block with ChaCha keystream
 *
 * Loads the 64-byte state from (%rdi), keeps a copy in %xmm8-%xmm11, runs
 * chacha_permute, adds the saved state back row by row and XORs the result
 * with the input in 16-byte chunks.  A trailing chunk shorter than 16 bytes is
 * handled in .Lxorpart through a small bounce buffer on the stack.
 *
 * Clobbers: %rax, %r8d, %xmm0-%xmm11; on the partial-chunk path additionally
 *	     %rcx, %rsi, %rdi, %r9, %r10, %r11
 */
SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	# keep the length in %rax, %rcx is needed for rep movsb later
	mov		%rcx,%rax
	call		chacha_permute

	# The length is a byte count, hence the unsigned compares (jb) below.
	# On every branch to .Lxorpart, %xmm0 holds the keystream for the
	# chunk that could not be processed as a full 16 bytes.

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
	# %r9 = number of tail bytes (length mod 16), nothing to do when zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	# %rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11
	mov		%rsi,%r11

	# remember the stack pointer in %r10 and carve out an aligned
	# 16-byte bounce buffer
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the tail bytes of the input into the bounce buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream; only the first %r9 bytes
	# are copied out again below
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the stack pointer saved above
	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

/*
 * hchacha_block_ssse3 - run the ChaCha permutation and emit rows 0 and 3
 *
 * Loads the 64-byte state from (%rdi), permutes it with the round count taken
 * from %edx and stores %xmm0 (first row) followed by %xmm3 (last row) to the
 * 32-byte output at (%rsi).  The input state is not added back.
 *
 * Clobbers: %r8d, %xmm0-%xmm7
 */
SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	# chacha_permute expects the round count in %r8d
	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

/*
 * chacha_4block_xor_ssse3 - XOR up to four 64-byte blocks with ChaCha keystream
 *
 * Stack use: 0x80 bytes are reserved below the incoming stack pointer and the
 * result is aligned down to 64 bytes; offsets 0x00-0x3f hold x0..x3.  The
 * original stack pointer is kept in %r10 (as %rsp + 8) and restored at
 * .Ldone4.
 *
 * Clobbers: %rax, %r8d, %r10, %xmm0-%xmm15; on the partial-chunk path
 *	     additionally %rcx, %rsi, %rdi, %r9, %r11
 */
SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling, while
	# 7/12-bit word rotation uses traditional shift+OR.

	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	# keep the length in %rax, %rcx is needed for rep movsb later
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	# During the rounds: %xmm0 is scratch, %xmm1 = CTRINC,
	# %xmm2 = ROT8 mask, %xmm3 = ROT16 mask.
	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# The rounds are done: %xmm2/%xmm3 no longer need to hold the rotate
	# masks and are reused as scratch for re-broadcasting the input state.

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	# (%xmm1 still holds CTRINC, it was not written during the rounds)
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	#
	# Each step moves the next 16 keystream bytes into %xmm0 before the
	# length check, so that .Lxorpart4 finds the keystream for a partial
	# chunk in %xmm0.  The length is a byte count, hence the unsigned
	# compares (jb).
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jb		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jb		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jb		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jb		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jb		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jb		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	# restore the stack pointer saved on entry
	lea		-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# %r9 = number of tail bytes (length mod 16), nothing to do when zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	# %rax = offset of the partial chunk (length rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11
	mov		%rsi,%r11

	# copy the tail bytes of the input into the stack slot at 0x00(%rsp),
	# which is reused here as a bounce buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole slot with the keystream; only the first %r9 bytes are
	# copied out again below
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)