/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

# vpshufb control mask: rotate every 32-bit word left by 8 bits
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

# vpshufb control mask: rotate every 32-bit word left by 16 bits
.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

# per-lane counter increments 0..7, added to x12 by the 8-block function
.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

# added to the s3 row: +0 in the low 128-bit lane, +1 in the high lane
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

# added to the s3 row: +2 in the low 128-bit lane, +3 in the high lane
.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
	#
	# Register use:
	#   %ymm0..3   working rows x0..x3 (low lane: 1st block, high: 2nd)
	#   %ymm4/5    ROT8 / ROT16 byte shuffle masks
	#   %ymm6/7    scratch
	#   %ymm8..11  copy of the initial rows, added back after the rounds
	#   %rax       byte count, kept for the output length checks
	# Output is produced in 16-byte pieces; a trailing piece shorter than
	# 16 bytes is bounced through a stack buffer at .Lxorpart2, which also
	# overwrites %rcx, %rsi, %rdi and %r9..%r11.

	vzeroupper

	# x0..3[0-2] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	mov		%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	# Each step below leaves the current 16 bytes of keystream in %xmm7
	# before testing the length, so that .Lxorpart2 can use it for a
	# trailing partial piece. The length is a byte count, hence the
	# unsigned "jb".

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jb		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jb		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jb		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jb		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jb		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jb		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jb		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jb		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes, %rax = offset of the partial piece
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone2
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, keep the output pointer in %r11
	mov		%rsi,%r11

	# carve out an aligned scratch buffer; %r10 remembers the old %rsp
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the pending keystream in %xmm7
	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	# copy only the trailing bytes back out to the destination
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. The
	# required word shuffling has a rather high latency, we can do the
	# arithmetic on two matrix-pairs without much slowdown.
	#
	# Register use:
	#   %ymm0..3    working rows x0..x3 of blocks 1 (low lane) and 2 (high)
	#   %ymm4..7    working rows x0..x3 of blocks 3 (low lane) and 4 (high)
	#   %ymm8/9     ROT8 / ROT16 byte shuffle masks (%ymm9 becomes output
	#               scratch once the rounds are done)
	#   %ymm10      scratch
	#   %ymm11..13  initial rows s0..s2, shared by both block pairs
	#   %ymm14/15   initial s3 row of the first / second block pair
	#   %rax        byte count, kept for the output length checks
	# Output is produced in 16-byte pieces; a trailing piece shorter than
	# 16 bytes is bounced through a stack buffer at .Lxorpart4, which also
	# overwrites %rcx, %rsi, %rdi and %r9..%r11.

	vzeroupper

	# x0..3[0-4] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Each step below leaves the current 16 bytes of keystream in %xmm10
	# before testing the length, so that .Lxorpart4 can use it for a
	# trailing partial piece. The length is a byte count, hence the
	# unsigned "jb".

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jb		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jb		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jb		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jb		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jb		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jb		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jb		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jb		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jb		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jb		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes, %rax = offset of the partial piece
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, keep the output pointer in %r11
	mov		%rsi,%r11

	# carve out an aligned scratch buffer; %r10 remembers the old %rsp
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the pending keystream in %xmm10
	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	# copy only the trailing bytes back out to the destination
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data
blocks input, i 5388c2ecf20Sopenharmony_ci # %rcx: input/output length in bytes 5398c2ecf20Sopenharmony_ci # %r8d: nrounds 5408c2ecf20Sopenharmony_ci 5418c2ecf20Sopenharmony_ci # This function encrypts eight consecutive ChaCha blocks by loading 5428c2ecf20Sopenharmony_ci # the state matrix in AVX registers eight times. As we need some 5438c2ecf20Sopenharmony_ci # scratch registers, we save the first four registers on the stack. The 5448c2ecf20Sopenharmony_ci # algorithm performs each operation on the corresponding word of each 5458c2ecf20Sopenharmony_ci # state matrix, hence requires no word shuffling. For final XORing step 5468c2ecf20Sopenharmony_ci # we transpose the matrix by interleaving 32-, 64- and then 128-bit 5478c2ecf20Sopenharmony_ci # words, which allows us to do XOR in AVX registers. 8/16-bit word 5488c2ecf20Sopenharmony_ci # rotation is done with the slightly better performing byte shuffling, 5498c2ecf20Sopenharmony_ci # 7/12-bit word rotation uses traditional shift+OR. 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci vzeroupper 5528c2ecf20Sopenharmony_ci # 4 * 32 byte stack, 32-byte aligned 5538c2ecf20Sopenharmony_ci lea 8(%rsp),%r10 5548c2ecf20Sopenharmony_ci and $~31, %rsp 5558c2ecf20Sopenharmony_ci sub $0x80, %rsp 5568c2ecf20Sopenharmony_ci mov %rcx,%rax 5578c2ecf20Sopenharmony_ci 5588c2ecf20Sopenharmony_ci # x0..15[0-7] = s[0..15] 5598c2ecf20Sopenharmony_ci vpbroadcastd 0x00(%rdi),%ymm0 5608c2ecf20Sopenharmony_ci vpbroadcastd 0x04(%rdi),%ymm1 5618c2ecf20Sopenharmony_ci vpbroadcastd 0x08(%rdi),%ymm2 5628c2ecf20Sopenharmony_ci vpbroadcastd 0x0c(%rdi),%ymm3 5638c2ecf20Sopenharmony_ci vpbroadcastd 0x10(%rdi),%ymm4 5648c2ecf20Sopenharmony_ci vpbroadcastd 0x14(%rdi),%ymm5 5658c2ecf20Sopenharmony_ci vpbroadcastd 0x18(%rdi),%ymm6 5668c2ecf20Sopenharmony_ci vpbroadcastd 0x1c(%rdi),%ymm7 5678c2ecf20Sopenharmony_ci vpbroadcastd 0x20(%rdi),%ymm8 5688c2ecf20Sopenharmony_ci vpbroadcastd 0x24(%rdi),%ymm9 5698c2ecf20Sopenharmony_ci vpbroadcastd 
0x28(%rdi),%ymm10 5708c2ecf20Sopenharmony_ci vpbroadcastd 0x2c(%rdi),%ymm11 5718c2ecf20Sopenharmony_ci vpbroadcastd 0x30(%rdi),%ymm12 5728c2ecf20Sopenharmony_ci vpbroadcastd 0x34(%rdi),%ymm13 5738c2ecf20Sopenharmony_ci vpbroadcastd 0x38(%rdi),%ymm14 5748c2ecf20Sopenharmony_ci vpbroadcastd 0x3c(%rdi),%ymm15 5758c2ecf20Sopenharmony_ci # x0..3 on stack 5768c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 5778c2ecf20Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 5788c2ecf20Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 5798c2ecf20Sopenharmony_ci vmovdqa %ymm3,0x60(%rsp) 5808c2ecf20Sopenharmony_ci 5818c2ecf20Sopenharmony_ci vmovdqa CTRINC(%rip),%ymm1 5828c2ecf20Sopenharmony_ci vmovdqa ROT8(%rip),%ymm2 5838c2ecf20Sopenharmony_ci vmovdqa ROT16(%rip),%ymm3 5848c2ecf20Sopenharmony_ci 5858c2ecf20Sopenharmony_ci # x12 += counter values 0-3 5868c2ecf20Sopenharmony_ci vpaddd %ymm1,%ymm12,%ymm12 5878c2ecf20Sopenharmony_ci 5888c2ecf20Sopenharmony_ci.Ldoubleround8: 5898c2ecf20Sopenharmony_ci # x0 += x4, x12 = rotl32(x12 ^ x0, 16) 5908c2ecf20Sopenharmony_ci vpaddd 0x00(%rsp),%ymm4,%ymm0 5918c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 5928c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 5938c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm12,%ymm12 5948c2ecf20Sopenharmony_ci # x1 += x5, x13 = rotl32(x13 ^ x1, 16) 5958c2ecf20Sopenharmony_ci vpaddd 0x20(%rsp),%ymm5,%ymm0 5968c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 5978c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 5988c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm13,%ymm13 5998c2ecf20Sopenharmony_ci # x2 += x6, x14 = rotl32(x14 ^ x2, 16) 6008c2ecf20Sopenharmony_ci vpaddd 0x40(%rsp),%ymm6,%ymm0 6018c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 6028c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 6038c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm14,%ymm14 6048c2ecf20Sopenharmony_ci # x3 += x7, x15 = rotl32(x15 ^ x3, 16) 6058c2ecf20Sopenharmony_ci vpaddd 0x60(%rsp),%ymm7,%ymm0 6068c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 
6078c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 6088c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm15,%ymm15 6098c2ecf20Sopenharmony_ci 6108c2ecf20Sopenharmony_ci # x8 += x12, x4 = rotl32(x4 ^ x8, 12) 6118c2ecf20Sopenharmony_ci vpaddd %ymm12,%ymm8,%ymm8 6128c2ecf20Sopenharmony_ci vpxor %ymm8,%ymm4,%ymm4 6138c2ecf20Sopenharmony_ci vpslld $12,%ymm4,%ymm0 6148c2ecf20Sopenharmony_ci vpsrld $20,%ymm4,%ymm4 6158c2ecf20Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 6168c2ecf20Sopenharmony_ci # x9 += x13, x5 = rotl32(x5 ^ x9, 12) 6178c2ecf20Sopenharmony_ci vpaddd %ymm13,%ymm9,%ymm9 6188c2ecf20Sopenharmony_ci vpxor %ymm9,%ymm5,%ymm5 6198c2ecf20Sopenharmony_ci vpslld $12,%ymm5,%ymm0 6208c2ecf20Sopenharmony_ci vpsrld $20,%ymm5,%ymm5 6218c2ecf20Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 6228c2ecf20Sopenharmony_ci # x10 += x14, x6 = rotl32(x6 ^ x10, 12) 6238c2ecf20Sopenharmony_ci vpaddd %ymm14,%ymm10,%ymm10 6248c2ecf20Sopenharmony_ci vpxor %ymm10,%ymm6,%ymm6 6258c2ecf20Sopenharmony_ci vpslld $12,%ymm6,%ymm0 6268c2ecf20Sopenharmony_ci vpsrld $20,%ymm6,%ymm6 6278c2ecf20Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 6288c2ecf20Sopenharmony_ci # x11 += x15, x7 = rotl32(x7 ^ x11, 12) 6298c2ecf20Sopenharmony_ci vpaddd %ymm15,%ymm11,%ymm11 6308c2ecf20Sopenharmony_ci vpxor %ymm11,%ymm7,%ymm7 6318c2ecf20Sopenharmony_ci vpslld $12,%ymm7,%ymm0 6328c2ecf20Sopenharmony_ci vpsrld $20,%ymm7,%ymm7 6338c2ecf20Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 6348c2ecf20Sopenharmony_ci 6358c2ecf20Sopenharmony_ci # x0 += x4, x12 = rotl32(x12 ^ x0, 8) 6368c2ecf20Sopenharmony_ci vpaddd 0x00(%rsp),%ymm4,%ymm0 6378c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 6388c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 6398c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm12,%ymm12 6408c2ecf20Sopenharmony_ci # x1 += x5, x13 = rotl32(x13 ^ x1, 8) 6418c2ecf20Sopenharmony_ci vpaddd 0x20(%rsp),%ymm5,%ymm0 6428c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 6438c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 6448c2ecf20Sopenharmony_ci vpshufb 
%ymm2,%ymm13,%ymm13 6458c2ecf20Sopenharmony_ci # x2 += x6, x14 = rotl32(x14 ^ x2, 8) 6468c2ecf20Sopenharmony_ci vpaddd 0x40(%rsp),%ymm6,%ymm0 6478c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 6488c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 6498c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm14,%ymm14 6508c2ecf20Sopenharmony_ci # x3 += x7, x15 = rotl32(x15 ^ x3, 8) 6518c2ecf20Sopenharmony_ci vpaddd 0x60(%rsp),%ymm7,%ymm0 6528c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 6538c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 6548c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm15,%ymm15 6558c2ecf20Sopenharmony_ci 6568c2ecf20Sopenharmony_ci # x8 += x12, x4 = rotl32(x4 ^ x8, 7) 6578c2ecf20Sopenharmony_ci vpaddd %ymm12,%ymm8,%ymm8 6588c2ecf20Sopenharmony_ci vpxor %ymm8,%ymm4,%ymm4 6598c2ecf20Sopenharmony_ci vpslld $7,%ymm4,%ymm0 6608c2ecf20Sopenharmony_ci vpsrld $25,%ymm4,%ymm4 6618c2ecf20Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 6628c2ecf20Sopenharmony_ci # x9 += x13, x5 = rotl32(x5 ^ x9, 7) 6638c2ecf20Sopenharmony_ci vpaddd %ymm13,%ymm9,%ymm9 6648c2ecf20Sopenharmony_ci vpxor %ymm9,%ymm5,%ymm5 6658c2ecf20Sopenharmony_ci vpslld $7,%ymm5,%ymm0 6668c2ecf20Sopenharmony_ci vpsrld $25,%ymm5,%ymm5 6678c2ecf20Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 6688c2ecf20Sopenharmony_ci # x10 += x14, x6 = rotl32(x6 ^ x10, 7) 6698c2ecf20Sopenharmony_ci vpaddd %ymm14,%ymm10,%ymm10 6708c2ecf20Sopenharmony_ci vpxor %ymm10,%ymm6,%ymm6 6718c2ecf20Sopenharmony_ci vpslld $7,%ymm6,%ymm0 6728c2ecf20Sopenharmony_ci vpsrld $25,%ymm6,%ymm6 6738c2ecf20Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 6748c2ecf20Sopenharmony_ci # x11 += x15, x7 = rotl32(x7 ^ x11, 7) 6758c2ecf20Sopenharmony_ci vpaddd %ymm15,%ymm11,%ymm11 6768c2ecf20Sopenharmony_ci vpxor %ymm11,%ymm7,%ymm7 6778c2ecf20Sopenharmony_ci vpslld $7,%ymm7,%ymm0 6788c2ecf20Sopenharmony_ci vpsrld $25,%ymm7,%ymm7 6798c2ecf20Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 6808c2ecf20Sopenharmony_ci 6818c2ecf20Sopenharmony_ci # x0 += x5, x15 = rotl32(x15 ^ x0, 16) 
6828c2ecf20Sopenharmony_ci vpaddd 0x00(%rsp),%ymm5,%ymm0 6838c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 6848c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 6858c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm15,%ymm15 6868c2ecf20Sopenharmony_ci # x1 += x6, x12 = rotl32(x12 ^ x1, 16) 6878c2ecf20Sopenharmony_ci vpaddd 0x20(%rsp),%ymm6,%ymm0 6888c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 6898c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 6908c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm12,%ymm12 6918c2ecf20Sopenharmony_ci # x2 += x7, x13 = rotl32(x13 ^ x2, 16) 6928c2ecf20Sopenharmony_ci vpaddd 0x40(%rsp),%ymm7,%ymm0 6938c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 6948c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 6958c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm13,%ymm13 6968c2ecf20Sopenharmony_ci # x3 += x4, x14 = rotl32(x14 ^ x3, 16) 6978c2ecf20Sopenharmony_ci vpaddd 0x60(%rsp),%ymm4,%ymm0 6988c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 6998c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 7008c2ecf20Sopenharmony_ci vpshufb %ymm3,%ymm14,%ymm14 7018c2ecf20Sopenharmony_ci 7028c2ecf20Sopenharmony_ci # x10 += x15, x5 = rotl32(x5 ^ x10, 12) 7038c2ecf20Sopenharmony_ci vpaddd %ymm15,%ymm10,%ymm10 7048c2ecf20Sopenharmony_ci vpxor %ymm10,%ymm5,%ymm5 7058c2ecf20Sopenharmony_ci vpslld $12,%ymm5,%ymm0 7068c2ecf20Sopenharmony_ci vpsrld $20,%ymm5,%ymm5 7078c2ecf20Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 7088c2ecf20Sopenharmony_ci # x11 += x12, x6 = rotl32(x6 ^ x11, 12) 7098c2ecf20Sopenharmony_ci vpaddd %ymm12,%ymm11,%ymm11 7108c2ecf20Sopenharmony_ci vpxor %ymm11,%ymm6,%ymm6 7118c2ecf20Sopenharmony_ci vpslld $12,%ymm6,%ymm0 7128c2ecf20Sopenharmony_ci vpsrld $20,%ymm6,%ymm6 7138c2ecf20Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 7148c2ecf20Sopenharmony_ci # x8 += x13, x7 = rotl32(x7 ^ x8, 12) 7158c2ecf20Sopenharmony_ci vpaddd %ymm13,%ymm8,%ymm8 7168c2ecf20Sopenharmony_ci vpxor %ymm8,%ymm7,%ymm7 7178c2ecf20Sopenharmony_ci vpslld $12,%ymm7,%ymm0 7188c2ecf20Sopenharmony_ci vpsrld
$20,%ymm7,%ymm7 7198c2ecf20Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 7208c2ecf20Sopenharmony_ci # x9 += x14, x4 = rotl32(x4 ^ x9, 12) 7218c2ecf20Sopenharmony_ci vpaddd %ymm14,%ymm9,%ymm9 7228c2ecf20Sopenharmony_ci vpxor %ymm9,%ymm4,%ymm4 7238c2ecf20Sopenharmony_ci vpslld $12,%ymm4,%ymm0 7248c2ecf20Sopenharmony_ci vpsrld $20,%ymm4,%ymm4 7258c2ecf20Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 7268c2ecf20Sopenharmony_ci 7278c2ecf20Sopenharmony_ci # x0 += x5, x15 = rotl32(x15 ^ x0, 8) 7288c2ecf20Sopenharmony_ci vpaddd 0x00(%rsp),%ymm5,%ymm0 7298c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 7308c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 7318c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm15,%ymm15 7328c2ecf20Sopenharmony_ci # x1 += x6, x12 = rotl32(x12 ^ x1, 8) 7338c2ecf20Sopenharmony_ci vpaddd 0x20(%rsp),%ymm6,%ymm0 7348c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 7358c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 7368c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm12,%ymm12 7378c2ecf20Sopenharmony_ci # x2 += x7, x13 = rotl32(x13 ^ x2, 8) 7388c2ecf20Sopenharmony_ci vpaddd 0x40(%rsp),%ymm7,%ymm0 7398c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 7408c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 7418c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm13,%ymm13 7428c2ecf20Sopenharmony_ci # x3 += x4, x14 = rotl32(x14 ^ x3, 8) 7438c2ecf20Sopenharmony_ci vpaddd 0x60(%rsp),%ymm4,%ymm0 7448c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 7458c2ecf20Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 7468c2ecf20Sopenharmony_ci vpshufb %ymm2,%ymm14,%ymm14 7478c2ecf20Sopenharmony_ci 7488c2ecf20Sopenharmony_ci # x10 += x15, x5 = rotl32(x5 ^ x10, 7) 7498c2ecf20Sopenharmony_ci vpaddd %ymm15,%ymm10,%ymm10 7508c2ecf20Sopenharmony_ci vpxor %ymm10,%ymm5,%ymm5 7518c2ecf20Sopenharmony_ci vpslld $7,%ymm5,%ymm0 7528c2ecf20Sopenharmony_ci vpsrld $25,%ymm5,%ymm5 7538c2ecf20Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 7548c2ecf20Sopenharmony_ci # x11 += x12, x6 = rotl32(x6 ^ x11, 7) 7558c2ecf20Sopenharmony_ci vpaddd 
%ymm12,%ymm11,%ymm11 7568c2ecf20Sopenharmony_ci vpxor %ymm11,%ymm6,%ymm6 7578c2ecf20Sopenharmony_ci vpslld $7,%ymm6,%ymm0 7588c2ecf20Sopenharmony_ci vpsrld $25,%ymm6,%ymm6 7598c2ecf20Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 7608c2ecf20Sopenharmony_ci # x8 += x13, x7 = rotl32(x7 ^ x8, 7) 7618c2ecf20Sopenharmony_ci vpaddd %ymm13,%ymm8,%ymm8 7628c2ecf20Sopenharmony_ci vpxor %ymm8,%ymm7,%ymm7 7638c2ecf20Sopenharmony_ci vpslld $7,%ymm7,%ymm0 7648c2ecf20Sopenharmony_ci vpsrld $25,%ymm7,%ymm7 7658c2ecf20Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 7668c2ecf20Sopenharmony_ci # x9 += x14, x4 = rotl32(x4 ^ x9, 7) 7678c2ecf20Sopenharmony_ci vpaddd %ymm14,%ymm9,%ymm9 7688c2ecf20Sopenharmony_ci vpxor %ymm9,%ymm4,%ymm4 7698c2ecf20Sopenharmony_ci vpslld $7,%ymm4,%ymm0 7708c2ecf20Sopenharmony_ci vpsrld $25,%ymm4,%ymm4 7718c2ecf20Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 7728c2ecf20Sopenharmony_ci 7738c2ecf20Sopenharmony_ci sub $2,%r8d 7748c2ecf20Sopenharmony_ci jnz .Ldoubleround8 7758c2ecf20Sopenharmony_ci 7768c2ecf20Sopenharmony_ci # x0..15[0-7] += s[0..15] 7778c2ecf20Sopenharmony_ci vpbroadcastd 0x00(%rdi),%ymm0 7788c2ecf20Sopenharmony_ci vpaddd 0x00(%rsp),%ymm0,%ymm0 7798c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 7808c2ecf20Sopenharmony_ci vpbroadcastd 0x04(%rdi),%ymm0 7818c2ecf20Sopenharmony_ci vpaddd 0x20(%rsp),%ymm0,%ymm0 7828c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 7838c2ecf20Sopenharmony_ci vpbroadcastd 0x08(%rdi),%ymm0 7848c2ecf20Sopenharmony_ci vpaddd 0x40(%rsp),%ymm0,%ymm0 7858c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 7868c2ecf20Sopenharmony_ci vpbroadcastd 0x0c(%rdi),%ymm0 7878c2ecf20Sopenharmony_ci vpaddd 0x60(%rsp),%ymm0,%ymm0 7888c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 7898c2ecf20Sopenharmony_ci vpbroadcastd 0x10(%rdi),%ymm0 7908c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm4,%ymm4 7918c2ecf20Sopenharmony_ci vpbroadcastd 0x14(%rdi),%ymm0 7928c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm5,%ymm5 7938c2ecf20Sopenharmony_ci vpbroadcastd 0x18(%rdi),%ymm0
7948c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm6,%ymm6 7958c2ecf20Sopenharmony_ci vpbroadcastd 0x1c(%rdi),%ymm0 7968c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm7,%ymm7 7978c2ecf20Sopenharmony_ci vpbroadcastd 0x20(%rdi),%ymm0 7988c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm8,%ymm8 7998c2ecf20Sopenharmony_ci vpbroadcastd 0x24(%rdi),%ymm0 8008c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm9,%ymm9 8018c2ecf20Sopenharmony_ci vpbroadcastd 0x28(%rdi),%ymm0 8028c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm10,%ymm10 8038c2ecf20Sopenharmony_ci vpbroadcastd 0x2c(%rdi),%ymm0 8048c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm11,%ymm11 8058c2ecf20Sopenharmony_ci vpbroadcastd 0x30(%rdi),%ymm0 8068c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm12,%ymm12 8078c2ecf20Sopenharmony_ci vpbroadcastd 0x34(%rdi),%ymm0 8088c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm13,%ymm13 8098c2ecf20Sopenharmony_ci vpbroadcastd 0x38(%rdi),%ymm0 8108c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm14,%ymm14 8118c2ecf20Sopenharmony_ci vpbroadcastd 0x3c(%rdi),%ymm0 8128c2ecf20Sopenharmony_ci vpaddd %ymm0,%ymm15,%ymm15 8138c2ecf20Sopenharmony_ci 8148c2ecf20Sopenharmony_ci # x12 += counter values 0-3 8158c2ecf20Sopenharmony_ci vpaddd %ymm1,%ymm12,%ymm12 8168c2ecf20Sopenharmony_ci 8178c2ecf20Sopenharmony_ci # interleave 32-bit words in state n, n+1 8188c2ecf20Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm0 8198c2ecf20Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm1 8208c2ecf20Sopenharmony_ci vpunpckldq %ymm1,%ymm0,%ymm2 8218c2ecf20Sopenharmony_ci vpunpckhdq %ymm1,%ymm0,%ymm1 8228c2ecf20Sopenharmony_ci vmovdqa %ymm2,0x00(%rsp) 8238c2ecf20Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 8248c2ecf20Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm0 8258c2ecf20Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm1 8268c2ecf20Sopenharmony_ci vpunpckldq %ymm1,%ymm0,%ymm2 8278c2ecf20Sopenharmony_ci vpunpckhdq %ymm1,%ymm0,%ymm1 8288c2ecf20Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 8298c2ecf20Sopenharmony_ci vmovdqa %ymm1,0x60(%rsp) 8308c2ecf20Sopenharmony_ci vmovdqa %ymm4,%ymm0 8318c2ecf20Sopenharmony_ci 
vpunpckldq %ymm5,%ymm0,%ymm4 8328c2ecf20Sopenharmony_ci vpunpckhdq %ymm5,%ymm0,%ymm5 8338c2ecf20Sopenharmony_ci vmovdqa %ymm6,%ymm0 8348c2ecf20Sopenharmony_ci vpunpckldq %ymm7,%ymm0,%ymm6 8358c2ecf20Sopenharmony_ci vpunpckhdq %ymm7,%ymm0,%ymm7 8368c2ecf20Sopenharmony_ci vmovdqa %ymm8,%ymm0 8378c2ecf20Sopenharmony_ci vpunpckldq %ymm9,%ymm0,%ymm8 8388c2ecf20Sopenharmony_ci vpunpckhdq %ymm9,%ymm0,%ymm9 8398c2ecf20Sopenharmony_ci vmovdqa %ymm10,%ymm0 8408c2ecf20Sopenharmony_ci vpunpckldq %ymm11,%ymm0,%ymm10 8418c2ecf20Sopenharmony_ci vpunpckhdq %ymm11,%ymm0,%ymm11 8428c2ecf20Sopenharmony_ci vmovdqa %ymm12,%ymm0 8438c2ecf20Sopenharmony_ci vpunpckldq %ymm13,%ymm0,%ymm12 8448c2ecf20Sopenharmony_ci vpunpckhdq %ymm13,%ymm0,%ymm13 8458c2ecf20Sopenharmony_ci vmovdqa %ymm14,%ymm0 8468c2ecf20Sopenharmony_ci vpunpckldq %ymm15,%ymm0,%ymm14 8478c2ecf20Sopenharmony_ci vpunpckhdq %ymm15,%ymm0,%ymm15 8488c2ecf20Sopenharmony_ci 8498c2ecf20Sopenharmony_ci # interleave 64-bit words in state n, n+2 8508c2ecf20Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm0 8518c2ecf20Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm2 8528c2ecf20Sopenharmony_ci vpunpcklqdq %ymm2,%ymm0,%ymm1 8538c2ecf20Sopenharmony_ci vpunpckhqdq %ymm2,%ymm0,%ymm2 8548c2ecf20Sopenharmony_ci vmovdqa %ymm1,0x00(%rsp) 8558c2ecf20Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 8568c2ecf20Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm0 8578c2ecf20Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm2 8588c2ecf20Sopenharmony_ci vpunpcklqdq %ymm2,%ymm0,%ymm1 8598c2ecf20Sopenharmony_ci vpunpckhqdq %ymm2,%ymm0,%ymm2 8608c2ecf20Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 8618c2ecf20Sopenharmony_ci vmovdqa %ymm2,0x60(%rsp) 8628c2ecf20Sopenharmony_ci vmovdqa %ymm4,%ymm0 8638c2ecf20Sopenharmony_ci vpunpcklqdq %ymm6,%ymm0,%ymm4 8648c2ecf20Sopenharmony_ci vpunpckhqdq %ymm6,%ymm0,%ymm6 8658c2ecf20Sopenharmony_ci vmovdqa %ymm5,%ymm0 8668c2ecf20Sopenharmony_ci vpunpcklqdq %ymm7,%ymm0,%ymm5 8678c2ecf20Sopenharmony_ci vpunpckhqdq %ymm7,%ymm0,%ymm7 8688c2ecf20Sopenharmony_ci vmovdqa 
%ymm8,%ymm0 8698c2ecf20Sopenharmony_ci vpunpcklqdq %ymm10,%ymm0,%ymm8 8708c2ecf20Sopenharmony_ci vpunpckhqdq %ymm10,%ymm0,%ymm10 8718c2ecf20Sopenharmony_ci vmovdqa %ymm9,%ymm0 8728c2ecf20Sopenharmony_ci vpunpcklqdq %ymm11,%ymm0,%ymm9 8738c2ecf20Sopenharmony_ci vpunpckhqdq %ymm11,%ymm0,%ymm11 8748c2ecf20Sopenharmony_ci vmovdqa %ymm12,%ymm0 8758c2ecf20Sopenharmony_ci vpunpcklqdq %ymm14,%ymm0,%ymm12 8768c2ecf20Sopenharmony_ci vpunpckhqdq %ymm14,%ymm0,%ymm14 8778c2ecf20Sopenharmony_ci vmovdqa %ymm13,%ymm0 8788c2ecf20Sopenharmony_ci vpunpcklqdq %ymm15,%ymm0,%ymm13 8798c2ecf20Sopenharmony_ci vpunpckhqdq %ymm15,%ymm0,%ymm15 8808c2ecf20Sopenharmony_ci 8818c2ecf20Sopenharmony_ci # interleave 128-bit words in state n, n+4 8828c2ecf20Sopenharmony_ci # xor/write first four blocks 8838c2ecf20Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm1 8848c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 8858c2ecf20Sopenharmony_ci cmp $0x0020,%rax 8868c2ecf20Sopenharmony_ci jl .Lxorpart8 8878c2ecf20Sopenharmony_ci vpxor 0x0000(%rdx),%ymm0,%ymm0 8888c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0000(%rsi) 8898c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 8908c2ecf20Sopenharmony_ci 8918c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 8928c2ecf20Sopenharmony_ci cmp $0x0040,%rax 8938c2ecf20Sopenharmony_ci jl .Lxorpart8 8948c2ecf20Sopenharmony_ci vpxor 0x0020(%rdx),%ymm0,%ymm0 8958c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0020(%rsi) 8968c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 8978c2ecf20Sopenharmony_ci 8988c2ecf20Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm1 8998c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 9008c2ecf20Sopenharmony_ci cmp $0x0060,%rax 9018c2ecf20Sopenharmony_ci jl .Lxorpart8 9028c2ecf20Sopenharmony_ci vpxor 0x0040(%rdx),%ymm0,%ymm0 9038c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0040(%rsi) 9048c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 9058c2ecf20Sopenharmony_ci 9068c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 
9078c2ecf20Sopenharmony_ci cmp $0x0080,%rax 9088c2ecf20Sopenharmony_ci jl .Lxorpart8 9098c2ecf20Sopenharmony_ci vpxor 0x0060(%rdx),%ymm0,%ymm0 9108c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0060(%rsi) 9118c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 9128c2ecf20Sopenharmony_ci 9138c2ecf20Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm1 9148c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 9158c2ecf20Sopenharmony_ci cmp $0x00a0,%rax 9168c2ecf20Sopenharmony_ci jl .Lxorpart8 9178c2ecf20Sopenharmony_ci vpxor 0x0080(%rdx),%ymm0,%ymm0 9188c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0080(%rsi) 9198c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 9208c2ecf20Sopenharmony_ci 9218c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 9228c2ecf20Sopenharmony_ci cmp $0x00c0,%rax 9238c2ecf20Sopenharmony_ci jl .Lxorpart8 9248c2ecf20Sopenharmony_ci vpxor 0x00a0(%rdx),%ymm0,%ymm0 9258c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x00a0(%rsi) 9268c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 9278c2ecf20Sopenharmony_ci 9288c2ecf20Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm1 9298c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 9308c2ecf20Sopenharmony_ci cmp $0x00e0,%rax 9318c2ecf20Sopenharmony_ci jl .Lxorpart8 9328c2ecf20Sopenharmony_ci vpxor 0x00c0(%rdx),%ymm0,%ymm0 9338c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x00c0(%rsi) 9348c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 9358c2ecf20Sopenharmony_ci 9368c2ecf20Sopenharmony_ci vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 9378c2ecf20Sopenharmony_ci cmp $0x0100,%rax 9388c2ecf20Sopenharmony_ci jl .Lxorpart8 9398c2ecf20Sopenharmony_ci vpxor 0x00e0(%rdx),%ymm0,%ymm0 9408c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x00e0(%rsi) 9418c2ecf20Sopenharmony_ci vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 9428c2ecf20Sopenharmony_ci 9438c2ecf20Sopenharmony_ci # xor remaining blocks, write to output 9448c2ecf20Sopenharmony_ci vmovdqa %ymm4,%ymm0 9458c2ecf20Sopenharmony_ci cmp $0x0120,%rax 9468c2ecf20Sopenharmony_ci jl .Lxorpart8 
9478c2ecf20Sopenharmony_ci vpxor 0x0100(%rdx),%ymm0,%ymm0 9488c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0100(%rsi) 9498c2ecf20Sopenharmony_ci 9508c2ecf20Sopenharmony_ci vmovdqa %ymm12,%ymm0 9518c2ecf20Sopenharmony_ci cmp $0x0140,%rax 9528c2ecf20Sopenharmony_ci jl .Lxorpart8 9538c2ecf20Sopenharmony_ci vpxor 0x0120(%rdx),%ymm0,%ymm0 9548c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0120(%rsi) 9558c2ecf20Sopenharmony_ci 9568c2ecf20Sopenharmony_ci vmovdqa %ymm6,%ymm0 9578c2ecf20Sopenharmony_ci cmp $0x0160,%rax 9588c2ecf20Sopenharmony_ci jl .Lxorpart8 9598c2ecf20Sopenharmony_ci vpxor 0x0140(%rdx),%ymm0,%ymm0 9608c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0140(%rsi) 9618c2ecf20Sopenharmony_ci 9628c2ecf20Sopenharmony_ci vmovdqa %ymm14,%ymm0 9638c2ecf20Sopenharmony_ci cmp $0x0180,%rax 9648c2ecf20Sopenharmony_ci jl .Lxorpart8 9658c2ecf20Sopenharmony_ci vpxor 0x0160(%rdx),%ymm0,%ymm0 9668c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0160(%rsi) 9678c2ecf20Sopenharmony_ci 9688c2ecf20Sopenharmony_ci vmovdqa %ymm5,%ymm0 9698c2ecf20Sopenharmony_ci cmp $0x01a0,%rax 9708c2ecf20Sopenharmony_ci jl .Lxorpart8 9718c2ecf20Sopenharmony_ci vpxor 0x0180(%rdx),%ymm0,%ymm0 9728c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x0180(%rsi) 9738c2ecf20Sopenharmony_ci 9748c2ecf20Sopenharmony_ci vmovdqa %ymm13,%ymm0 9758c2ecf20Sopenharmony_ci cmp $0x01c0,%rax 9768c2ecf20Sopenharmony_ci jl .Lxorpart8 9778c2ecf20Sopenharmony_ci vpxor 0x01a0(%rdx),%ymm0,%ymm0 9788c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x01a0(%rsi) 9798c2ecf20Sopenharmony_ci 9808c2ecf20Sopenharmony_ci vmovdqa %ymm7,%ymm0 9818c2ecf20Sopenharmony_ci cmp $0x01e0,%rax 9828c2ecf20Sopenharmony_ci jl .Lxorpart8 9838c2ecf20Sopenharmony_ci vpxor 0x01c0(%rdx),%ymm0,%ymm0 9848c2ecf20Sopenharmony_ci vmovdqu %ymm0,0x01c0(%rsi) 9858c2ecf20Sopenharmony_ci 9868c2ecf20Sopenharmony_ci vmovdqa %ymm15,%ymm0 9878c2ecf20Sopenharmony_ci cmp $0x0200,%rax 9888c2ecf20Sopenharmony_ci jl .Lxorpart8 9898c2ecf20Sopenharmony_ci vpxor 0x01e0(%rdx),%ymm0,%ymm0 9908c2ecf20Sopenharmony_ci vmovdqu 
%ymm0,0x01e0(%rsi) 9918c2ecf20Sopenharmony_ci 9928c2ecf20Sopenharmony_ci.Ldone8: 9938c2ecf20Sopenharmony_ci vzeroupper 9948c2ecf20Sopenharmony_ci lea -8(%r10),%rsp 9958c2ecf20Sopenharmony_ci RET 9968c2ecf20Sopenharmony_ci 9978c2ecf20Sopenharmony_ci.Lxorpart8: 9988c2ecf20Sopenharmony_ci # xor remaining bytes from partial register into output 9998c2ecf20Sopenharmony_ci mov %rax,%r9 10008c2ecf20Sopenharmony_ci and $0x1f,%r9 10018c2ecf20Sopenharmony_ci jz .Ldone8 10028c2ecf20Sopenharmony_ci and $~0x1f,%rax 10038c2ecf20Sopenharmony_ci 10048c2ecf20Sopenharmony_ci mov %rsi,%r11 10058c2ecf20Sopenharmony_ci 10068c2ecf20Sopenharmony_ci lea (%rdx,%rax),%rsi 10078c2ecf20Sopenharmony_ci mov %rsp,%rdi 10088c2ecf20Sopenharmony_ci mov %r9,%rcx 10098c2ecf20Sopenharmony_ci rep movsb 10108c2ecf20Sopenharmony_ci 10118c2ecf20Sopenharmony_ci vpxor 0x00(%rsp),%ymm0,%ymm0 10128c2ecf20Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 10138c2ecf20Sopenharmony_ci 10148c2ecf20Sopenharmony_ci mov %rsp,%rsi 10158c2ecf20Sopenharmony_ci lea (%r11,%rax),%rdi 10168c2ecf20Sopenharmony_ci mov %r9,%rcx 10178c2ecf20Sopenharmony_ci rep movsb 10188c2ecf20Sopenharmony_ci 10198c2ecf20Sopenharmony_ci jmp .Ldone8 10208c2ecf20Sopenharmony_ci 10218c2ecf20Sopenharmony_ciSYM_FUNC_END(chacha_8block_xor_avx2) 1022