/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

# Syntax: GNU as, AT&T operand order (source(s) first, destination last).
# All constants below are 32 bytes wide and 32-byte aligned so that they can
# be loaded with vmovdqa or used directly as a ymm memory operand.

# vpshufb mask: rotate every 32-bit word left by 8 bits (same mask per lane).
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

# vpshufb mask: rotate every 32-bit word left by 16 bits (same mask per lane).
.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

# Per-dword increments 0..7, added to the broadcast x12 word by the
# eight-block routine.
.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

# Added to state row 3: +0 on the first dword of the low lane, +1 on the
# first dword of the high lane (blocks 0 and 1).
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

# Added to state row 3: +2 on the first dword of the low lane, +3 on the
# first dword of the high lane (blocks 2 and 3).
.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	#
	# Writes:  %rax, %r8d, flags, %ymm0-%ymm11; the partial-chunk tail
	#          additionally writes %rcx, %rsi, %rdi, %r9, %r10, %r11.
	# Stack:   none for whole 16-byte chunks; the tail carves a 32-byte
	#          aligned 16-byte bounce buffer below %rsp and restores %rsp
	#          before returning.
	# Note:    the round loop is "sub $2 / jnz", so nrounds has to be a
	#          positive even number for the loop to terminate as intended.
	# NOTE(review): the length checks use signed compares (jl); this is
	#          equivalent to unsigned only while the length is below 2^63.

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.

	vzeroupper

	# x0..3 = s0..3, each row replicated into both 128-bit lanes
	# (low lane = first block, high lane = second block)
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	# second block gets first dword of row 3 incremented by one
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	# keep the initial state in ymm8..11 for the final feed-forward add
	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	# %rcx is needed by rep movsb in the tail; keep the length in %rax
	mov		%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	# Output stage. Invariant at every "jl .Lxorpart2": %xmm7 holds the
	# 16 keystream bytes for the chunk that could not be written in full.
	# The high lane (second block) is parked in xmm0..3 for later.

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jl		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jl		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jl		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jl		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jl		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jl		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jl		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jl		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9		# r9 = len % 16 = bytes in partial chunk
	jz		.Ldone2			# length was a multiple of 16: nothing left
	and		$~0x0f,%rax		# rax = byte offset of the partial chunk

	mov		%rsi,%r11		# rep movsb needs %rsi; keep o in %r11

	# 16-byte bounce buffer, 32-byte aligned; %r10 remembers the old %rsp
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the r9 trailing input bytes onto the stack
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole 16-byte slot; only the first r9 bytes are copied out
	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp		# restore the caller's stack pointer
	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	#
	# Writes:  %rax, %r8d, flags, %ymm0-%ymm15; the partial-chunk tail
	#          additionally writes %rcx, %rsi, %rdi, %r9, %r10, %r11.
	# Stack:   none for whole 16-byte chunks; the tail carves a 32-byte
	#          aligned 16-byte bounce buffer below %rsp and restores %rsp
	#          before returning.
	# Note:    the round loop is "sub $2 / jnz", so nrounds has to be a
	#          positive even number for the loop to terminate as intended.
	# NOTE(review): the length checks use signed compares (jl); this is
	#          equivalent to unsigned only while the length is below 2^63.

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. The
	# required word shuffling has a rather high latency, we can do the
	# arithmetic on two matrix-pairs without much slowdown.

	vzeroupper

	# x0..3 = s0..3, each row replicated into both 128-bit lanes;
	# ymm0..3 carry blocks 0/1, ymm4..7 carry blocks 2/3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	# first dword of row 3: +0/+1 for blocks 0/1, +2/+3 for blocks 2/3
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	# keep the initial state for the final feed-forward add; rows 0..2
	# are shared by both pairs, row 3 differs (ymm14 vs. ymm15)
	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	# %rcx is needed by rep movsb in the tail; keep the length in %rax
	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$12,%ymm1,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vpslld		$7,%ymm1,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Output stage. Invariant at every "jl .Lxorpart4": %xmm10 holds the
	# 16 keystream bytes for the chunk that could not be written in full.
	# The rotate masks in ymm8/ymm9 are dead from here on, so xmm9 is
	# reused as scratch.

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jl		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jl		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jl		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jl		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jl		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jl		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jl		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jl		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jl		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jl		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9		# r9 = len % 16 = bytes in partial chunk
	jz		.Ldone4			# length was a multiple of 16: nothing left
	and		$~0x0f,%rax		# rax = byte offset of the partial chunk

	mov		%rsi,%r11		# rep movsb needs %rsi; keep o in %r11

	# 16-byte bounce buffer, 32-byte aligned; %r10 remembers the old %rsp
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the r9 trailing input bytes onto the stack
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole 16-byte slot; only the first r9 bytes are copied out
	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp		# restore the caller's stack pointer
	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data
blocks input, i 53862306a36Sopenharmony_ci # %rcx: input/output length in bytes 53962306a36Sopenharmony_ci # %r8d: nrounds 54062306a36Sopenharmony_ci 54162306a36Sopenharmony_ci # This function encrypts eight consecutive ChaCha blocks by loading 54262306a36Sopenharmony_ci # the state matrix in AVX registers eight times. As we need some 54362306a36Sopenharmony_ci # scratch registers, we save the first four registers on the stack. The 54462306a36Sopenharmony_ci # algorithm performs each operation on the corresponding word of each 54562306a36Sopenharmony_ci # state matrix, hence requires no word shuffling. For final XORing step 54662306a36Sopenharmony_ci # we transpose the matrix by interleaving 32-, 64- and then 128-bit 54762306a36Sopenharmony_ci # words, which allows us to do XOR in AVX registers. 8/16-bit word 54862306a36Sopenharmony_ci # rotation is done with the slightly better performing byte shuffling, 54962306a36Sopenharmony_ci # 7/12-bit word rotation uses traditional shift+OR. 55062306a36Sopenharmony_ci 55162306a36Sopenharmony_ci vzeroupper 55262306a36Sopenharmony_ci # 4 * 32 byte stack, 32-byte aligned 55362306a36Sopenharmony_ci lea 8(%rsp),%r10 55462306a36Sopenharmony_ci and $~31, %rsp 55562306a36Sopenharmony_ci sub $0x80, %rsp 55662306a36Sopenharmony_ci mov %rcx,%rax 55762306a36Sopenharmony_ci 55862306a36Sopenharmony_ci # x0..15[0-7] = s[0..15] 55962306a36Sopenharmony_ci vpbroadcastd 0x00(%rdi),%ymm0 56062306a36Sopenharmony_ci vpbroadcastd 0x04(%rdi),%ymm1 56162306a36Sopenharmony_ci vpbroadcastd 0x08(%rdi),%ymm2 56262306a36Sopenharmony_ci vpbroadcastd 0x0c(%rdi),%ymm3 56362306a36Sopenharmony_ci vpbroadcastd 0x10(%rdi),%ymm4 56462306a36Sopenharmony_ci vpbroadcastd 0x14(%rdi),%ymm5 56562306a36Sopenharmony_ci vpbroadcastd 0x18(%rdi),%ymm6 56662306a36Sopenharmony_ci vpbroadcastd 0x1c(%rdi),%ymm7 56762306a36Sopenharmony_ci vpbroadcastd 0x20(%rdi),%ymm8 56862306a36Sopenharmony_ci vpbroadcastd 0x24(%rdi),%ymm9 56962306a36Sopenharmony_ci vpbroadcastd 
0x28(%rdi),%ymm10 57062306a36Sopenharmony_ci vpbroadcastd 0x2c(%rdi),%ymm11 57162306a36Sopenharmony_ci vpbroadcastd 0x30(%rdi),%ymm12 57262306a36Sopenharmony_ci vpbroadcastd 0x34(%rdi),%ymm13 57362306a36Sopenharmony_ci vpbroadcastd 0x38(%rdi),%ymm14 57462306a36Sopenharmony_ci vpbroadcastd 0x3c(%rdi),%ymm15 57562306a36Sopenharmony_ci # x0..3 on stack 57662306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 57762306a36Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 57862306a36Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 57962306a36Sopenharmony_ci vmovdqa %ymm3,0x60(%rsp) 58062306a36Sopenharmony_ci 58162306a36Sopenharmony_ci vmovdqa CTRINC(%rip),%ymm1 58262306a36Sopenharmony_ci vmovdqa ROT8(%rip),%ymm2 58362306a36Sopenharmony_ci vmovdqa ROT16(%rip),%ymm3 58462306a36Sopenharmony_ci 58562306a36Sopenharmony_ci # x12 += counter values 0-3 58662306a36Sopenharmony_ci vpaddd %ymm1,%ymm12,%ymm12 58762306a36Sopenharmony_ci 58862306a36Sopenharmony_ci.Ldoubleround8: 58962306a36Sopenharmony_ci # x0 += x4, x12 = rotl32(x12 ^ x0, 16) 59062306a36Sopenharmony_ci vpaddd 0x00(%rsp),%ymm4,%ymm0 59162306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 59262306a36Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 59362306a36Sopenharmony_ci vpshufb %ymm3,%ymm12,%ymm12 59462306a36Sopenharmony_ci # x1 += x5, x13 = rotl32(x13 ^ x1, 16) 59562306a36Sopenharmony_ci vpaddd 0x20(%rsp),%ymm5,%ymm0 59662306a36Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 59762306a36Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 59862306a36Sopenharmony_ci vpshufb %ymm3,%ymm13,%ymm13 59962306a36Sopenharmony_ci # x2 += x6, x14 = rotl32(x14 ^ x2, 16) 60062306a36Sopenharmony_ci vpaddd 0x40(%rsp),%ymm6,%ymm0 60162306a36Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 60262306a36Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 60362306a36Sopenharmony_ci vpshufb %ymm3,%ymm14,%ymm14 60462306a36Sopenharmony_ci # x3 += x7, x15 = rotl32(x15 ^ x3, 16) 60562306a36Sopenharmony_ci vpaddd 0x60(%rsp),%ymm7,%ymm0 60662306a36Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 
60762306a36Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 60862306a36Sopenharmony_ci vpshufb %ymm3,%ymm15,%ymm15 60962306a36Sopenharmony_ci 61062306a36Sopenharmony_ci # x8 += x12, x4 = rotl32(x4 ^ x8, 12) 61162306a36Sopenharmony_ci vpaddd %ymm12,%ymm8,%ymm8 61262306a36Sopenharmony_ci vpxor %ymm8,%ymm4,%ymm4 61362306a36Sopenharmony_ci vpslld $12,%ymm4,%ymm0 61462306a36Sopenharmony_ci vpsrld $20,%ymm4,%ymm4 61562306a36Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 61662306a36Sopenharmony_ci # x9 += x13, x5 = rotl32(x5 ^ x9, 12) 61762306a36Sopenharmony_ci vpaddd %ymm13,%ymm9,%ymm9 61862306a36Sopenharmony_ci vpxor %ymm9,%ymm5,%ymm5 61962306a36Sopenharmony_ci vpslld $12,%ymm5,%ymm0 62062306a36Sopenharmony_ci vpsrld $20,%ymm5,%ymm5 62162306a36Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 62262306a36Sopenharmony_ci # x10 += x14, x6 = rotl32(x6 ^ x10, 12) 62362306a36Sopenharmony_ci vpaddd %ymm14,%ymm10,%ymm10 62462306a36Sopenharmony_ci vpxor %ymm10,%ymm6,%ymm6 62562306a36Sopenharmony_ci vpslld $12,%ymm6,%ymm0 62662306a36Sopenharmony_ci vpsrld $20,%ymm6,%ymm6 62762306a36Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 62862306a36Sopenharmony_ci # x11 += x15, x7 = rotl32(x7 ^ x11, 12) 62962306a36Sopenharmony_ci vpaddd %ymm15,%ymm11,%ymm11 63062306a36Sopenharmony_ci vpxor %ymm11,%ymm7,%ymm7 63162306a36Sopenharmony_ci vpslld $12,%ymm7,%ymm0 63262306a36Sopenharmony_ci vpsrld $20,%ymm7,%ymm7 63362306a36Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 63462306a36Sopenharmony_ci 63562306a36Sopenharmony_ci # x0 += x4, x12 = rotl32(x12 ^ x0, 8) 63662306a36Sopenharmony_ci vpaddd 0x00(%rsp),%ymm4,%ymm0 63762306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 63862306a36Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 63962306a36Sopenharmony_ci vpshufb %ymm2,%ymm12,%ymm12 64062306a36Sopenharmony_ci # x1 += x5, x13 = rotl32(x13 ^ x1, 8) 64162306a36Sopenharmony_ci vpaddd 0x20(%rsp),%ymm5,%ymm0 64262306a36Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 64362306a36Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 64462306a36Sopenharmony_ci vpshufb 
%ymm2,%ymm13,%ymm13 64562306a36Sopenharmony_ci # x2 += x6, x14 = rotl32(x14 ^ x2, 8) 64662306a36Sopenharmony_ci vpaddd 0x40(%rsp),%ymm6,%ymm0 64762306a36Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 64862306a36Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 64962306a36Sopenharmony_ci vpshufb %ymm2,%ymm14,%ymm14 65062306a36Sopenharmony_ci # x3 += x7, x15 = rotl32(x15 ^ x3, 8) 65162306a36Sopenharmony_ci vpaddd 0x60(%rsp),%ymm7,%ymm0 65262306a36Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 65362306a36Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 65462306a36Sopenharmony_ci vpshufb %ymm2,%ymm15,%ymm15 65562306a36Sopenharmony_ci 65662306a36Sopenharmony_ci # x8 += x12, x4 = rotl32(x4 ^ x8, 7) 65762306a36Sopenharmony_ci vpaddd %ymm12,%ymm8,%ymm8 65862306a36Sopenharmony_ci vpxor %ymm8,%ymm4,%ymm4 65962306a36Sopenharmony_ci vpslld $7,%ymm4,%ymm0 66062306a36Sopenharmony_ci vpsrld $25,%ymm4,%ymm4 66162306a36Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 66262306a36Sopenharmony_ci # x9 += x13, x5 = rotl32(x5 ^ x9, 7) 66362306a36Sopenharmony_ci vpaddd %ymm13,%ymm9,%ymm9 66462306a36Sopenharmony_ci vpxor %ymm9,%ymm5,%ymm5 66562306a36Sopenharmony_ci vpslld $7,%ymm5,%ymm0 66662306a36Sopenharmony_ci vpsrld $25,%ymm5,%ymm5 66762306a36Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 66862306a36Sopenharmony_ci # x10 += x14, x6 = rotl32(x6 ^ x10, 7) 66962306a36Sopenharmony_ci vpaddd %ymm14,%ymm10,%ymm10 67062306a36Sopenharmony_ci vpxor %ymm10,%ymm6,%ymm6 67162306a36Sopenharmony_ci vpslld $7,%ymm6,%ymm0 67262306a36Sopenharmony_ci vpsrld $25,%ymm6,%ymm6 67362306a36Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 67462306a36Sopenharmony_ci # x11 += x15, x7 = rotl32(x7 ^ x11, 7) 67562306a36Sopenharmony_ci vpaddd %ymm15,%ymm11,%ymm11 67662306a36Sopenharmony_ci vpxor %ymm11,%ymm7,%ymm7 67762306a36Sopenharmony_ci vpslld $7,%ymm7,%ymm0 67862306a36Sopenharmony_ci vpsrld $25,%ymm7,%ymm7 67962306a36Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 68062306a36Sopenharmony_ci 68162306a36Sopenharmony_ci # x0 += x5, x15 = rotl32(x15 ^ x0, 16) 
68262306a36Sopenharmony_ci vpaddd 0x00(%rsp),%ymm5,%ymm0 68362306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 68462306a36Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 68562306a36Sopenharmony_ci vpshufb %ymm3,%ymm15,%ymm15 68662306a36Sopenharmony_ci # x1 += x6, x12 = rotl32(x12 ^ x1, 16) 68762306a36Sopenharmony_ci vpaddd 0x20(%rsp),%ymm6,%ymm0 68862306a36Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 68962306a36Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 69062306a36Sopenharmony_ci vpshufb %ymm3,%ymm12,%ymm12 69162306a36Sopenharmony_ci # x2 += x7, x13 = rotl32(x13 ^ x2, 16) 69262306a36Sopenharmony_ci vpaddd 0x40(%rsp),%ymm7,%ymm0 69362306a36Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 69462306a36Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 69562306a36Sopenharmony_ci vpshufb %ymm3,%ymm13,%ymm13 69662306a36Sopenharmony_ci # x3 += x4, x14 = rotl32(x14 ^ x3, 16) 69762306a36Sopenharmony_ci vpaddd 0x60(%rsp),%ymm4,%ymm0 69862306a36Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 69962306a36Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 70062306a36Sopenharmony_ci vpshufb %ymm3,%ymm14,%ymm14 70162306a36Sopenharmony_ci 70262306a36Sopenharmony_ci # x10 += x15, x5 = rotl32(x5 ^ x10, 12) 70362306a36Sopenharmony_ci vpaddd %ymm15,%ymm10,%ymm10 70462306a36Sopenharmony_ci vpxor %ymm10,%ymm5,%ymm5 70562306a36Sopenharmony_ci vpslld $12,%ymm5,%ymm0 70662306a36Sopenharmony_ci vpsrld $20,%ymm5,%ymm5 70762306a36Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 70862306a36Sopenharmony_ci # x11 += x12, x6 = rotl32(x6 ^ x11, 12) 70962306a36Sopenharmony_ci vpaddd %ymm12,%ymm11,%ymm11 71062306a36Sopenharmony_ci vpxor %ymm11,%ymm6,%ymm6 71162306a36Sopenharmony_ci vpslld $12,%ymm6,%ymm0 71262306a36Sopenharmony_ci vpsrld $20,%ymm6,%ymm6 71362306a36Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 71462306a36Sopenharmony_ci # x8 += x13, x7 = rotl32(x7 ^ x8, 12) 71562306a36Sopenharmony_ci vpaddd %ymm13,%ymm8,%ymm8 71662306a36Sopenharmony_ci vpxor %ymm8,%ymm7,%ymm7 71762306a36Sopenharmony_ci vpslld $12,%ymm7,%ymm0 71862306a36Sopenharmony_ci vpsrld
$20,%ymm7,%ymm7 71962306a36Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 72062306a36Sopenharmony_ci # x9 += x14, x4 = rotl32(x4 ^ x9, 12) 72162306a36Sopenharmony_ci vpaddd %ymm14,%ymm9,%ymm9 72262306a36Sopenharmony_ci vpxor %ymm9,%ymm4,%ymm4 72362306a36Sopenharmony_ci vpslld $12,%ymm4,%ymm0 72462306a36Sopenharmony_ci vpsrld $20,%ymm4,%ymm4 72562306a36Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 72662306a36Sopenharmony_ci 72762306a36Sopenharmony_ci # x0 += x5, x15 = rotl32(x15 ^ x0, 8) 72862306a36Sopenharmony_ci vpaddd 0x00(%rsp),%ymm5,%ymm0 72962306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 73062306a36Sopenharmony_ci vpxor %ymm0,%ymm15,%ymm15 73162306a36Sopenharmony_ci vpshufb %ymm2,%ymm15,%ymm15 73262306a36Sopenharmony_ci # x1 += x6, x12 = rotl32(x12 ^ x1, 8) 73362306a36Sopenharmony_ci vpaddd 0x20(%rsp),%ymm6,%ymm0 73462306a36Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 73562306a36Sopenharmony_ci vpxor %ymm0,%ymm12,%ymm12 73662306a36Sopenharmony_ci vpshufb %ymm2,%ymm12,%ymm12 73762306a36Sopenharmony_ci # x2 += x7, x13 = rotl32(x13 ^ x2, 8) 73862306a36Sopenharmony_ci vpaddd 0x40(%rsp),%ymm7,%ymm0 73962306a36Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 74062306a36Sopenharmony_ci vpxor %ymm0,%ymm13,%ymm13 74162306a36Sopenharmony_ci vpshufb %ymm2,%ymm13,%ymm13 74262306a36Sopenharmony_ci # x3 += x4, x14 = rotl32(x14 ^ x3, 8) 74362306a36Sopenharmony_ci vpaddd 0x60(%rsp),%ymm4,%ymm0 74462306a36Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 74562306a36Sopenharmony_ci vpxor %ymm0,%ymm14,%ymm14 74662306a36Sopenharmony_ci vpshufb %ymm2,%ymm14,%ymm14 74762306a36Sopenharmony_ci 74862306a36Sopenharmony_ci # x10 += x15, x5 = rotl32(x5 ^ x10, 7) 74962306a36Sopenharmony_ci vpaddd %ymm15,%ymm10,%ymm10 75062306a36Sopenharmony_ci vpxor %ymm10,%ymm5,%ymm5 75162306a36Sopenharmony_ci vpslld $7,%ymm5,%ymm0 75262306a36Sopenharmony_ci vpsrld $25,%ymm5,%ymm5 75362306a36Sopenharmony_ci vpor %ymm0,%ymm5,%ymm5 75462306a36Sopenharmony_ci # x11 += x12, x6 = rotl32(x6 ^ x11, 7) 75562306a36Sopenharmony_ci vpaddd 
%ymm12,%ymm11,%ymm11 75662306a36Sopenharmony_ci vpxor %ymm11,%ymm6,%ymm6 75762306a36Sopenharmony_ci vpslld $7,%ymm6,%ymm0 75862306a36Sopenharmony_ci vpsrld $25,%ymm6,%ymm6 75962306a36Sopenharmony_ci vpor %ymm0,%ymm6,%ymm6 76062306a36Sopenharmony_ci # x8 += x13, x7 = rotl32(x7 ^ x8, 7) 76162306a36Sopenharmony_ci vpaddd %ymm13,%ymm8,%ymm8 76262306a36Sopenharmony_ci vpxor %ymm8,%ymm7,%ymm7 76362306a36Sopenharmony_ci vpslld $7,%ymm7,%ymm0 76462306a36Sopenharmony_ci vpsrld $25,%ymm7,%ymm7 76562306a36Sopenharmony_ci vpor %ymm0,%ymm7,%ymm7 76662306a36Sopenharmony_ci # x9 += x14, x4 = rotl32(x4 ^ x9, 7) 76762306a36Sopenharmony_ci vpaddd %ymm14,%ymm9,%ymm9 76862306a36Sopenharmony_ci vpxor %ymm9,%ymm4,%ymm4 76962306a36Sopenharmony_ci vpslld $7,%ymm4,%ymm0 77062306a36Sopenharmony_ci vpsrld $25,%ymm4,%ymm4 77162306a36Sopenharmony_ci vpor %ymm0,%ymm4,%ymm4 77262306a36Sopenharmony_ci 77362306a36Sopenharmony_ci sub $2,%r8d 77462306a36Sopenharmony_ci jnz .Ldoubleround8 77562306a36Sopenharmony_ci 77662306a36Sopenharmony_ci # x0..15[0-3] += s[0..15] 77762306a36Sopenharmony_ci vpbroadcastd 0x00(%rdi),%ymm0 77862306a36Sopenharmony_ci vpaddd 0x00(%rsp),%ymm0,%ymm0 77962306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 78062306a36Sopenharmony_ci vpbroadcastd 0x04(%rdi),%ymm0 78162306a36Sopenharmony_ci vpaddd 0x20(%rsp),%ymm0,%ymm0 78262306a36Sopenharmony_ci vmovdqa %ymm0,0x20(%rsp) 78362306a36Sopenharmony_ci vpbroadcastd 0x08(%rdi),%ymm0 78462306a36Sopenharmony_ci vpaddd 0x40(%rsp),%ymm0,%ymm0 78562306a36Sopenharmony_ci vmovdqa %ymm0,0x40(%rsp) 78662306a36Sopenharmony_ci vpbroadcastd 0x0c(%rdi),%ymm0 78762306a36Sopenharmony_ci vpaddd 0x60(%rsp),%ymm0,%ymm0 78862306a36Sopenharmony_ci vmovdqa %ymm0,0x60(%rsp) 78962306a36Sopenharmony_ci vpbroadcastd 0x10(%rdi),%ymm0 79062306a36Sopenharmony_ci vpaddd %ymm0,%ymm4,%ymm4 79162306a36Sopenharmony_ci vpbroadcastd 0x14(%rdi),%ymm0 79262306a36Sopenharmony_ci vpaddd %ymm0,%ymm5,%ymm5 79362306a36Sopenharmony_ci vpbroadcastd 0x18(%rdi),%ymm0 
79462306a36Sopenharmony_ci vpaddd %ymm0,%ymm6,%ymm6 79562306a36Sopenharmony_ci vpbroadcastd 0x1c(%rdi),%ymm0 79662306a36Sopenharmony_ci vpaddd %ymm0,%ymm7,%ymm7 79762306a36Sopenharmony_ci vpbroadcastd 0x20(%rdi),%ymm0 79862306a36Sopenharmony_ci vpaddd %ymm0,%ymm8,%ymm8 79962306a36Sopenharmony_ci vpbroadcastd 0x24(%rdi),%ymm0 80062306a36Sopenharmony_ci vpaddd %ymm0,%ymm9,%ymm9 80162306a36Sopenharmony_ci vpbroadcastd 0x28(%rdi),%ymm0 80262306a36Sopenharmony_ci vpaddd %ymm0,%ymm10,%ymm10 80362306a36Sopenharmony_ci vpbroadcastd 0x2c(%rdi),%ymm0 80462306a36Sopenharmony_ci vpaddd %ymm0,%ymm11,%ymm11 80562306a36Sopenharmony_ci vpbroadcastd 0x30(%rdi),%ymm0 80662306a36Sopenharmony_ci vpaddd %ymm0,%ymm12,%ymm12 80762306a36Sopenharmony_ci vpbroadcastd 0x34(%rdi),%ymm0 80862306a36Sopenharmony_ci vpaddd %ymm0,%ymm13,%ymm13 80962306a36Sopenharmony_ci vpbroadcastd 0x38(%rdi),%ymm0 81062306a36Sopenharmony_ci vpaddd %ymm0,%ymm14,%ymm14 81162306a36Sopenharmony_ci vpbroadcastd 0x3c(%rdi),%ymm0 81262306a36Sopenharmony_ci vpaddd %ymm0,%ymm15,%ymm15 81362306a36Sopenharmony_ci 81462306a36Sopenharmony_ci # x12 += counter values 0-3 81562306a36Sopenharmony_ci vpaddd %ymm1,%ymm12,%ymm12 81662306a36Sopenharmony_ci 81762306a36Sopenharmony_ci # interleave 32-bit words in state n, n+1 81862306a36Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm0 81962306a36Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm1 82062306a36Sopenharmony_ci vpunpckldq %ymm1,%ymm0,%ymm2 82162306a36Sopenharmony_ci vpunpckhdq %ymm1,%ymm0,%ymm1 82262306a36Sopenharmony_ci vmovdqa %ymm2,0x00(%rsp) 82362306a36Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 82462306a36Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm0 82562306a36Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm1 82662306a36Sopenharmony_ci vpunpckldq %ymm1,%ymm0,%ymm2 82762306a36Sopenharmony_ci vpunpckhdq %ymm1,%ymm0,%ymm1 82862306a36Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 82962306a36Sopenharmony_ci vmovdqa %ymm1,0x60(%rsp) 83062306a36Sopenharmony_ci vmovdqa %ymm4,%ymm0 83162306a36Sopenharmony_ci 
vpunpckldq %ymm5,%ymm0,%ymm4 83262306a36Sopenharmony_ci vpunpckhdq %ymm5,%ymm0,%ymm5 83362306a36Sopenharmony_ci vmovdqa %ymm6,%ymm0 83462306a36Sopenharmony_ci vpunpckldq %ymm7,%ymm0,%ymm6 83562306a36Sopenharmony_ci vpunpckhdq %ymm7,%ymm0,%ymm7 83662306a36Sopenharmony_ci vmovdqa %ymm8,%ymm0 83762306a36Sopenharmony_ci vpunpckldq %ymm9,%ymm0,%ymm8 83862306a36Sopenharmony_ci vpunpckhdq %ymm9,%ymm0,%ymm9 83962306a36Sopenharmony_ci vmovdqa %ymm10,%ymm0 84062306a36Sopenharmony_ci vpunpckldq %ymm11,%ymm0,%ymm10 84162306a36Sopenharmony_ci vpunpckhdq %ymm11,%ymm0,%ymm11 84262306a36Sopenharmony_ci vmovdqa %ymm12,%ymm0 84362306a36Sopenharmony_ci vpunpckldq %ymm13,%ymm0,%ymm12 84462306a36Sopenharmony_ci vpunpckhdq %ymm13,%ymm0,%ymm13 84562306a36Sopenharmony_ci vmovdqa %ymm14,%ymm0 84662306a36Sopenharmony_ci vpunpckldq %ymm15,%ymm0,%ymm14 84762306a36Sopenharmony_ci vpunpckhdq %ymm15,%ymm0,%ymm15 84862306a36Sopenharmony_ci 84962306a36Sopenharmony_ci # interleave 64-bit words in state n, n+2 85062306a36Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm0 85162306a36Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm2 85262306a36Sopenharmony_ci vpunpcklqdq %ymm2,%ymm0,%ymm1 85362306a36Sopenharmony_ci vpunpckhqdq %ymm2,%ymm0,%ymm2 85462306a36Sopenharmony_ci vmovdqa %ymm1,0x00(%rsp) 85562306a36Sopenharmony_ci vmovdqa %ymm2,0x40(%rsp) 85662306a36Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm0 85762306a36Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm2 85862306a36Sopenharmony_ci vpunpcklqdq %ymm2,%ymm0,%ymm1 85962306a36Sopenharmony_ci vpunpckhqdq %ymm2,%ymm0,%ymm2 86062306a36Sopenharmony_ci vmovdqa %ymm1,0x20(%rsp) 86162306a36Sopenharmony_ci vmovdqa %ymm2,0x60(%rsp) 86262306a36Sopenharmony_ci vmovdqa %ymm4,%ymm0 86362306a36Sopenharmony_ci vpunpcklqdq %ymm6,%ymm0,%ymm4 86462306a36Sopenharmony_ci vpunpckhqdq %ymm6,%ymm0,%ymm6 86562306a36Sopenharmony_ci vmovdqa %ymm5,%ymm0 86662306a36Sopenharmony_ci vpunpcklqdq %ymm7,%ymm0,%ymm5 86762306a36Sopenharmony_ci vpunpckhqdq %ymm7,%ymm0,%ymm7 86862306a36Sopenharmony_ci vmovdqa 
%ymm8,%ymm0 86962306a36Sopenharmony_ci vpunpcklqdq %ymm10,%ymm0,%ymm8 87062306a36Sopenharmony_ci vpunpckhqdq %ymm10,%ymm0,%ymm10 87162306a36Sopenharmony_ci vmovdqa %ymm9,%ymm0 87262306a36Sopenharmony_ci vpunpcklqdq %ymm11,%ymm0,%ymm9 87362306a36Sopenharmony_ci vpunpckhqdq %ymm11,%ymm0,%ymm11 87462306a36Sopenharmony_ci vmovdqa %ymm12,%ymm0 87562306a36Sopenharmony_ci vpunpcklqdq %ymm14,%ymm0,%ymm12 87662306a36Sopenharmony_ci vpunpckhqdq %ymm14,%ymm0,%ymm14 87762306a36Sopenharmony_ci vmovdqa %ymm13,%ymm0 87862306a36Sopenharmony_ci vpunpcklqdq %ymm15,%ymm0,%ymm13 87962306a36Sopenharmony_ci vpunpckhqdq %ymm15,%ymm0,%ymm15 88062306a36Sopenharmony_ci 88162306a36Sopenharmony_ci # interleave 128-bit words in state n, n+4 88262306a36Sopenharmony_ci # xor/write first four blocks 88362306a36Sopenharmony_ci vmovdqa 0x00(%rsp),%ymm1 88462306a36Sopenharmony_ci vperm2i128 $0x20,%ymm4,%ymm1,%ymm0 88562306a36Sopenharmony_ci cmp $0x0020,%rax 88662306a36Sopenharmony_ci jl .Lxorpart8 88762306a36Sopenharmony_ci vpxor 0x0000(%rdx),%ymm0,%ymm0 88862306a36Sopenharmony_ci vmovdqu %ymm0,0x0000(%rsi) 88962306a36Sopenharmony_ci vperm2i128 $0x31,%ymm4,%ymm1,%ymm4 89062306a36Sopenharmony_ci 89162306a36Sopenharmony_ci vperm2i128 $0x20,%ymm12,%ymm8,%ymm0 89262306a36Sopenharmony_ci cmp $0x0040,%rax 89362306a36Sopenharmony_ci jl .Lxorpart8 89462306a36Sopenharmony_ci vpxor 0x0020(%rdx),%ymm0,%ymm0 89562306a36Sopenharmony_ci vmovdqu %ymm0,0x0020(%rsi) 89662306a36Sopenharmony_ci vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 89762306a36Sopenharmony_ci 89862306a36Sopenharmony_ci vmovdqa 0x40(%rsp),%ymm1 89962306a36Sopenharmony_ci vperm2i128 $0x20,%ymm6,%ymm1,%ymm0 90062306a36Sopenharmony_ci cmp $0x0060,%rax 90162306a36Sopenharmony_ci jl .Lxorpart8 90262306a36Sopenharmony_ci vpxor 0x0040(%rdx),%ymm0,%ymm0 90362306a36Sopenharmony_ci vmovdqu %ymm0,0x0040(%rsi) 90462306a36Sopenharmony_ci vperm2i128 $0x31,%ymm6,%ymm1,%ymm6 90562306a36Sopenharmony_ci 90662306a36Sopenharmony_ci vperm2i128 $0x20,%ymm14,%ymm10,%ymm0 
90762306a36Sopenharmony_ci cmp $0x0080,%rax 90862306a36Sopenharmony_ci jl .Lxorpart8 90962306a36Sopenharmony_ci vpxor 0x0060(%rdx),%ymm0,%ymm0 91062306a36Sopenharmony_ci vmovdqu %ymm0,0x0060(%rsi) 91162306a36Sopenharmony_ci vperm2i128 $0x31,%ymm14,%ymm10,%ymm14 91262306a36Sopenharmony_ci 91362306a36Sopenharmony_ci vmovdqa 0x20(%rsp),%ymm1 91462306a36Sopenharmony_ci vperm2i128 $0x20,%ymm5,%ymm1,%ymm0 91562306a36Sopenharmony_ci cmp $0x00a0,%rax 91662306a36Sopenharmony_ci jl .Lxorpart8 91762306a36Sopenharmony_ci vpxor 0x0080(%rdx),%ymm0,%ymm0 91862306a36Sopenharmony_ci vmovdqu %ymm0,0x0080(%rsi) 91962306a36Sopenharmony_ci vperm2i128 $0x31,%ymm5,%ymm1,%ymm5 92062306a36Sopenharmony_ci 92162306a36Sopenharmony_ci vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 92262306a36Sopenharmony_ci cmp $0x00c0,%rax 92362306a36Sopenharmony_ci jl .Lxorpart8 92462306a36Sopenharmony_ci vpxor 0x00a0(%rdx),%ymm0,%ymm0 92562306a36Sopenharmony_ci vmovdqu %ymm0,0x00a0(%rsi) 92662306a36Sopenharmony_ci vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 92762306a36Sopenharmony_ci 92862306a36Sopenharmony_ci vmovdqa 0x60(%rsp),%ymm1 92962306a36Sopenharmony_ci vperm2i128 $0x20,%ymm7,%ymm1,%ymm0 93062306a36Sopenharmony_ci cmp $0x00e0,%rax 93162306a36Sopenharmony_ci jl .Lxorpart8 93262306a36Sopenharmony_ci vpxor 0x00c0(%rdx),%ymm0,%ymm0 93362306a36Sopenharmony_ci vmovdqu %ymm0,0x00c0(%rsi) 93462306a36Sopenharmony_ci vperm2i128 $0x31,%ymm7,%ymm1,%ymm7 93562306a36Sopenharmony_ci 93662306a36Sopenharmony_ci vperm2i128 $0x20,%ymm15,%ymm11,%ymm0 93762306a36Sopenharmony_ci cmp $0x0100,%rax 93862306a36Sopenharmony_ci jl .Lxorpart8 93962306a36Sopenharmony_ci vpxor 0x00e0(%rdx),%ymm0,%ymm0 94062306a36Sopenharmony_ci vmovdqu %ymm0,0x00e0(%rsi) 94162306a36Sopenharmony_ci vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 94262306a36Sopenharmony_ci 94362306a36Sopenharmony_ci # xor remaining blocks, write to output 94462306a36Sopenharmony_ci vmovdqa %ymm4,%ymm0 94562306a36Sopenharmony_ci cmp $0x0120,%rax 94662306a36Sopenharmony_ci jl .Lxorpart8 
94762306a36Sopenharmony_ci vpxor 0x0100(%rdx),%ymm0,%ymm0 94862306a36Sopenharmony_ci vmovdqu %ymm0,0x0100(%rsi) 94962306a36Sopenharmony_ci 95062306a36Sopenharmony_ci vmovdqa %ymm12,%ymm0 95162306a36Sopenharmony_ci cmp $0x0140,%rax 95262306a36Sopenharmony_ci jl .Lxorpart8 95362306a36Sopenharmony_ci vpxor 0x0120(%rdx),%ymm0,%ymm0 95462306a36Sopenharmony_ci vmovdqu %ymm0,0x0120(%rsi) 95562306a36Sopenharmony_ci 95662306a36Sopenharmony_ci vmovdqa %ymm6,%ymm0 95762306a36Sopenharmony_ci cmp $0x0160,%rax 95862306a36Sopenharmony_ci jl .Lxorpart8 95962306a36Sopenharmony_ci vpxor 0x0140(%rdx),%ymm0,%ymm0 96062306a36Sopenharmony_ci vmovdqu %ymm0,0x0140(%rsi) 96162306a36Sopenharmony_ci 96262306a36Sopenharmony_ci vmovdqa %ymm14,%ymm0 96362306a36Sopenharmony_ci cmp $0x0180,%rax 96462306a36Sopenharmony_ci jl .Lxorpart8 96562306a36Sopenharmony_ci vpxor 0x0160(%rdx),%ymm0,%ymm0 96662306a36Sopenharmony_ci vmovdqu %ymm0,0x0160(%rsi) 96762306a36Sopenharmony_ci 96862306a36Sopenharmony_ci vmovdqa %ymm5,%ymm0 96962306a36Sopenharmony_ci cmp $0x01a0,%rax 97062306a36Sopenharmony_ci jl .Lxorpart8 97162306a36Sopenharmony_ci vpxor 0x0180(%rdx),%ymm0,%ymm0 97262306a36Sopenharmony_ci vmovdqu %ymm0,0x0180(%rsi) 97362306a36Sopenharmony_ci 97462306a36Sopenharmony_ci vmovdqa %ymm13,%ymm0 97562306a36Sopenharmony_ci cmp $0x01c0,%rax 97662306a36Sopenharmony_ci jl .Lxorpart8 97762306a36Sopenharmony_ci vpxor 0x01a0(%rdx),%ymm0,%ymm0 97862306a36Sopenharmony_ci vmovdqu %ymm0,0x01a0(%rsi) 97962306a36Sopenharmony_ci 98062306a36Sopenharmony_ci vmovdqa %ymm7,%ymm0 98162306a36Sopenharmony_ci cmp $0x01e0,%rax 98262306a36Sopenharmony_ci jl .Lxorpart8 98362306a36Sopenharmony_ci vpxor 0x01c0(%rdx),%ymm0,%ymm0 98462306a36Sopenharmony_ci vmovdqu %ymm0,0x01c0(%rsi) 98562306a36Sopenharmony_ci 98662306a36Sopenharmony_ci vmovdqa %ymm15,%ymm0 98762306a36Sopenharmony_ci cmp $0x0200,%rax 98862306a36Sopenharmony_ci jl .Lxorpart8 98962306a36Sopenharmony_ci vpxor 0x01e0(%rdx),%ymm0,%ymm0 99062306a36Sopenharmony_ci vmovdqu 
%ymm0,0x01e0(%rsi) 99162306a36Sopenharmony_ci 99262306a36Sopenharmony_ci.Ldone8: 99362306a36Sopenharmony_ci vzeroupper 99462306a36Sopenharmony_ci lea -8(%r10),%rsp 99562306a36Sopenharmony_ci RET 99662306a36Sopenharmony_ci 99762306a36Sopenharmony_ci.Lxorpart8: 99862306a36Sopenharmony_ci # xor remaining bytes from partial register into output 99962306a36Sopenharmony_ci mov %rax,%r9 100062306a36Sopenharmony_ci and $0x1f,%r9 100162306a36Sopenharmony_ci jz .Ldone8 100262306a36Sopenharmony_ci and $~0x1f,%rax 100362306a36Sopenharmony_ci 100462306a36Sopenharmony_ci mov %rsi,%r11 100562306a36Sopenharmony_ci 100662306a36Sopenharmony_ci lea (%rdx,%rax),%rsi 100762306a36Sopenharmony_ci mov %rsp,%rdi 100862306a36Sopenharmony_ci mov %r9,%rcx 100962306a36Sopenharmony_ci rep movsb 101062306a36Sopenharmony_ci 101162306a36Sopenharmony_ci vpxor 0x00(%rsp),%ymm0,%ymm0 101262306a36Sopenharmony_ci vmovdqa %ymm0,0x00(%rsp) 101362306a36Sopenharmony_ci 101462306a36Sopenharmony_ci mov %rsp,%rsi 101562306a36Sopenharmony_ci lea (%r11,%rax),%rdi 101662306a36Sopenharmony_ci mov %r9,%rcx 101762306a36Sopenharmony_ci rep movsb 101862306a36Sopenharmony_ci 101962306a36Sopenharmony_ci jmp .Ldone8 102062306a36Sopenharmony_ci 102162306a36Sopenharmony_ciSYM_FUNC_END(chacha_8block_xor_avx2) 1022