/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * pshufb control masks: ROT8 / ROT16 rotate every 32-bit word of an SSE
 * register left by 8 / 16 bits.  CTRINC holds the 32-bit words 0, 1, 2, 3
 * and is added to the block counter row in the four-block function.
 */
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.  Each loop iteration performs a double
 * round and subtracts 2 from %r8d, so the count must be even and non-zero
 * for the loop to terminate as intended.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

/*
 * chacha_block_xor_ssse3 - XOR up to one block of data with ChaCha keystream
 *
 * Loads the 64-byte state from (%rdi), permutes it with chacha_permute, adds
 * the original state back in and XORs the result with the input, 16 bytes at
 * a time.  A trailing chunk shorter than 16 bytes is handled through a bounce
 * buffer on the stack (.Lxorpart).  The state at (%rdi) is only read.
 */
SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	# keep a copy of the input state for the final feed-forward addition
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	# %rax = length; %rcx is needed later as the rep movsb counter
	mov		%rcx,%rax
	call		chacha_permute

	# At every branch to .Lxorpart, %xmm0 holds the keystream for the
	# 16-byte chunk that is only partially covered by the length.  The
	# length is a byte count, hence the unsigned compares.

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

.Lxorpart:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes (len % 16); nothing to do if zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	# %rax = offset of the partial chunk (len rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11
	mov		%rsi,%r11

	# remember the stack pointer in %r10, then carve out an aligned
	# 16-byte bounce buffer
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the bounce buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream; only the first %r9 bytes
	# of the result are copied out below
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	# copy the processed bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the stack pointer saved in %r10
	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

/*
 * hchacha_block_ssse3 - permute a state and output its first and last rows
 *
 * Loads the 64-byte state from (%rdi), runs chacha_permute on it and stores
 * rows 0 and 3 of the permuted state (8 32-bit words) to (%rsi).  Unlike
 * chacha_block_xor_ssse3, the original state is not added back in.
 */
SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	# chacha_permute expects the round count in %r8d
	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

/*
 * chacha_4block_xor_ssse3 - XOR up to four blocks of data with ChaCha keystream
 *
 * State words x0..x3 live in a 64-byte aligned scratch area on the stack,
 * x4..x15 live in %xmm4-%xmm15.  %xmm0 is scratch, %xmm1 holds CTRINC and
 * %xmm2/%xmm3 hold the ROT8/ROT16 shuffle masks for the duration of the
 * round loop.  The state at (%rdi) is only read.
 */
SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For final XORing step
	# we transpose the matrix by interleaving 32- and then 64-bit words,
	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
	# done with the slightly better performing SSSE3 byte shuffling,
	# 7/12-bit word rotation uses traditional shift+OR.

	# remember the stack pointer in %r10 and reserve a 64-byte aligned
	# scratch area; %rax = length (%rcx is reused by rep movsb later)
	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	# (%xmm1 still holds CTRINC; it is not written inside the round loop)
	paddd		%xmm1,%xmm12

	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	# At every branch to .Lxorpart4, %xmm0 holds the keystream for the
	# 16-byte chunk that is only partially covered by the length.  The
	# length is a byte count, hence the unsigned compares.
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jb		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jb		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jb		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jb		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jb		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jb		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jb		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jb		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jb		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jb		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	# restore the stack pointer saved in %r10 at function entry
	lea		-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# %r9 = number of trailing bytes (len % 16); nothing to do if zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	# %rax = offset of the partial chunk (len rounded down to 16)
	and		$~0x0f,%rax

	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11
	mov		%rsi,%r11

	# The scratch area at (%rsp) is reused as bounce buffer; the
	# keystream for this chunk is already in %xmm0.
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream; only the first %r9 bytes
	# of the result are copied out below
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)