162306a36Sopenharmony_ci/* SPDX-License-Identifier: GPL-2.0-only */
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Copyright 2023 Linus Torvalds <torvalds@linux-foundation.org>
462306a36Sopenharmony_ci */
562306a36Sopenharmony_ci
662306a36Sopenharmony_ci#include <linux/linkage.h>
762306a36Sopenharmony_ci#include <asm/asm.h>
862306a36Sopenharmony_ci#include <asm/export.h>
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci/*
1162306a36Sopenharmony_ci * copy_user_nocache - Uncached memory copy with exception handling
1262306a36Sopenharmony_ci *
1362306a36Sopenharmony_ci * This copies from user space into kernel space, but the kernel
1462306a36Sopenharmony_ci * space accesses can take a machine check exception, so they too
1562306a36Sopenharmony_ci * need exception handling.
1662306a36Sopenharmony_ci *
1762306a36Sopenharmony_ci * Note: only 32-bit and 64-bit stores have non-temporal versions,
1862306a36Sopenharmony_ci * and we only use aligned versions. Any unaligned parts at the
1962306a36Sopenharmony_ci * start or end of the copy will be done using normal cached stores.
2062306a36Sopenharmony_ci *
2162306a36Sopenharmony_ci * Input:
2262306a36Sopenharmony_ci * rdi destination
2362306a36Sopenharmony_ci * rsi source
2462306a36Sopenharmony_ci * edx count
2562306a36Sopenharmony_ci *
2662306a36Sopenharmony_ci * Output:
2762306a36Sopenharmony_ci * rax uncopied bytes or 0 if successful.
2862306a36Sopenharmony_ci */
2962306a36Sopenharmony_ciSYM_FUNC_START(__copy_user_nocache)
3062306a36Sopenharmony_ci	/* If destination is not 8-byte aligned, we'll have to align it */
3162306a36Sopenharmony_ci	testb $7,%dil
3262306a36Sopenharmony_ci	jne .Lalign
3362306a36Sopenharmony_ci
3462306a36Sopenharmony_ci.Lis_aligned:
	/* Destination is 8-byte aligned: bulk-copy 64 bytes per iteration */
3562306a36Sopenharmony_ci	cmp $64,%edx
3662306a36Sopenharmony_ci	jb .Lquadwords
3762306a36Sopenharmony_ci
3862306a36Sopenharmony_ci	.p2align 4,0x90
3962306a36Sopenharmony_ci.Lunrolled:
4062306a36Sopenharmony_ci10:	movq (%rsi),%r8
4162306a36Sopenharmony_ci11:	movq 8(%rsi),%r9
4262306a36Sopenharmony_ci12:	movq 16(%rsi),%r10
4362306a36Sopenharmony_ci13:	movq 24(%rsi),%r11
4462306a36Sopenharmony_ci20:	movnti %r8,(%rdi)
4562306a36Sopenharmony_ci21:	movnti %r9,8(%rdi)
4662306a36Sopenharmony_ci22:	movnti %r10,16(%rdi)
4762306a36Sopenharmony_ci23:	movnti %r11,24(%rdi)
4862306a36Sopenharmony_ci30:	movq 32(%rsi),%r8
4962306a36Sopenharmony_ci31:	movq 40(%rsi),%r9
5062306a36Sopenharmony_ci32:	movq 48(%rsi),%r10
5162306a36Sopenharmony_ci33:	movq 56(%rsi),%r11
5262306a36Sopenharmony_ci40:	movnti %r8,32(%rdi)
5362306a36Sopenharmony_ci41:	movnti %r9,40(%rdi)
5462306a36Sopenharmony_ci42:	movnti %r10,48(%rdi)
5562306a36Sopenharmony_ci43:	movnti %r11,56(%rdi)
5662306a36Sopenharmony_ci
5762306a36Sopenharmony_ci	addq $64,%rsi
5862306a36Sopenharmony_ci	addq $64,%rdi
5962306a36Sopenharmony_ci	sub $64,%edx
6062306a36Sopenharmony_ci	cmp $64,%edx
6162306a36Sopenharmony_ci	jae .Lunrolled
6262306a36Sopenharmony_ci
6362306a36Sopenharmony_ci/*
6462306a36Sopenharmony_ci * First set of user mode loads have been done
6562306a36Sopenharmony_ci * without any stores, so if they fail, we can
6662306a36Sopenharmony_ci * just try the non-unrolled loop.
6762306a36Sopenharmony_ci */
6862306a36Sopenharmony_ci_ASM_EXTABLE_UA(10b, .Lquadwords)
6962306a36Sopenharmony_ci_ASM_EXTABLE_UA(11b, .Lquadwords)
7062306a36Sopenharmony_ci_ASM_EXTABLE_UA(12b, .Lquadwords)
7162306a36Sopenharmony_ci_ASM_EXTABLE_UA(13b, .Lquadwords)
7262306a36Sopenharmony_ci
7362306a36Sopenharmony_ci/*
7462306a36Sopenharmony_ci * The second set of user mode loads have been
7562306a36Sopenharmony_ci * done with 32 bytes stored to the destination,
7662306a36Sopenharmony_ci * so we need to take that into account before
7762306a36Sopenharmony_ci * falling back to the unrolled loop.
7862306a36Sopenharmony_ci */
7962306a36Sopenharmony_ci_ASM_EXTABLE_UA(30b, .Lfixup32)
8062306a36Sopenharmony_ci_ASM_EXTABLE_UA(31b, .Lfixup32)
8162306a36Sopenharmony_ci_ASM_EXTABLE_UA(32b, .Lfixup32)
8262306a36Sopenharmony_ci_ASM_EXTABLE_UA(33b, .Lfixup32)
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci/*
8562306a36Sopenharmony_ci * An exception on a write means that we're
8662306a36Sopenharmony_ci * done, but we need to update the count
8762306a36Sopenharmony_ci * depending on where in the unrolled loop
8862306a36Sopenharmony_ci * we were.
8962306a36Sopenharmony_ci */
9062306a36Sopenharmony_ci_ASM_EXTABLE_UA(20b, .Ldone0)
9162306a36Sopenharmony_ci_ASM_EXTABLE_UA(21b, .Ldone8)
9262306a36Sopenharmony_ci_ASM_EXTABLE_UA(22b, .Ldone16)
9362306a36Sopenharmony_ci_ASM_EXTABLE_UA(23b, .Ldone24)
9462306a36Sopenharmony_ci_ASM_EXTABLE_UA(40b, .Ldone32)
9562306a36Sopenharmony_ci_ASM_EXTABLE_UA(41b, .Ldone40)
9662306a36Sopenharmony_ci_ASM_EXTABLE_UA(42b, .Ldone48)
9762306a36Sopenharmony_ci_ASM_EXTABLE_UA(43b, .Ldone56)
9862306a36Sopenharmony_ci
9962306a36Sopenharmony_ci.Lquadwords:
	/* Copy any remaining full quadwords, still uncached (movnti) */
10062306a36Sopenharmony_ci	cmp $8,%edx
10162306a36Sopenharmony_ci	jb .Llong
10262306a36Sopenharmony_ci50:	movq (%rsi),%rax
10362306a36Sopenharmony_ci51:	movnti %rax,(%rdi)
10462306a36Sopenharmony_ci	addq $8,%rsi
10562306a36Sopenharmony_ci	addq $8,%rdi
10662306a36Sopenharmony_ci	sub $8,%edx
10762306a36Sopenharmony_ci	jmp .Lquadwords
10862306a36Sopenharmony_ci
10962306a36Sopenharmony_ci/*
11062306a36Sopenharmony_ci * If we fail on the last full quadword, we will
11162306a36Sopenharmony_ci * not try to do any byte-wise cached accesses.
11262306a36Sopenharmony_ci * We will try to do one more 4-byte uncached
11362306a36Sopenharmony_ci * one, though.
11462306a36Sopenharmony_ci */
11562306a36Sopenharmony_ci_ASM_EXTABLE_UA(50b, .Llast4)
11662306a36Sopenharmony_ci_ASM_EXTABLE_UA(51b, .Ldone0)
11762306a36Sopenharmony_ci
11862306a36Sopenharmony_ci.Llong:
	/* Tail: at most 7 bytes left. A 4-byte piece can still go uncached */
11962306a36Sopenharmony_ci	test $4,%dl
12062306a36Sopenharmony_ci	je .Lword
12162306a36Sopenharmony_ci60:	movl (%rsi),%eax
12262306a36Sopenharmony_ci61:	movnti %eax,(%rdi)
12362306a36Sopenharmony_ci	addq $4,%rsi
12462306a36Sopenharmony_ci	addq $4,%rdi
12562306a36Sopenharmony_ci	sub $4,%edx
12662306a36Sopenharmony_ci.Lword:
	/* All non-temporal stores are behind us: fence them before the
	   final cached word/byte copies and before returning */
12762306a36Sopenharmony_ci	sfence
12862306a36Sopenharmony_ci	test $2,%dl
12962306a36Sopenharmony_ci	je .Lbyte
13062306a36Sopenharmony_ci70:	movw (%rsi),%ax
13162306a36Sopenharmony_ci71:	movw %ax,(%rdi)
13262306a36Sopenharmony_ci	addq $2,%rsi
13362306a36Sopenharmony_ci	addq $2,%rdi
13462306a36Sopenharmony_ci	sub $2,%edx
13562306a36Sopenharmony_ci.Lbyte:
13662306a36Sopenharmony_ci	test $1,%dl
13762306a36Sopenharmony_ci	je .Ldone
13862306a36Sopenharmony_ci80:	movb (%rsi),%al
13962306a36Sopenharmony_ci81:	movb %al,(%rdi)
14062306a36Sopenharmony_ci	dec %edx
14162306a36Sopenharmony_ci.Ldone:
	/* Return the number of bytes NOT copied (0 on full success) */
14262306a36Sopenharmony_ci	mov %edx,%eax
14362306a36Sopenharmony_ci	RET
14462306a36Sopenharmony_ci
14562306a36Sopenharmony_ci/*
14662306a36Sopenharmony_ci * If we fail on the last four bytes, we won't
14762306a36Sopenharmony_ci * bother with any fixups. It's dead, Jim. Note
14862306a36Sopenharmony_ci * that there's no need for 'sfence' for any
14962306a36Sopenharmony_ci * of this, since the exception will have been
15062306a36Sopenharmony_ci * serializing.
15162306a36Sopenharmony_ci */
15262306a36Sopenharmony_ci_ASM_EXTABLE_UA(60b, .Ldone)
15362306a36Sopenharmony_ci_ASM_EXTABLE_UA(61b, .Ldone)
15462306a36Sopenharmony_ci_ASM_EXTABLE_UA(70b, .Ldone)
15562306a36Sopenharmony_ci_ASM_EXTABLE_UA(71b, .Ldone)
15662306a36Sopenharmony_ci_ASM_EXTABLE_UA(80b, .Ldone)
15762306a36Sopenharmony_ci_ASM_EXTABLE_UA(81b, .Ldone)
15862306a36Sopenharmony_ci
15962306a36Sopenharmony_ci/*
16062306a36Sopenharmony_ci * This is the "head needs aligning" case when
16162306a36Sopenharmony_ci * the destination isn't 8-byte aligned. The
16262306a36Sopenharmony_ci * 4-byte case can be done uncached, but any
16362306a36Sopenharmony_ci * smaller alignment is done with regular stores.
16462306a36Sopenharmony_ci */
16562306a36Sopenharmony_ci.Lalign:
16662306a36Sopenharmony_ci	test $1,%dil
16762306a36Sopenharmony_ci	je .Lalign_word
16862306a36Sopenharmony_ci	test %edx,%edx
16962306a36Sopenharmony_ci	je .Ldone
17062306a36Sopenharmony_ci90:	movb (%rsi),%al
17162306a36Sopenharmony_ci91:	movb %al,(%rdi)
17262306a36Sopenharmony_ci	inc %rsi
17362306a36Sopenharmony_ci	inc %rdi
17462306a36Sopenharmony_ci	dec %edx
17562306a36Sopenharmony_ci.Lalign_word:
17662306a36Sopenharmony_ci	test $2,%dil
17762306a36Sopenharmony_ci	je .Lalign_long
17862306a36Sopenharmony_ci	cmp $2,%edx
17962306a36Sopenharmony_ci	jb .Lbyte
18062306a36Sopenharmony_ci92:	movw (%rsi),%ax
18162306a36Sopenharmony_ci93:	movw %ax,(%rdi)
18262306a36Sopenharmony_ci	addq $2,%rsi
18362306a36Sopenharmony_ci	addq $2,%rdi
18462306a36Sopenharmony_ci	sub $2,%edx
18562306a36Sopenharmony_ci.Lalign_long:
18662306a36Sopenharmony_ci	test $4,%dil
18762306a36Sopenharmony_ci	je .Lis_aligned
18862306a36Sopenharmony_ci	cmp $4,%edx
18962306a36Sopenharmony_ci	jb .Lword
19062306a36Sopenharmony_ci94:	movl (%rsi),%eax
19162306a36Sopenharmony_ci95:	movnti %eax,(%rdi)
19262306a36Sopenharmony_ci	addq $4,%rsi
19362306a36Sopenharmony_ci	addq $4,%rdi
19462306a36Sopenharmony_ci	sub $4,%edx
19562306a36Sopenharmony_ci	jmp .Lis_aligned
19662306a36Sopenharmony_ci
19762306a36Sopenharmony_ci/*
19862306a36Sopenharmony_ci * If we fail on the initial alignment accesses,
19962306a36Sopenharmony_ci * we're all done. Again, no point in trying to
20062306a36Sopenharmony_ci * do byte-by-byte probing if the 4-byte load
20162306a36Sopenharmony_ci * fails - we're not doing any uncached accesses
20262306a36Sopenharmony_ci * any more.
20362306a36Sopenharmony_ci */
20462306a36Sopenharmony_ci_ASM_EXTABLE_UA(90b, .Ldone)
20562306a36Sopenharmony_ci_ASM_EXTABLE_UA(91b, .Ldone)
20662306a36Sopenharmony_ci_ASM_EXTABLE_UA(92b, .Ldone)
20762306a36Sopenharmony_ci_ASM_EXTABLE_UA(93b, .Ldone)
20862306a36Sopenharmony_ci_ASM_EXTABLE_UA(94b, .Ldone)
20962306a36Sopenharmony_ci_ASM_EXTABLE_UA(95b, .Ldone)
21062306a36Sopenharmony_ci
21162306a36Sopenharmony_ci/*
21262306a36Sopenharmony_ci * Exception table fixups for faults in the middle.
 * Fall-through chain: entry at .LdoneNN means NN bytes of the current
 * 64-byte chunk were already stored, so each label below subtracts 8
 * from the remaining count on the way down to .Ldone0.
21362306a36Sopenharmony_ci */
21462306a36Sopenharmony_ci.Ldone56: sub $8,%edx
21562306a36Sopenharmony_ci.Ldone48: sub $8,%edx
21662306a36Sopenharmony_ci.Ldone40: sub $8,%edx
21762306a36Sopenharmony_ci.Ldone32: sub $8,%edx
21862306a36Sopenharmony_ci.Ldone24: sub $8,%edx
21962306a36Sopenharmony_ci.Ldone16: sub $8,%edx
22062306a36Sopenharmony_ci.Ldone8: sub $8,%edx
22162306a36Sopenharmony_ci.Ldone0:
22262306a36Sopenharmony_ci	mov %edx,%eax
22362306a36Sopenharmony_ci	RET
22462306a36Sopenharmony_ci
22562306a36Sopenharmony_ci.Lfixup32:
	/* A load in the second half faulted: the first 32 bytes of this
	   chunk were already copied, so account for them and retry the
	   rest via the non-unrolled loop */
22662306a36Sopenharmony_ci	addq $32,%rsi
22762306a36Sopenharmony_ci	addq $32,%rdi
22862306a36Sopenharmony_ci	sub $32,%edx
22962306a36Sopenharmony_ci	jmp .Lquadwords
23062306a36Sopenharmony_ci
23162306a36Sopenharmony_ci.Llast4:
	/* Quadword load faulted: attempt one final 4-byte uncached copy,
	   then fence and give up, reporting whatever remains */
23262306a36Sopenharmony_ci52:	movl (%rsi),%eax
23362306a36Sopenharmony_ci53:	movnti %eax,(%rdi)
23462306a36Sopenharmony_ci	sfence
23562306a36Sopenharmony_ci	sub $4,%edx
23662306a36Sopenharmony_ci	mov %edx,%eax
23762306a36Sopenharmony_ci	RET
23862306a36Sopenharmony_ci_ASM_EXTABLE_UA(52b, .Ldone0)
23962306a36Sopenharmony_ci_ASM_EXTABLE_UA(53b, .Ldone0)
24062306a36Sopenharmony_ci
24162306a36Sopenharmony_ciSYM_FUNC_END(__copy_user_nocache)
24262306a36Sopenharmony_ciEXPORT_SYMBOL(__copy_user_nocache)
243