1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * sha3-ce-core.S - core SHA-3 transform using v8.2 Crypto Extensions
4  *
5  * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License version 2 as
9  * published by the Free Software Foundation.
10  */
11 
12 #include <linux/linkage.h>
13 #include <asm/assembler.h>
14 
15 	.irp	b,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
16 	.set	.Lv\b\().2d, \b
17 	.set	.Lv\b\().16b, \b
18 	.endr
19 
20 	/*
21 	 * ARMv8.2 Crypto Extensions instructions
22 	 */
23 	.macro	eor3, rd, rn, rm, ra
24 	.inst	0xce000000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
25 	.endm
26 
27 	.macro	rax1, rd, rn, rm
28 	.inst	0xce608c00 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
29 	.endm
30 
31 	.macro	bcax, rd, rn, rm, ra
32 	.inst	0xce200000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
33 	.endm
34 
35 	.macro	xar, rd, rn, rm, imm6
36 	.inst	0xce800000 | .L\rd | (.L\rn << 5) | ((\imm6) << 10) | (.L\rm << 16)
37 	.endm
38 
39 	/*
40 	 * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
41 	 */
42 	.text
/*
 * sha3_ce_transform - absorb input blocks into a SHA-3 state, running the
 * 24-round permutation after each block.
 *
 * In:
 *   x0 = st       pointer to the state: 25 lanes of 64 bits (200 bytes)
 *   x1 = data     input; one rate-sized block is consumed per iteration
 *   w2 = blocks   number of blocks to absorb
 *   w3 = dg_size  selects the block size via bits 6, 4 and 2 (tested on x3):
 *                   bit 6 set                  -> SHA3-512,  72-byte blocks
 *                   bit 6 clear, bit 4 clear   -> SHA3-256, 136-byte blocks
 *                   bit 4 set,   bit 2 clear   -> SHA3-384, 104-byte blocks
 *                   bit 4 set,   bit 2 set     -> SHA3-224, 144-byte blocks
 * Out:
 *   w0 = number of blocks that were NOT processed. This is 0 unless
 *        cond_yield diverted to the exit path early; the state is written
 *        back in both cases. It is also 0 when blocks == 0 on entry, in
 *        which case neither the state nor the input is touched.
 *
 * Register use:
 *   v0-v24   state lanes 0..24 (only the low 64 bits are loaded and stored)
 *   v25-v31  temporaries (input words, column parities, round constant)
 *   w8       round counter, 24 down to 0
 *   x9       pointer into .Lsha3_rcon, advanced by 8 per round
 *   x0, x1   advanced by the post-indexed stores / loads
 *
 * NOTE(review): v8-v15 are overwritten without being saved, so this relies
 * on the caller having made the whole SIMD register file available
 * (presumably via kernel_neon_begin()) - confirm against the caller.
 */
SYM_FUNC_START(sha3_ce_transform)
	/*
	 * Nothing to do for an empty request. Without this check the
	 * 'sub w2, w2, #1' at label 0 would wrap to 0xffffffff, and since the
	 * only loop test is 'cbnz w2, 0b' the routine would go on to consume
	 * input far beyond the caller's buffer.
	 */
	cbz	w2, 5f

	/* load state: lanes 0..3 via x0, lanes 4..24 via the x8 cursor */
	add	x8, x0, #32
	ld1	{ v0.1d- v3.1d}, [x0]
	ld1	{ v4.1d- v7.1d}, [x8], #32
	ld1	{ v8.1d-v11.1d}, [x8], #32
	ld1	{v12.1d-v15.1d}, [x8], #32
	ld1	{v16.1d-v19.1d}, [x8], #32
	ld1	{v20.1d-v23.1d}, [x8], #32
	ld1	{v24.1d}, [x8]

	/*
	 * Per-block loop. w8 and x9 are (re)initialised here on every block,
	 * which is also what makes them free to serve as scratch registers
	 * for cond_yield at the bottom of the loop.
	 */
0:	sub	w2, w2, #1
	mov	w8, #24
	adr_l	x9, .Lsha3_rcon

	/* load input: the first 7 lanes (56 bytes) are common to all variants */
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b-v31.8b}, [x1], #24
	eor	v0.8b, v0.8b, v25.8b
	eor	v1.8b, v1.8b, v26.8b
	eor	v2.8b, v2.8b, v27.8b
	eor	v3.8b, v3.8b, v28.8b
	eor	v4.8b, v4.8b, v29.8b
	eor	v5.8b, v5.8b, v30.8b
	eor	v6.8b, v6.8b, v31.8b

	tbnz	x3, #6, 2f		// SHA3-512

	/* lanes 7..12: brings the total to 13 lanes (104 bytes) */
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b-v30.8b}, [x1], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b
	eor	 v9.8b,  v9.8b, v27.8b
	eor	v10.8b, v10.8b, v28.8b
	eor	v11.8b, v11.8b, v29.8b
	eor	v12.8b, v12.8b, v30.8b

	tbnz	x3, #4, 1f		// SHA3-384 or SHA3-224

	// SHA3-256: lanes 13..16, 17 lanes (136 bytes) in total
	ld1	{v25.8b-v28.8b}, [x1], #32
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	b	3f

1:	tbz	x3, #2, 3f		// bit 2 cleared? SHA-384

	// SHA3-224: lanes 13..17, 18 lanes (144 bytes) in total
	ld1	{v25.8b-v28.8b}, [x1], #32
	ld1	{v29.8b}, [x1], #8
	eor	v13.8b, v13.8b, v25.8b
	eor	v14.8b, v14.8b, v26.8b
	eor	v15.8b, v15.8b, v27.8b
	eor	v16.8b, v16.8b, v28.8b
	eor	v17.8b, v17.8b, v29.8b
	b	3f

	// SHA3-512: lanes 7..8, 9 lanes (72 bytes) in total
2:	ld1	{v25.8b-v26.8b}, [x1], #16
	eor	 v7.8b,  v7.8b, v25.8b
	eor	 v8.8b,  v8.8b, v26.8b

	/* Round loop: w8 counts the rounds still to run after this one. */
3:	sub	w8, w8, #1

	/*
	 * Theta, step 1: column parities. After these ten instructions
	 * v25..v29 hold the XOR of the five lanes of columns 0..4
	 * (v25 = v0^v5^v10^v15^v20, ..., v29 = v4^v9^v14^v19^v24).
	 */
	eor3	v29.16b,  v4.16b,  v9.16b, v14.16b
	eor3	v26.16b,  v1.16b,  v6.16b, v11.16b
	eor3	v28.16b,  v3.16b,  v8.16b, v13.16b
	eor3	v25.16b,  v0.16b,  v5.16b, v10.16b
	eor3	v27.16b,  v2.16b,  v7.16b, v12.16b
	eor3	v29.16b, v29.16b, v19.16b, v24.16b
	eor3	v26.16b, v26.16b, v16.16b, v21.16b
	eor3	v28.16b, v28.16b, v18.16b, v23.16b
	eor3	v25.16b, v25.16b, v15.16b, v20.16b
	eor3	v27.16b, v27.16b, v17.16b, v22.16b

	/*
	 * Theta, step 2: per-column mixing values, parity[x-1] ^ rol(parity[x+1], 1).
	 * The order matters: every parity register is read as a rotate source
	 * before it is overwritten, and v29 (column 4) is kept intact until
	 * the last line by writing the first result to v30 instead.
	 */
	rax1	v30.2d, v29.2d, v26.2d	// bc[0]
	rax1	v26.2d, v26.2d, v28.2d	// bc[2]
	rax1	v28.2d, v28.2d, v25.2d	// bc[4]
	rax1	v25.2d, v25.2d, v27.2d	// bc[1]
	rax1	v27.2d, v27.2d, v29.2d	// bc[3]

	/*
	 * Theta step 3 fused with rho and pi: each xar XORs a lane with its
	 * column's mixing value and rotates it left by r (written as a
	 * rotate-right by 64 - r). Lane 0 is not rotated, hence the plain eor.
	 * Results go either straight into the register of their new position
	 * or into a temporary (v25-v31, v8) that the chi step below consumes.
	 * The sequence is a dependency chain - a register is only overwritten
	 * once its old value has been read - so it must not be reordered.
	 */
	eor	 v0.16b,  v0.16b, v30.16b
	xar	 v29.2d,   v1.2d,  v25.2d, (64 - 1)
	xar	  v1.2d,   v6.2d,  v25.2d, (64 - 44)
	xar	  v6.2d,   v9.2d,  v28.2d, (64 - 20)
	xar	  v9.2d,  v22.2d,  v26.2d, (64 - 61)
	xar	 v22.2d,  v14.2d,  v28.2d, (64 - 39)
	xar	 v14.2d,  v20.2d,  v30.2d, (64 - 18)
	xar	 v31.2d,   v2.2d,  v26.2d, (64 - 62)
	xar	  v2.2d,  v12.2d,  v26.2d, (64 - 43)
	xar	 v12.2d,  v13.2d,  v27.2d, (64 - 25)
	xar	 v13.2d,  v19.2d,  v28.2d, (64 - 8)
	xar	 v19.2d,  v23.2d,  v27.2d, (64 - 56)
	xar	 v23.2d,  v15.2d,  v30.2d, (64 - 41)
	xar	 v15.2d,   v4.2d,  v28.2d, (64 - 27)
	xar	 v28.2d,  v24.2d,  v28.2d, (64 - 14)
	xar	 v24.2d,  v21.2d,  v25.2d, (64 - 2)
	xar	  v8.2d,   v8.2d,  v27.2d, (64 - 55)
	xar	  v4.2d,  v16.2d,  v25.2d, (64 - 45)
	xar	 v16.2d,   v5.2d,  v30.2d, (64 - 36)
	xar	  v5.2d,   v3.2d,  v27.2d, (64 - 28)
	xar	 v27.2d,  v18.2d,  v27.2d, (64 - 21)
	xar	  v3.2d,  v17.2d,  v26.2d, (64 - 15)
	xar	 v25.2d,  v11.2d,  v25.2d, (64 - 10)
	xar	 v26.2d,   v7.2d,  v26.2d, (64 - 6)
	xar	 v30.2d,  v10.2d,  v30.2d, (64 - 3)

	/*
	 * Chi, one row of five lanes per group: out[x] = in[x] ^ (in[x+2] & ~in[x+1]).
	 * Rows are produced from lanes 20..24 down to lanes 0..4.
	 */
	bcax	v20.16b, v31.16b, v22.16b,  v8.16b
	bcax	v21.16b,  v8.16b, v23.16b, v22.16b
	bcax	v22.16b, v22.16b, v24.16b, v23.16b
	bcax	v23.16b, v23.16b, v31.16b, v24.16b
	bcax	v24.16b, v24.16b,  v8.16b, v31.16b

	/* v31 is dead from here on: fetch this round's constant into it */
	ld1r	{v31.2d}, [x9], #8

	bcax	v17.16b, v25.16b, v19.16b,  v3.16b
	bcax	v18.16b,  v3.16b, v15.16b, v19.16b
	bcax	v19.16b, v19.16b, v16.16b, v15.16b
	bcax	v15.16b, v15.16b, v25.16b, v16.16b
	bcax	v16.16b, v16.16b,  v3.16b, v25.16b

	bcax	v10.16b, v29.16b, v12.16b, v26.16b
	bcax	v11.16b, v26.16b, v13.16b, v12.16b
	bcax	v12.16b, v12.16b, v14.16b, v13.16b
	bcax	v13.16b, v13.16b, v29.16b, v14.16b
	bcax	v14.16b, v14.16b, v26.16b, v29.16b

	bcax	 v7.16b, v30.16b,  v9.16b,  v4.16b
	bcax	 v8.16b,  v4.16b,  v5.16b,  v9.16b
	bcax	 v9.16b,  v9.16b,  v6.16b,  v5.16b
	bcax	 v5.16b,  v5.16b, v30.16b,  v6.16b
	bcax	 v6.16b,  v6.16b,  v4.16b, v30.16b

	bcax	 v3.16b, v27.16b,  v0.16b, v28.16b
	bcax	 v4.16b, v28.16b,  v1.16b,  v0.16b
	bcax	 v0.16b,  v0.16b,  v2.16b,  v1.16b
	bcax	 v1.16b,  v1.16b, v27.16b,  v2.16b
	bcax	 v2.16b,  v2.16b, v28.16b, v27.16b

	/* Iota: fold the round constant into lane 0 */
	eor	 v0.16b,  v0.16b, v31.16b

	cbnz	w8, 3b

	/*
	 * Block done. cond_yield comes from <asm/assembler.h> and is not
	 * defined in this file; it is handed 4f as the label to branch to when
	 * it decides to stop early, with x8/x9 as scratch. Otherwise carry on
	 * with the next block while any remain.
	 */
	cond_yield 4f, x8, x9
	cbnz	w2, 0b

	/* save state */
4:	st1	{ v0.1d- v3.1d}, [x0], #32
	st1	{ v4.1d- v7.1d}, [x0], #32
	st1	{ v8.1d-v11.1d}, [x0], #32
	st1	{v12.1d-v15.1d}, [x0], #32
	st1	{v16.1d-v19.1d}, [x0], #32
	st1	{v20.1d-v23.1d}, [x0], #32
	st1	{v24.1d}, [x0]

	/* return the number of blocks left unprocessed */
5:	mov	w0, w2
	ret
SYM_FUNC_END(sha3_ce_transform)
201 
202 	.section	".rodata", "a"
203 	.align		8
204 .Lsha3_rcon:
205 	.quad	0x0000000000000001, 0x0000000000008082, 0x800000000000808a
206 	.quad	0x8000000080008000, 0x000000000000808b, 0x0000000080000001
207 	.quad	0x8000000080008081, 0x8000000000008009, 0x000000000000008a
208 	.quad	0x0000000000000088, 0x0000000080008009, 0x000000008000000a
209 	.quad	0x000000008000808b, 0x800000000000008b, 0x8000000000008089
210 	.quad	0x8000000000008003, 0x8000000000008002, 0x8000000000000080
211 	.quad	0x000000000000800a, 0x800000008000000a, 0x8000000080008081
212 	.quad	0x8000000000008080, 0x0000000080000001, 0x8000000080008008
213