1 // Copyright 2016, VIXL authors
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are met:
6 //
7 // * Redistributions of source code must retain the above copyright notice,
8 // this list of conditions and the following disclaimer.
9 // * Redistributions in binary form must reproduce the above copyright notice,
10 // this list of conditions and the following disclaimer in the documentation
11 // and/or other materials provided with the distribution.
12 // * Neither the name of ARM Limited nor the names of its contributors may be
13 // used to endorse or promote products derived from this software without
14 // specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17 // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 // WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27 #include <cfloat>
28 #include <cmath>
29 #include <cstdio>
30 #include <cstdlib>
31 #include <cstring>
32 #include <fstream>
33 #include <regex>
34
35 #include "test-runner.h"
36
37 #include "aarch64/cpu-aarch64.h"
38 #include "aarch64/disasm-aarch64.h"
39 #include "aarch64/macro-assembler-aarch64.h"
40 #include "aarch64/simulator-aarch64.h"
41 #include "test-utils-aarch64.h"
42
43 namespace vixl {
44 namespace aarch64 {
45
// Shorthand used by the generator functions in this file: `__ insn(...)`
// expands to `masm->insn(...)`, so a variable named `masm` must be in scope
// wherever it is used.
46 #define __ masm->
// Forwards to TEST_ with the test name prefixed by TRACE_. TEST_ is not defined
// in this part of the file; presumably it comes from test-runner.h - confirm.
47 #define TEST(name) TEST_(TRACE_##name)
48
// Builds a path inside the reference-trace directory: the directory literal and
// the `name` string literal are adjacent, so the compiler concatenates them.
49 #define REF(name) "test/test-trace-reference/" name
50
// Emits the "base" A64 instruction sequence (integer data processing, system
// instructions, loads/stores and branches) into `masm`.
//
// masm: assembler that receives the instructions. Each `__ insn(...)` line
//       below expands to `masm->insn(...)` through the `__` macro.
//
// The main list is written in roughly alphabetical order of mnemonic and is
// followed by a few regression cases and two small branch tests.
//
// Register usage visible in this body:
//   - x0 is only used as an address operand (dc/ic and un-indexed loads and
//     stores, optionally with an immediate offset); nothing here writes it.
//   - x1 is the base register of the pre-/post-indexed loads and stores in
//     the main list.
//   - sp is the base register of the regression cases, each of which pairs a
//     -16 pre-indexed store with a +16 post-indexed load.
//
// NOTE(review): presumably the caller points x0 and x1 at valid scratch memory
// before this code runs, and the emitted order is presumably matched against
// reference traces (see REF) - confirm both against the call sites before
// reordering or editing any instruction.
GenerateTestSequenceBase(MacroAssembler* masm)51 static void GenerateTestSequenceBase(MacroAssembler* masm) {
// The scope is sized with all bytes still free in the buffer, so it covers
// everything emitted by this function.
// NOTE(review): kMaximumSize presumably marks that size as an upper bound
// rather than an exact count - confirm in the ExactAssemblyScope declaration.
52 ExactAssemblyScope guard(masm,
53 masm->GetBuffer()->GetRemainingBytes(),
54 ExactAssemblyScope::kMaximumSize);
55
56 __ adc(w3, w4, w5);
57 __ adc(x6, x7, x8);
58 __ adcs(w9, w10, w11);
59 __ adcs(x12, x13, x14);
60 __ add(w15, w16, w17);
61 __ add(x18, x19, x20);
62 __ adds(w21, w22, w23);
63 __ adds(x24, x25, x26);
64 __ and_(w27, w28, w29);
65 __ and_(x2, x3, x4);
66 __ ands(w5, w6, w7);
67 __ ands(x8, x9, x10);
68 __ asr(w11, w12, 0);
69 __ asr(x13, x14, 1);
70 __ asrv(w15, w16, w17);
71 __ asrv(x18, x19, x20);
72 __ bfm(w21, w22, 5, 6);
73 __ bfm(x23, x24, 7, 8);
74 __ bic(w25, w26, w27);
75 __ bic(x28, x29, x2);
76 __ bics(w3, w4, w5);
77 __ bics(x6, x7, x8);
// Conditional compares and selects are emitted once per condition code so
// that the same operands are exercised under several conditions.
78 __ ccmn(w9, w10, NoFlag, al);
79 __ ccmn(w9, w10, NoFlag, eq);
80 __ ccmn(w9, w10, NoFlag, ne);
81 __ ccmn(x11, x12, CFlag, al);
82 __ ccmn(x11, x12, CFlag, cc);
83 __ ccmn(x11, x12, CFlag, cs);
84 __ ccmp(w13, w14, VFlag, al);
85 __ ccmp(w13, w14, VFlag, hi);
86 __ ccmp(w13, w14, VFlag, ls);
87 __ ccmp(x15, x16, CVFlag, al);
88 __ ccmp(x15, x16, CVFlag, eq);
89 __ ccmp(x15, x16, CVFlag, ne);
90 __ cinc(w17, w18, cc);
91 __ cinc(w17, w18, cs);
92 __ cinc(x19, x20, hi);
93 __ cinc(x19, x20, ls);
94 __ cinv(w21, w22, eq);
95 __ cinv(w21, w22, ne);
96 __ cinv(x23, x24, cc);
97 __ cinv(x23, x24, cs);
98 __ clrex();
99 __ cls(w25, w26);
100 __ cls(x27, x28);
101 __ clz(w29, w2);
102 __ clz(x3, x4);
103 __ cmn(w5, w6);
104 __ cmn(x7, x8);
105 __ cmp(w9, w10);
106 __ cmp(x11, x12);
107 __ cneg(w13, w14, hi);
108 __ cneg(w13, w14, ls);
109 __ cneg(x15, x16, eq);
110 __ cneg(x15, x16, ne);
111 __ crc32b(w17, w18, w19);
112 __ crc32cb(w20, w21, w22);
113 __ crc32ch(w23, w24, w25);
114 __ crc32cw(w26, w27, w28);
115 __ crc32h(w4, w5, w6);
116 __ crc32w(w7, w8, w9);
117 __ csel(w13, w14, w15, cc);
118 __ csel(w13, w14, w15, cs);
119 __ csel(x16, x17, x18, hi);
120 __ csel(x16, x17, x18, ls);
121 __ cset(w19, eq);
122 __ cset(w19, ne);
123 __ cset(x20, cc);
124 __ cset(x20, cs);
125 __ csetm(w21, hi);
126 __ csetm(w21, ls);
127 __ csetm(x22, eq);
128 __ csetm(x22, ne);
129 __ csinc(w23, w24, w25, cc);
130 __ csinc(w23, w24, w25, cs);
131 __ csinc(x26, x27, x28, hi);
132 __ csinc(x26, x27, x28, ls);
133 __ csinv(w29, w2, w3, eq);
134 __ csinv(w29, w2, w3, ne);
135 __ csinv(x4, x5, x6, cc);
136 __ csinv(x4, x5, x6, cs);
137 __ csneg(w7, w8, w9, hi);
138 __ csneg(w7, w8, w9, ls);
139 __ csneg(x10, x11, x12, eq);
140 __ csneg(x10, x11, x12, ne);
141 __ dc(CVAC, x0);
142 __ dmb(InnerShareable, BarrierAll);
143 __ dsb(InnerShareable, BarrierAll);
144 __ eon(w13, w14, w15);
145 __ eon(x16, x17, x18);
146 __ eor(w19, w20, w21);
147 __ eor(x22, x23, x24);
148 __ extr(w25, w26, w27, 9);
149 __ extr(x28, x29, x2, 10);
150 __ hint(NOP);
151 __ ic(IVAU, x0);
152 __ isb();
// Loads: un-indexed forms use x0 as the base; pre-/post-indexed forms use x1
// with an immediate equal to the size in bytes of the data transferred.
153 __ ldar(w3, MemOperand(x0));
154 __ ldar(x4, MemOperand(x0));
155 __ ldarb(w5, MemOperand(x0));
156 __ ldarb(x6, MemOperand(x0));
157 __ ldarh(w7, MemOperand(x0));
158 __ ldarh(x8, MemOperand(x0));
159 __ ldaxp(w9, w10, MemOperand(x0));
160 __ ldaxp(x11, x12, MemOperand(x0));
161 __ ldaxr(w13, MemOperand(x0));
162 __ ldaxr(x14, MemOperand(x0));
163 __ ldaxrb(w15, MemOperand(x0));
164 __ ldaxrb(x16, MemOperand(x0));
165 __ ldaxrh(w17, MemOperand(x0));
166 __ ldaxrh(x18, MemOperand(x0));
167 __ ldnp(w19, w20, MemOperand(x0));
168 __ ldnp(x21, x22, MemOperand(x0));
169 __ ldp(w23, w24, MemOperand(x0));
170 __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
171 __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
172 __ ldp(x25, x26, MemOperand(x0));
173 __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
174 __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
175 __ ldpsw(x27, x28, MemOperand(x0));
176 __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
177 __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
178 __ ldr(w29, MemOperand(x0));
179 __ ldr(w29, MemOperand(x1, 4, PostIndex));
180 __ ldr(w29, MemOperand(x1, 4, PreIndex));
181 __ ldr(x2, MemOperand(x0));
182 __ ldr(x2, MemOperand(x1, 8, PostIndex));
183 __ ldr(x2, MemOperand(x1, 8, PreIndex));
184 __ ldrb(w3, MemOperand(x0));
185 __ ldrb(w3, MemOperand(x1, 1, PostIndex));
186 __ ldrb(w3, MemOperand(x1, 1, PreIndex));
187 __ ldrb(x4, MemOperand(x0));
188 __ ldrb(x4, MemOperand(x1, 1, PostIndex));
189 __ ldrb(x4, MemOperand(x1, 1, PreIndex));
190 __ ldrh(w5, MemOperand(x0));
191 __ ldrh(w5, MemOperand(x1, 2, PostIndex));
192 __ ldrh(w5, MemOperand(x1, 2, PreIndex));
193 __ ldrh(x6, MemOperand(x0));
194 __ ldrh(x6, MemOperand(x1, 2, PostIndex));
195 __ ldrh(x6, MemOperand(x1, 2, PreIndex));
196 __ ldrsb(w7, MemOperand(x0));
197 __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
198 __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
199 __ ldrsb(x8, MemOperand(x0));
200 __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
201 __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
202 __ ldrsh(w9, MemOperand(x0));
203 __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
204 __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
205 __ ldrsh(x10, MemOperand(x0));
206 __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
207 __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
208 __ ldrsw(x11, MemOperand(x0));
209 __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
210 __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
// The unscaled (ldur*) forms use offsets that are not a multiple of the
// access size.
211 __ ldur(w12, MemOperand(x0, 7));
212 __ ldur(x13, MemOperand(x0, 15));
213 __ ldurb(w14, MemOperand(x0, 1));
214 __ ldurb(x15, MemOperand(x0, 1));
215 __ ldurh(w16, MemOperand(x0, 3));
216 __ ldurh(x17, MemOperand(x0, 3));
217 __ ldursb(w18, MemOperand(x0, 1));
218 __ ldursb(x19, MemOperand(x0, 1));
219 __ ldursh(w20, MemOperand(x0, 3));
220 __ ldursh(x21, MemOperand(x0, 3));
221 __ ldursw(x22, MemOperand(x0, 7));
222 __ ldxp(w23, w24, MemOperand(x0));
223 __ ldxp(x25, x26, MemOperand(x0));
224 __ ldxr(w27, MemOperand(x0));
225 __ ldxr(x28, MemOperand(x0));
226 __ ldxrb(w29, MemOperand(x0));
227 __ ldxrb(x2, MemOperand(x0));
228 __ ldxrh(w3, MemOperand(x0));
229 __ ldxrh(x4, MemOperand(x0));
230 __ lsl(w5, w6, 2);
231 __ lsl(x7, x8, 3);
232 __ lslv(w9, w10, w11);
233 __ lslv(x12, x13, x14);
234 __ lsr(w15, w16, 4);
235 __ lsr(x17, x18, 5);
236 __ lsrv(w19, w20, w21);
237 __ lsrv(x22, x23, x24);
238 __ madd(w25, w26, w27, w28);
239 __ madd(x29, x2, x3, x4);
240 __ mneg(w5, w6, w7);
241 __ mneg(x8, x9, x10);
242 __ mov(w11, w12);
243 __ mov(x13, x14);
244 __ movk(w15, 130);
245 __ movk(x16, 131);
246 __ movn(w17, 132);
247 __ movn(x18, 133);
248 __ movz(w19, 134);
249 __ movz(x20, 135);
250 __ msub(w22, w23, w24, w25);
251 __ msub(x26, x27, x28, x29);
252 __ mul(w2, w3, w4);
253 __ mul(x5, x6, x7);
254 __ mvn(w8, w9);
255 __ mvn(x10, x11);
256 __ neg(w12, w13);
257 __ neg(x14, x15);
258 __ negs(w16, w17);
259 __ negs(x18, x19);
260 __ ngc(w20, w21);
261 __ ngc(x22, x23);
262 __ ngcs(w24, w25);
263 __ ngcs(x26, x27);
264 __ nop();
265 __ orn(w28, w29, w2);
266 __ orn(x3, x4, x5);
267 __ orr(w6, w7, w8);
268 __ orr(x9, x10, x11);
269 __ prfm(PLDL1KEEP, MemOperand(x0, 4));
270 __ prfum(PLDL1KEEP, MemOperand(x0, 1));
271 __ rbit(w12, w13);
272 __ rbit(x14, x15);
273 __ rev(w16, w17);
274 __ rev(x18, x19);
275 __ rev16(w20, w21);
276 __ rev16(x22, x23);
277 __ rev32(x24, x25);
278 __ rorv(w26, w27, w28);
279 __ rorv(x29, x2, x3);
280 __ sbc(w4, w5, w6);
281 __ sbc(x7, x8, x9);
282 __ sbcs(w10, w11, w12);
283 __ sbcs(x13, x14, x15);
284 __ sbfiz(w16, w17, 2, 3);
285 __ sbfiz(x18, x19, 4, 5);
286 __ sbfx(w22, w23, 6, 7);
287 __ sbfx(x24, x25, 8, 9);
288 __ sdiv(w26, w27, w28);
289 __ sdiv(x29, x2, x3);
290 __ smulh(x12, x13, x14);
// Stores mirror the load list: x0 for un-indexed forms, x1 for pre-/post-
// indexed forms.
291 __ stlr(w18, MemOperand(x0));
292 __ stlr(x19, MemOperand(x0));
293 __ stlrb(w20, MemOperand(x0));
294 __ stlrb(x21, MemOperand(x0));
295 __ stlrh(w22, MemOperand(x0));
296 __ stlrh(x23, MemOperand(x0));
297 __ stlxp(w24, w25, w26, MemOperand(x0));
298 __ stlxp(x27, x28, x29, MemOperand(x0));
299 __ stlxr(w2, w3, MemOperand(x0));
300 __ stlxr(x4, x5, MemOperand(x0));
301 __ stlxrb(w6, w7, MemOperand(x0));
302 __ stlxrb(x8, x9, MemOperand(x0));
303 __ stlxrh(w10, w11, MemOperand(x0));
304 __ stlxrh(x12, x13, MemOperand(x0));
305 __ stnp(w14, w15, MemOperand(x0));
306 __ stnp(x16, x17, MemOperand(x0));
307 __ stp(w18, w19, MemOperand(x0));
308 __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
309 __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
310 __ stp(x20, x21, MemOperand(x0));
311 __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
312 __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
313 __ str(w22, MemOperand(x0));
314 __ str(w22, MemOperand(x1, 4, PostIndex));
315 __ str(w22, MemOperand(x1, 4, PreIndex));
316 __ str(x23, MemOperand(x0));
317 __ str(x23, MemOperand(x1, 8, PostIndex));
318 __ str(x23, MemOperand(x1, 8, PreIndex));
319 __ strb(w24, MemOperand(x0));
320 __ strb(w24, MemOperand(x1, 1, PostIndex));
321 __ strb(w24, MemOperand(x1, 1, PreIndex));
322 __ strb(x25, MemOperand(x0));
323 __ strb(x25, MemOperand(x1, 1, PostIndex));
324 __ strb(x25, MemOperand(x1, 1, PreIndex));
325 __ strh(w26, MemOperand(x0));
326 __ strh(w26, MemOperand(x1, 2, PostIndex));
327 __ strh(w26, MemOperand(x1, 2, PreIndex));
328 __ strh(x27, MemOperand(x0));
329 __ strh(x27, MemOperand(x1, 2, PostIndex));
330 __ strh(x27, MemOperand(x1, 2, PreIndex));
331 __ stur(w28, MemOperand(x0, 7));
332 __ stur(x29, MemOperand(x0, 15));
333 __ sturb(w2, MemOperand(x0, 1));
334 __ sturb(x3, MemOperand(x0, 1));
335 __ sturh(w4, MemOperand(x0, 3));
336 __ sturh(x5, MemOperand(x0, 3));
337 __ stxp(w6, w7, w8, MemOperand(x0));
338 __ stxp(x9, x10, x11, MemOperand(x0));
339 __ stxr(w12, w13, MemOperand(x0));
340 __ stxr(x14, x15, MemOperand(x0));
341 __ stxrb(w16, w17, MemOperand(x0));
342 __ stxrb(x18, x19, MemOperand(x0));
343 __ stxrh(w20, w21, MemOperand(x0));
344 __ stxrh(x22, x23, MemOperand(x0));
345 __ sub(w24, w25, w26);
346 __ sub(x27, x28, x29);
347 __ subs(w2, w3, w4);
348 __ subs(x5, x6, x7);
349 __ sxtb(w8, w9);
350 __ sxtb(x10, x11);
351 __ sxth(w12, w13);
352 __ sxth(x14, x15);
353 __ sxtw(w16, w17);
354 __ sxtw(x18, x19);
355 __ tst(w20, w21);
356 __ tst(x22, x23);
357 __ ubfiz(w24, w25, 10, 11);
358 __ ubfiz(x26, x27, 12, 13);
359 __ ubfm(w28, w29, 14, 15);
360 __ ubfm(x2, x3, 1, 2);
361 __ ubfx(w4, w5, 3, 4);
362 __ ubfx(x6, x7, 5, 6);
363 __ udiv(w8, w9, w10);
364 __ udiv(x11, x12, x13);
365 __ umulh(x22, x23, x24);
366 __ uxtb(w28, w29);
367 __ uxtb(x2, x3);
368 __ uxth(w4, w5);
369 __ uxth(x6, x7);
370 __ uxtw(w8, w9);
371 __ uxtw(x10, x11);
372
373 // Regression tests.
// Each pair pushes 16 bytes below sp and pops them again, so sp is back at
// its starting value after every pair. All of them involve xzr as a
// transfer register.
374 __ stp(x10, xzr, MemOperand(sp, -16, PreIndex));
375 __ ldp(x10, xzr, MemOperand(sp, 16, PostIndex));
376 __ str(xzr, MemOperand(sp, -16, PreIndex));
377 __ ldrsb(xzr, MemOperand(sp, 16, PostIndex));
378 __ str(xzr, MemOperand(sp, -16, PreIndex));
379 __ ldrsh(xzr, MemOperand(sp, 16, PostIndex));
380 __ str(xzr, MemOperand(sp, -16, PreIndex));
381 __ ldrsw(xzr, MemOperand(sp, 16, PostIndex));
382
383 // Branch tests.
384 {
385 Label end;
386 // Branch to the next instruction.
387 __ b(&end);
388 __ bind(&end);
389 }
390 {
391 Label loop, end;
// x3 - x3 is zero, so this sets the Z flag and the `ne` branch below falls
// through on the first pass.
392 __ subs(x3, x3, x3);
393 __ bind(&loop);
394 // Not-taken branch (the first time).
395 // Taken branch (the second time).
396 __ b(&end, ne);
// x3 is still zero here, so comparing it with 1 clears Z; the second pass
// through `loop` therefore takes the `ne` branch and leaves the loop.
397 __ cmp(x3, 1);
398 // Backwards branch.
399 __ b(&loop);
400 __ bind(&end);
401 }
402 }
403
404
// Emits the scalar floating-point A64 instruction sequence into `masm`.
//
// masm: assembler that receives the instructions. Each `__ insn(...)` line
//       below expands to `masm->insn(...)` through the `__` macro.
//
// The list is written in roughly alphabetical order of mnemonic and covers
// arithmetic, comparisons, conversions between FP and integer registers
// (with and without a fixed-point fraction-bits operand), rounding, and
// moves. None of the instructions in this function takes a memory operand.
//
// NOTE(review): a few conversions write x30 (fcvtas, fcvtnu), which is the
// link register; presumably the caller saves and restores it around this
// code - confirm. The emitted order is presumably matched against reference
// traces (see REF) - confirm before reordering or editing any instruction.
GenerateTestSequenceFP(MacroAssembler* masm)405 static void GenerateTestSequenceFP(MacroAssembler* masm) {
// The scope is sized with all bytes still free in the buffer, so it covers
// everything emitted by this function.
// NOTE(review): kMaximumSize presumably marks that size as an upper bound
// rather than an exact count - confirm in the ExactAssemblyScope declaration.
406 ExactAssemblyScope guard(masm,
407 masm->GetBuffer()->GetRemainingBytes(),
408 ExactAssemblyScope::kMaximumSize);
409
410 // Scalar floating point instructions.
411 __ fabd(d13, d2, d19);
412 __ fabd(s8, s10, s30);
413 __ fabs(d1, d1);
414 __ fabs(s25, s7);
415 __ facge(d1, d23, d16);
416 __ facge(s4, s17, s1);
417 __ facgt(d2, d21, d24);
418 __ facgt(s12, s26, s12);
419 __ fadd(d13, d11, d22);
420 __ fadd(s27, s19, s8);
421 __ fccmp(d6, d10, NoFlag, hs);
422 __ fccmp(s29, s20, NZVFlag, ne);
423 __ fccmpe(d10, d2, NZCFlag, al);
424 __ fccmpe(s3, s3, NZVFlag, pl);
// Compare instructions appear in both register-register form and
// compare-against-zero form (the 0.0 operand).
425 __ fcmeq(d19, d8, d10);
426 __ fcmeq(d0, d18, 0.0);
427 __ fcmeq(s1, s4, s30);
428 __ fcmeq(s22, s29, 0.0);
429 __ fcmge(d27, d18, d1);
430 __ fcmge(d31, d28, 0.0);
431 __ fcmge(s31, s19, s9);
432 __ fcmge(s1, s25, 0.0);
433 __ fcmgt(d18, d1, d15);
434 __ fcmgt(d3, d31, 0.0);
435 __ fcmgt(s11, s25, s2);
436 __ fcmgt(s17, s16, 0.0);
437 __ fcmle(d24, d17, 0.0);
438 __ fcmle(s11, s8, 0.0);
439 __ fcmlt(d5, d31, 0.0);
440 __ fcmlt(s18, s23, 0.0);
441 __ fcmp(d10, d24);
442 __ fcmp(d13, 0.0);
443 __ fcmp(s18, s6);
444 __ fcmp(s16, 0.0);
445 __ fcmpe(d9, d17);
446 __ fcmpe(d29, 0.0);
447 __ fcmpe(s16, s17);
448 __ fcmpe(s22, 0.0);
449 __ fcsel(d10, d14, d19, gt);
450 __ fcsel(s22, s18, s2, ge);
// Precision conversions between half (h), single (s) and double (d).
451 __ fcvt(d4, h24);
452 __ fcvt(d11, s2);
453 __ fcvt(h8, d9);
454 __ fcvt(h12, s1);
455 __ fcvt(s12, d31);
456 __ fcvt(s27, h25);
// FP-to-integer conversions: each mnemonic is emitted for FP-register
// destinations (d, s) and for general-purpose destinations (w, x) from both
// double and single sources.
457 __ fcvtas(d28, d16);
458 __ fcvtas(s3, s5);
459 __ fcvtas(w18, d31);
460 __ fcvtas(w29, s24);
461 __ fcvtas(x9, d1);
462 __ fcvtas(x30, s2);
463 __ fcvtau(d14, d0);
464 __ fcvtau(s31, s14);
465 __ fcvtau(w16, d2);
466 __ fcvtau(w18, s0);
467 __ fcvtau(x26, d7);
468 __ fcvtau(x25, s19);
469 __ fcvtms(d30, d25);
470 __ fcvtms(s12, s15);
471 __ fcvtms(w9, d7);
472 __ fcvtms(w19, s6);
473 __ fcvtms(x6, d6);
474 __ fcvtms(x22, s7);
475 __ fcvtmu(d27, d0);
476 __ fcvtmu(s8, s22);
477 __ fcvtmu(w29, d19);
478 __ fcvtmu(w26, s0);
479 __ fcvtmu(x13, d5);
480 __ fcvtmu(x5, s18);
481 __ fcvtns(d30, d15);
482 __ fcvtns(s10, s11);
483 __ fcvtns(w21, d15);
484 __ fcvtns(w18, s10);
485 __ fcvtns(x8, d17);
486 __ fcvtns(x17, s12);
487 __ fcvtnu(d0, d21);
488 __ fcvtnu(s6, s25);
489 __ fcvtnu(w29, d11);
490 __ fcvtnu(w25, s31);
491 __ fcvtnu(x30, d11);
492 __ fcvtnu(x27, s18);
493 __ fcvtps(d11, d22);
494 __ fcvtps(s29, s20);
495 __ fcvtps(w15, d25);
496 __ fcvtps(w16, s7);
497 __ fcvtps(x13, d20);
498 __ fcvtps(x3, s23);
499 __ fcvtpu(d24, d1);
500 __ fcvtpu(s14, s24);
501 __ fcvtpu(w26, d29);
502 __ fcvtpu(wzr, s26);
503 __ fcvtpu(x27, d6);
504 __ fcvtpu(x29, s14);
505 __ fcvtxn(s12, d12);
// fcvtzs/fcvtzu alternate between the plain form and the form with a
// trailing integer operand (the number of fraction bits).
506 __ fcvtzs(d15, d0);
507 __ fcvtzs(d13, d4, 42);
508 __ fcvtzs(s8, s11);
509 __ fcvtzs(s31, s6, 25);
510 __ fcvtzs(w6, d9);
511 __ fcvtzs(w25, d10, 20);
512 __ fcvtzs(w9, s1);
513 __ fcvtzs(w17, s29, 30);
514 __ fcvtzs(x19, d2);
515 __ fcvtzs(x22, d14, 1);
516 __ fcvtzs(x14, s20);
517 __ fcvtzs(x3, s30, 33);
518 __ fcvtzu(d28, d15);
519 __ fcvtzu(d0, d4, 3);
520 __ fcvtzu(s2, s5);
521 __ fcvtzu(s4, s0, 30);
522 __ fcvtzu(w11, d4);
523 __ fcvtzu(w7, d24, 32);
524 __ fcvtzu(w18, s24);
525 __ fcvtzu(w14, s27, 4);
526 __ fcvtzu(x22, d11);
527 __ fcvtzu(x8, d27, 52);
528 __ fcvtzu(x7, s20);
529 __ fcvtzu(x22, s7, 44);
530 __ fdiv(d6, d14, d15);
531 __ fdiv(s26, s5, s25);
532 __ fmadd(d18, d26, d12, d30);
533 __ fmadd(s13, s9, s28, s4);
534 __ fmax(d12, d5, d5);
535 __ fmax(s12, s28, s6);
536 __ fmaxnm(d28, d4, d2);
537 __ fmaxnm(s6, s10, s8);
538 __ fmin(d20, d20, d18);
539 __ fmin(s7, s13, s16);
540 __ fminnm(d19, d14, d30);
541 __ fminnm(s0, s1, s1);
// fmov is emitted in FP-to-FP, GPR-to-FP, immediate, and FP-to-GPR forms.
542 __ fmov(d13, d6);
543 __ fmov(d2, x17);
544 __ fmov(d8, -2.5000);
545 __ fmov(s5, s3);
546 __ fmov(s25, w20);
547 __ fmov(s21, 2.8750f);
548 __ fmov(w18, s24);
549 __ fmov(x18, d2);
550 __ fmsub(d20, d30, d3, d19);
551 __ fmsub(s5, s19, s4, s12);
552 __ fmul(d30, d27, d23);
553 __ fmul(s25, s17, s15);
554 __ fmulx(d4, d17, d1);
555 __ fmulx(s14, s25, s4);
556 __ fneg(d15, d0);
557 __ fneg(s14, s15);
558 __ fnmadd(d0, d16, d22, d31);
559 __ fnmadd(s0, s18, s26, s18);
560 __ fnmsub(d19, d12, d15, d21);
561 __ fnmsub(s29, s0, s11, s26);
562 __ fnmul(d31, d19, d1);
563 __ fnmul(s18, s3, s17);
564 __ frecpe(d7, d21);
565 __ frecpe(s29, s17);
566 __ frecps(d11, d26, d17);
567 __ frecps(s18, s27, s1);
568 __ frecpx(d15, d18);
569 __ frecpx(s5, s10);
570 __ frinta(d16, d30);
571 __ frinta(s1, s22);
572 __ frinti(d19, d29);
573 __ frinti(s14, s21);
574 __ frintm(d20, d30);
575 __ frintm(s1, s16);
576 __ frintn(d30, d1);
577 __ frintn(s24, s10);
578 __ frintp(d4, d20);
579 __ frintp(s13, s3);
580 __ frintx(d13, d20);
581 __ frintx(s17, s7);
582 __ frintz(d0, d8);
583 __ frintz(s15, s29);
584 __ frsqrte(d21, d10);
585 __ frsqrte(s17, s25);
586 __ frsqrts(d4, d29, d17);
587 __ frsqrts(s14, s3, s24);
588 __ fsqrt(d14, d17);
589 __ fsqrt(s4, s14);
590 __ fsub(d13, d19, d7);
591 __ fsub(s3, s21, s27);
// Integer-to-FP conversions, again alternating between the plain form and
// the form with a fraction-bits operand, from FP, w and x sources.
592 __ scvtf(d31, d16);
593 __ scvtf(d26, d31, 24);
594 __ scvtf(d6, w16);
595 __ scvtf(d5, w20, 6);
596 __ scvtf(d16, x8);
597 __ scvtf(d15, x8, 10);
598 __ scvtf(s7, s4);
599 __ scvtf(s8, s15, 14);
600 __ scvtf(s29, w10);
601 __ scvtf(s15, w21, 11);
602 __ scvtf(s27, x26);
603 __ scvtf(s26, x12, 38);
604 __ ucvtf(d0, d9);
605 __ ucvtf(d5, d22, 47);
606 __ ucvtf(d30, w27);
607 __ ucvtf(d3, w19, 1);
608 __ ucvtf(d28, x21);
609 __ ucvtf(d27, x30, 35);
610 __ ucvtf(s11, s5);
611 __ ucvtf(s0, s23, 14);
612 __ ucvtf(s20, w19);
613 __ ucvtf(s21, w22, 18);
614 __ ucvtf(s6, x13);
615 __ ucvtf(s7, x2, 21);
616 }
617
618
GenerateTestSequenceNEON(MacroAssembler* masm)619 static void GenerateTestSequenceNEON(MacroAssembler* masm) {
620 ExactAssemblyScope guard(masm,
621 masm->GetBuffer()->GetRemainingBytes(),
622 ExactAssemblyScope::kMaximumSize);
623
624 // NEON integer instructions.
625 __ abs(d19, d0);
626 __ abs(v16.V16B(), v11.V16B());
627 __ abs(v0.V2D(), v31.V2D());
628 __ abs(v27.V2S(), v25.V2S());
629 __ abs(v21.V4H(), v27.V4H());
630 __ abs(v16.V4S(), v1.V4S());
631 __ abs(v31.V8B(), v5.V8B());
632 __ abs(v29.V8H(), v13.V8H());
633 __ add(d10, d5, d17);
634 __ add(v31.V16B(), v15.V16B(), v23.V16B());
635 __ add(v10.V2D(), v31.V2D(), v14.V2D());
636 __ add(v15.V2S(), v14.V2S(), v19.V2S());
637 __ add(v27.V4H(), v23.V4H(), v17.V4H());
638 __ add(v25.V4S(), v28.V4S(), v29.V4S());
639 __ add(v13.V8B(), v7.V8B(), v18.V8B());
640 __ add(v4.V8H(), v2.V8H(), v1.V8H());
641 __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
642 __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
643 __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
644 __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
645 __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
646 __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
647 __ addp(d14, v19.V2D());
648 __ addp(v3.V16B(), v8.V16B(), v28.V16B());
649 __ addp(v8.V2D(), v5.V2D(), v17.V2D());
650 __ addp(v22.V2S(), v30.V2S(), v26.V2S());
651 __ addp(v29.V4H(), v24.V4H(), v14.V4H());
652 __ addp(v30.V4S(), v26.V4S(), v24.V4S());
653 __ addp(v12.V8B(), v26.V8B(), v7.V8B());
654 __ addp(v17.V8H(), v8.V8H(), v12.V8H());
655 __ addv(b27, v23.V16B());
656 __ addv(b12, v20.V8B());
657 __ addv(h27, v30.V4H());
658 __ addv(h19, v14.V8H());
659 __ addv(s14, v27.V4S());
660 __ and_(v10.V16B(), v8.V16B(), v27.V16B());
661 __ and_(v5.V8B(), v1.V8B(), v16.V8B());
662 __ bic(v26.V16B(), v3.V16B(), v24.V16B());
663 __ bic(v7.V2S(), 0xe4, 16);
664 __ bic(v28.V4H(), 0x23, 8);
665 __ bic(v29.V4S(), 0xac);
666 __ bic(v12.V8B(), v31.V8B(), v21.V8B());
667 __ bic(v18.V8H(), 0x98);
668 __ bif(v12.V16B(), v26.V16B(), v8.V16B());
669 __ bif(v2.V8B(), v23.V8B(), v27.V8B());
670 __ bit(v8.V16B(), v3.V16B(), v13.V16B());
671 __ bit(v5.V8B(), v5.V8B(), v23.V8B());
672 __ bsl(v9.V16B(), v31.V16B(), v23.V16B());
673 __ bsl(v14.V8B(), v7.V8B(), v3.V8B());
674 __ cls(v29.V16B(), v5.V16B());
675 __ cls(v21.V2S(), v0.V2S());
676 __ cls(v1.V4H(), v12.V4H());
677 __ cls(v27.V4S(), v10.V4S());
678 __ cls(v19.V8B(), v4.V8B());
679 __ cls(v15.V8H(), v14.V8H());
680 __ clz(v1.V16B(), v4.V16B());
681 __ clz(v27.V2S(), v17.V2S());
682 __ clz(v9.V4H(), v9.V4H());
683 __ clz(v31.V4S(), v15.V4S());
684 __ clz(v14.V8B(), v19.V8B());
685 __ clz(v6.V8H(), v11.V8H());
686 __ cmeq(d18, d5, d29);
687 __ cmeq(d14, d31, 0);
688 __ cmeq(v19.V16B(), v3.V16B(), v22.V16B());
689 __ cmeq(v15.V16B(), v9.V16B(), 0);
690 __ cmeq(v12.V2D(), v16.V2D(), v10.V2D());
691 __ cmeq(v8.V2D(), v22.V2D(), 0);
692 __ cmeq(v2.V2S(), v3.V2S(), v9.V2S());
693 __ cmeq(v16.V2S(), v25.V2S(), 0);
694 __ cmeq(v6.V4H(), v23.V4H(), v20.V4H());
695 __ cmeq(v16.V4H(), v13.V4H(), 0);
696 __ cmeq(v21.V4S(), v17.V4S(), v2.V4S());
697 __ cmeq(v6.V4S(), v25.V4S(), 0);
698 __ cmeq(v16.V8B(), v13.V8B(), v2.V8B());
699 __ cmeq(v21.V8B(), v16.V8B(), 0);
700 __ cmeq(v20.V8H(), v7.V8H(), v25.V8H());
701 __ cmeq(v26.V8H(), v8.V8H(), 0);
702 __ cmge(d16, d13, d31);
703 __ cmge(d25, d24, 0);
704 __ cmge(v17.V16B(), v19.V16B(), v17.V16B());
705 __ cmge(v22.V16B(), v30.V16B(), 0);
706 __ cmge(v28.V2D(), v20.V2D(), v26.V2D());
707 __ cmge(v6.V2D(), v23.V2D(), 0);
708 __ cmge(v25.V2S(), v22.V2S(), v3.V2S());
709 __ cmge(v21.V2S(), v11.V2S(), 0);
710 __ cmge(v16.V4H(), v3.V4H(), v12.V4H());
711 __ cmge(v23.V4H(), v9.V4H(), 0);
712 __ cmge(v7.V4S(), v2.V4S(), v11.V4S());
713 __ cmge(v0.V4S(), v22.V4S(), 0);
714 __ cmge(v10.V8B(), v30.V8B(), v9.V8B());
715 __ cmge(v21.V8B(), v8.V8B(), 0);
716 __ cmge(v2.V8H(), v7.V8H(), v26.V8H());
717 __ cmge(v19.V8H(), v10.V8H(), 0);
718 __ cmgt(d6, d13, d1);
719 __ cmgt(d30, d24, 0);
720 __ cmgt(v20.V16B(), v25.V16B(), v27.V16B());
721 __ cmgt(v0.V16B(), v25.V16B(), 0);
722 __ cmgt(v22.V2D(), v25.V2D(), v1.V2D());
723 __ cmgt(v16.V2D(), v16.V2D(), 0);
724 __ cmgt(v5.V2S(), v9.V2S(), v15.V2S());
725 __ cmgt(v12.V2S(), v18.V2S(), 0);
726 __ cmgt(v28.V4H(), v18.V4H(), v11.V4H());
727 __ cmgt(v22.V4H(), v3.V4H(), 0);
728 __ cmgt(v5.V4S(), v11.V4S(), v27.V4S());
729 __ cmgt(v13.V4S(), v20.V4S(), 0);
730 __ cmgt(v27.V8B(), v31.V8B(), v7.V8B());
731 __ cmgt(v5.V8B(), v0.V8B(), 0);
732 __ cmgt(v22.V8H(), v28.V8H(), v13.V8H());
733 __ cmgt(v6.V8H(), v2.V8H(), 0);
734 __ cmhi(d21, d8, d22);
735 __ cmhi(v18.V16B(), v19.V16B(), v19.V16B());
736 __ cmhi(v7.V2D(), v0.V2D(), v21.V2D());
737 __ cmhi(v15.V2S(), v19.V2S(), v0.V2S());
738 __ cmhi(v31.V4H(), v7.V4H(), v12.V4H());
739 __ cmhi(v9.V4S(), v16.V4S(), v22.V4S());
740 __ cmhi(v7.V8B(), v24.V8B(), v28.V8B());
741 __ cmhi(v11.V8H(), v10.V8H(), v25.V8H());
742 __ cmhs(d1, d12, d17);
743 __ cmhs(v21.V16B(), v25.V16B(), v30.V16B());
744 __ cmhs(v8.V2D(), v2.V2D(), v26.V2D());
745 __ cmhs(v1.V2S(), v22.V2S(), v29.V2S());
746 __ cmhs(v26.V4H(), v30.V4H(), v30.V4H());
747 __ cmhs(v19.V4S(), v20.V4S(), v16.V4S());
748 __ cmhs(v1.V8B(), v3.V8B(), v26.V8B());
749 __ cmhs(v20.V8H(), v28.V8H(), v8.V8H());
750 __ cmle(d30, d24, 0);
751 __ cmle(v0.V16B(), v3.V16B(), 0);
752 __ cmle(v2.V2D(), v30.V2D(), 0);
753 __ cmle(v7.V2S(), v10.V2S(), 0);
754 __ cmle(v9.V4H(), v31.V4H(), 0);
755 __ cmle(v9.V4S(), v18.V4S(), 0);
756 __ cmle(v21.V8B(), v31.V8B(), 0);
757 __ cmle(v29.V8H(), v21.V8H(), 0);
758 __ cmlt(d25, d23, 0);
759 __ cmlt(v7.V16B(), v21.V16B(), 0);
760 __ cmlt(v7.V2D(), v30.V2D(), 0);
761 __ cmlt(v25.V2S(), v28.V2S(), 0);
762 __ cmlt(v0.V4H(), v11.V4H(), 0);
763 __ cmlt(v24.V4S(), v5.V4S(), 0);
764 __ cmlt(v26.V8B(), v11.V8B(), 0);
765 __ cmlt(v1.V8H(), v21.V8H(), 0);
766 __ cmtst(d28, d23, d30);
767 __ cmtst(v26.V16B(), v6.V16B(), v31.V16B());
768 __ cmtst(v1.V2D(), v21.V2D(), v4.V2D());
769 __ cmtst(v27.V2S(), v26.V2S(), v20.V2S());
770 __ cmtst(v26.V4H(), v0.V4H(), v18.V4H());
771 __ cmtst(v25.V4S(), v16.V4S(), v4.V4S());
772 __ cmtst(v11.V8B(), v10.V8B(), v9.V8B());
773 __ cmtst(v0.V8H(), v2.V8H(), v1.V8H());
774 __ cnt(v25.V16B(), v15.V16B());
775 __ cnt(v28.V8B(), v6.V8B());
776 __ dup(v6.V16B(), v7.B(), 7);
777 __ dup(v9.V16B(), w20);
778 __ dup(v12.V2D(), v13.D(), 1);
779 __ dup(v9.V2D(), xzr);
780 __ dup(v4.V2S(), v26.S(), 2);
781 __ dup(v3.V2S(), w12);
782 __ dup(v22.V4H(), v5.H(), 7);
783 __ dup(v16.V4H(), w25);
784 __ dup(v20.V4S(), v10.S(), 2);
785 __ dup(v10.V4S(), w7);
786 __ dup(v30.V8B(), v30.B(), 2);
787 __ dup(v31.V8B(), w15);
788 __ dup(v28.V8H(), v17.H(), 4);
789 __ dup(v2.V8H(), w3);
790 __ eor(v29.V16B(), v25.V16B(), v3.V16B());
791 __ eor(v3.V8B(), v16.V8B(), v28.V8B());
792 __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1);
793 __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1);
794 __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
795 __ ld1(v23.V16B(),
796 v24.V16B(),
797 v25.V16B(),
798 v26.V16B(),
799 MemOperand(x1, x2, PostIndex));
800 __ ld1(v5.V16B(),
801 v6.V16B(),
802 v7.V16B(),
803 v8.V16B(),
804 MemOperand(x1, 64, PostIndex));
805 __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0));
806 __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex));
807 __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex));
808 __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0));
809 __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex));
810 __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex));
811 __ ld1(v29.V16B(), MemOperand(x0));
812 __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex));
813 __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex));
814 __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0));
815 __ ld1(v17.V1D(),
816 v18.V1D(),
817 v19.V1D(),
818 v20.V1D(),
819 MemOperand(x1, x2, PostIndex));
820 __ ld1(v28.V1D(),
821 v29.V1D(),
822 v30.V1D(),
823 v31.V1D(),
824 MemOperand(x1, 32, PostIndex));
825 __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0));
826 __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex));
827 __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex));
828 __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0));
829 __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex));
830 __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex));
831 __ ld1(v28.V1D(), MemOperand(x0));
832 __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex));
833 __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex));
834 __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0));
835 __ ld1(v8.V2D(),
836 v9.V2D(),
837 v10.V2D(),
838 v11.V2D(),
839 MemOperand(x1, x2, PostIndex));
840 __ ld1(v14.V2D(),
841 v15.V2D(),
842 v16.V2D(),
843 v17.V2D(),
844 MemOperand(x1, 64, PostIndex));
845 __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0));
846 __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
847 __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex));
848 __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0));
849 __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
850 __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex));
851 __ ld1(v5.V2D(), MemOperand(x0));
852 __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex));
853 __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex));
854 __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0));
855 __ ld1(v24.V2S(),
856 v25.V2S(),
857 v26.V2S(),
858 v27.V2S(),
859 MemOperand(x1, x2, PostIndex));
860 __ ld1(v27.V2S(),
861 v28.V2S(),
862 v29.V2S(),
863 v30.V2S(),
864 MemOperand(x1, 32, PostIndex));
865 __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0));
866 __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex));
867 __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex));
868 __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0));
869 __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex));
870 __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex));
871 __ ld1(v26.V2S(), MemOperand(x0));
872 __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex));
873 __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex));
874 __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
875 __ ld1(v24.V4H(),
876 v25.V4H(),
877 v26.V4H(),
878 v27.V4H(),
879 MemOperand(x1, x2, PostIndex));
880 __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
881 __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0));
882 __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex));
883 __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex));
884 __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0));
885 __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex));
886 __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex));
887 __ ld1(v26.V4H(), MemOperand(x0));
888 __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex));
889 __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex));
890 __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0));
891 __ ld1(v28.V4S(),
892 v29.V4S(),
893 v30.V4S(),
894 v31.V4S(),
895 MemOperand(x1, x2, PostIndex));
896 __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex));
897 __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
898 __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex));
899 __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex));
900 __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0));
901 __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex));
902 __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex));
903 __ ld1(v15.V4S(), MemOperand(x0));
904 __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex));
905 __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex));
906 __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0));
907 __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex));
908 __ ld1(v9.V8B(),
909 v10.V8B(),
910 v11.V8B(),
911 v12.V8B(),
912 MemOperand(x1, 32, PostIndex));
913 __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0));
914 __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex));
915 __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
916 __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0));
917 __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
918 __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex));
919 __ ld1(v31.V8B(), MemOperand(x0));
920 __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex));
921 __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex));
922 __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
923 __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
924 __ ld1(v10.V8H(),
925 v11.V8H(),
926 v12.V8H(),
927 v13.V8H(),
928 MemOperand(x1, 64, PostIndex));
929 __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
930 __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
931 __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex));
932 __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0));
933 __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex));
934 __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
935 __ ld1(v9.V8H(), MemOperand(x0));
936 __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex));
937 __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex));
938 __ ld1(v19.B(), 1, MemOperand(x0));
939 __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex));
940 __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex));
941 __ ld1(v10.D(), 1, MemOperand(x0));
942 __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex));
943 __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex));
944 __ ld1(v19.H(), 5, MemOperand(x0));
945 __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex));
946 __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex));
947 __ ld1(v21.S(), 2, MemOperand(x0));
948 __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex));
949 __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex));
950 __ ld1r(v2.V16B(), MemOperand(x0));
951 __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex));
952 __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex));
953 __ ld1r(v25.V1D(), MemOperand(x0));
954 __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex));
955 __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex));
956 __ ld1r(v19.V2D(), MemOperand(x0));
957 __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex));
958 __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex));
959 __ ld1r(v24.V2S(), MemOperand(x0));
960 __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex));
961 __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex));
962 __ ld1r(v19.V4H(), MemOperand(x0));
963 __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex));
964 __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex));
965 __ ld1r(v15.V4S(), MemOperand(x0));
966 __ ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex));
967 __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex));
968 __ ld1r(v26.V8B(), MemOperand(x0));
969 __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex));
970 __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex));
971 __ ld1r(v13.V8H(), MemOperand(x0));
972 __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex));
973 __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex));
974 __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0));
975 __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
976 __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex));
977 __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0));
978 __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex));
979 __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex));
980 __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0));
981 __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex));
982 __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex));
983 __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0));
984 __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex));
985 __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex));
986 __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0));
987 __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex));
988 __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex));
989 __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0));
990 __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex));
991 __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex));
992 __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0));
993 __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
994 __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex));
995 __ ld2(v5.B(), v6.B(), 12, MemOperand(x0));
996 __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex));
997 __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex));
998 __ ld2(v11.D(), v12.D(), 1, MemOperand(x0));
999 __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex));
1000 __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex));
1001 __ ld2(v18.H(), v19.H(), 7, MemOperand(x0));
1002 __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex));
1003 __ ld2(v30.H(), v31.H(), 2, MemOperand(x1, 4, PostIndex));
1004 __ ld2(v29.S(), v30.S(), 3, MemOperand(x0));
1005 __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex));
1006 __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex));
1007 __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0));
1008 __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
1009 __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex));
1010 __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0));
1011 __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex));
1012 __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex));
1013 __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0));
1014 __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex));
1015 __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex));
1016 __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0));
1017 __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex));
1018 __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex));
1019 __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0));
1020 __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
1021 __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex));
1022 __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0));
1023 __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex));
1024 __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex));
1025 __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0));
1026 __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex));
1027 __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex));
1028 __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0));
1029 __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex));
1030 __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex));
1031 __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0));
1032 __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex));
1033 __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex));
1034 __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0));
1035 __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex));
1036 __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex));
1037 __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x0));
1038 __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex));
1039 __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex));
1040 __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0));
1041 __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex));
1042 __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex));
1043 __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
1044 __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex));
1045 __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex));
1046 __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0));
1047 __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1048 __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
1049 __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0));
1050 __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex));
1051 __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex));
1052 __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0));
1053 __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex));
1054 __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex));
1055 __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0));
1056 __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex));
1057 __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex));
1058 __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0));
1059 __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex));
1060 __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex));
1061 __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0));
1062 __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex));
1063 __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex));
1064 __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0));
1065 __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex));
1066 __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex));
1067 __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0));
1068 __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex));
1069 __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex));
1070 __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0));
1071 __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
1072 __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex));
1073 __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0));
1074 __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex));
1075 __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex));
1076 __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0));
1077 __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex));
1078 __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex));
1079 __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0));
1080 __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex));
1081 __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex));
1082 __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0));
1083 __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
1084 __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex));
1085 __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
1086 __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex));
1087 __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex));
1088 __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0));
1089 __ ld4(v2.V16B(),
1090 v3.V16B(),
1091 v4.V16B(),
1092 v5.V16B(),
1093 MemOperand(x1, x2, PostIndex));
1094 __ ld4(v5.V16B(),
1095 v6.V16B(),
1096 v7.V16B(),
1097 v8.V16B(),
1098 MemOperand(x1, 64, PostIndex));
1099 __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0));
1100 __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1101 __ ld4(v29.V2D(),
1102 v30.V2D(),
1103 v31.V2D(),
1104 v0.V2D(),
1105 MemOperand(x1, 64, PostIndex));
1106 __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0));
1107 __ ld4(v24.V2S(),
1108 v25.V2S(),
1109 v26.V2S(),
1110 v27.V2S(),
1111 MemOperand(x1, x2, PostIndex));
1112 __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), MemOperand(x1, 32, PostIndex));
1113 __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
1114 __ ld4(v23.V4H(),
1115 v24.V4H(),
1116 v25.V4H(),
1117 v26.V4H(),
1118 MemOperand(x1, x2, PostIndex));
1119 __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex));
1120 __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0));
1121 __ ld4(v28.V4S(),
1122 v29.V4S(),
1123 v30.V4S(),
1124 v31.V4S(),
1125 MemOperand(x1, x2, PostIndex));
1126 __ ld4(v29.V4S(),
1127 v30.V4S(),
1128 v31.V4S(),
1129 v0.V4S(),
1130 MemOperand(x1, 64, PostIndex));
1131 __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0));
1132 __ ld4(v27.V8B(),
1133 v28.V8B(),
1134 v29.V8B(),
1135 v30.V8B(),
1136 MemOperand(x1, x2, PostIndex));
1137 __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex));
1138 __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
1139 __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
1140 __ ld4(v20.V8H(),
1141 v21.V8H(),
1142 v22.V8H(),
1143 v23.V8H(),
1144 MemOperand(x1, 64, PostIndex));
1145 __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0));
1146 __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex));
1147 __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex));
1148 __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0));
1149 __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1150 __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex));
1151 __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0));
1152 __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex));
1153 __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex));
1154 __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0));
1155 __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex));
1156 __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex));
1157 __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0));
1158 __ ld4r(v13.V16B(),
1159 v14.V16B(),
1160 v15.V16B(),
1161 v16.V16B(),
1162 MemOperand(x1, x2, PostIndex));
1163 __ ld4r(v9.V16B(),
1164 v10.V16B(),
1165 v11.V16B(),
1166 v12.V16B(),
1167 MemOperand(x1, 4, PostIndex));
1168 __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0));
1169 __ ld4r(v4.V1D(),
1170 v5.V1D(),
1171 v6.V1D(),
1172 v7.V1D(),
1173 MemOperand(x1, x2, PostIndex));
1174 __ ld4r(v26.V1D(),
1175 v27.V1D(),
1176 v28.V1D(),
1177 v29.V1D(),
1178 MemOperand(x1, 32, PostIndex));
1179 __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0));
1180 __ ld4r(v28.V2D(),
1181 v29.V2D(),
1182 v30.V2D(),
1183 v31.V2D(),
1184 MemOperand(x1, x2, PostIndex));
1185 __ ld4r(v15.V2D(),
1186 v16.V2D(),
1187 v17.V2D(),
1188 v18.V2D(),
1189 MemOperand(x1, 32, PostIndex));
1190 __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0));
1191 __ ld4r(v28.V2S(),
1192 v29.V2S(),
1193 v30.V2S(),
1194 v31.V2S(),
1195 MemOperand(x1, x2, PostIndex));
1196 __ ld4r(v11.V2S(),
1197 v12.V2S(),
1198 v13.V2S(),
1199 v14.V2S(),
1200 MemOperand(x1, 16, PostIndex));
1201 __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0));
1202 __ ld4r(v22.V4H(),
1203 v23.V4H(),
1204 v24.V4H(),
1205 v25.V4H(),
1206 MemOperand(x1, x2, PostIndex));
1207 __ ld4r(v20.V4H(),
1208 v21.V4H(),
1209 v22.V4H(),
1210 v23.V4H(),
1211 MemOperand(x1, 8, PostIndex));
1212 __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0));
1213 __ ld4r(v25.V4S(),
1214 v26.V4S(),
1215 v27.V4S(),
1216 v28.V4S(),
1217 MemOperand(x1, x2, PostIndex));
1218 __ ld4r(v23.V4S(),
1219 v24.V4S(),
1220 v25.V4S(),
1221 v26.V4S(),
1222 MemOperand(x1, 16, PostIndex));
1223 __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0));
1224 __ ld4r(v27.V8B(),
1225 v28.V8B(),
1226 v29.V8B(),
1227 v30.V8B(),
1228 MemOperand(x1, x2, PostIndex));
1229 __ ld4r(v29.V8B(),
1230 v30.V8B(),
1231 v31.V8B(),
1232 v0.V8B(),
1233 MemOperand(x1, 4, PostIndex));
1234 __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0));
1235 __ ld4r(v25.V8H(),
1236 v26.V8H(),
1237 v27.V8H(),
1238 v28.V8H(),
1239 MemOperand(x1, x2, PostIndex));
1240 __ ld4r(v22.V8H(),
1241 v23.V8H(),
1242 v24.V8H(),
1243 v25.V8H(),
1244 MemOperand(x1, 8, PostIndex));
1245 __ mla(v29.V16B(), v7.V16B(), v26.V16B());
1246 __ mla(v6.V2S(), v4.V2S(), v14.V2S());
1247 __ mla(v9.V2S(), v11.V2S(), v0.S(), 2);
1248 __ mla(v5.V4H(), v17.V4H(), v25.V4H());
1249 __ mla(v24.V4H(), v7.V4H(), v11.H(), 3);
1250 __ mla(v12.V4S(), v3.V4S(), v4.V4S());
1251 __ mla(v10.V4S(), v7.V4S(), v7.S(), 3);
1252 __ mla(v3.V8B(), v16.V8B(), v9.V8B());
1253 __ mla(v19.V8H(), v22.V8H(), v18.V8H());
1254 __ mla(v6.V8H(), v2.V8H(), v0.H(), 0);
1255 __ mls(v23.V16B(), v10.V16B(), v11.V16B());
1256 __ mls(v14.V2S(), v31.V2S(), v22.V2S());
1257 __ mls(v28.V2S(), v13.V2S(), v1.S(), 3);
1258 __ mls(v2.V4H(), v19.V4H(), v13.V4H());
1259 __ mls(v18.V4H(), v15.V4H(), v12.H(), 6);
1260 __ mls(v6.V4S(), v11.V4S(), v16.V4S());
1261 __ mls(v23.V4S(), v16.V4S(), v10.S(), 2);
1262 __ mls(v26.V8B(), v13.V8B(), v23.V8B());
1263 __ mls(v10.V8H(), v10.V8H(), v12.V8H());
1264 __ mls(v14.V8H(), v0.V8H(), v14.H(), 7);
1265 __ mov(b22, v1.B(), 3);
1266 __ mov(d7, v13.D(), 1);
1267 __ mov(h26, v21.H(), 2);
1268 __ mov(s26, v19.S(), 0);
1269 __ mov(v26.V16B(), v11.V16B());
1270 __ mov(v20.V8B(), v0.V8B());
1271 __ mov(v19.B(), 13, v6.B(), 4);
1272 __ mov(v4.B(), 13, w19);
1273 __ mov(v11.D(), 1, v8.D(), 0);
1274 __ mov(v3.D(), 0, x30);
1275 __ mov(v29.H(), 4, v11.H(), 7);
1276 __ mov(v2.H(), 6, w6);
1277 __ mov(v22.S(), 0, v5.S(), 2);
1278 __ mov(v24.S(), 3, w8);
1279 __ mov(w18, v1.S(), 3);
1280 __ mov(x28, v21.D(), 0);
1281 __ movi(d24, 0xffff0000ffffff);
1282 __ movi(v29.V16B(), 0x80);
1283 __ movi(v12.V2D(), 0xffff00ff00ffff00);
1284 __ movi(v12.V2S(), 0xec, LSL, 24);
1285 __ movi(v10.V2S(), 0x4c, MSL, 16);
1286 __ movi(v26.V4H(), 0xc0, LSL);
1287 __ movi(v24.V4S(), 0x98, LSL, 16);
1288 __ movi(v1.V4S(), 0xde, MSL, 16);
1289 __ movi(v21.V8B(), 0x4d);
1290 __ movi(v29.V8H(), 0x69, LSL);
1291 __ mul(v1.V16B(), v15.V16B(), v17.V16B());
1292 __ mul(v21.V2S(), v19.V2S(), v29.V2S());
1293 __ mul(v19.V2S(), v5.V2S(), v3.S(), 0);
1294 __ mul(v29.V4H(), v11.V4H(), v2.V4H());
1295 __ mul(v2.V4H(), v7.V4H(), v0.H(), 0);
1296 __ mul(v25.V4S(), v26.V4S(), v16.V4S());
1297 __ mul(v26.V4S(), v6.V4S(), v15.S(), 2);
1298 __ mul(v11.V8B(), v15.V8B(), v31.V8B());
1299 __ mul(v20.V8H(), v31.V8H(), v15.V8H());
1300 __ mul(v29.V8H(), v5.V8H(), v9.H(), 4);
1301 __ mvn(v13.V16B(), v21.V16B());
1302 __ mvn(v28.V8B(), v19.V8B());
1303 __ mvni(v25.V2S(), 0xb8, LSL, 8);
1304 __ mvni(v17.V2S(), 0x6c, MSL, 16);
1305 __ mvni(v29.V4H(), 0x48, LSL);
1306 __ mvni(v20.V4S(), 0x7a, LSL, 16);
1307 __ mvni(v0.V4S(), 0x1e, MSL, 8);
1308 __ mvni(v31.V8H(), 0x3e, LSL);
1309 __ neg(d25, d11);
1310 __ neg(v4.V16B(), v9.V16B());
1311 __ neg(v11.V2D(), v25.V2D());
1312 __ neg(v7.V2S(), v18.V2S());
1313 __ neg(v7.V4H(), v15.V4H());
1314 __ neg(v17.V4S(), v18.V4S());
1315 __ neg(v20.V8B(), v17.V8B());
1316 __ neg(v0.V8H(), v11.V8H());
1317 __ orn(v13.V16B(), v11.V16B(), v31.V16B());
1318 __ orn(v22.V8B(), v16.V8B(), v22.V8B());
1319 __ orr(v17.V16B(), v17.V16B(), v23.V16B());
1320 __ orr(v8.V2S(), 0xe3);
1321 __ orr(v11.V4H(), 0x97, 8);
1322 __ orr(v7.V4S(), 0xab);
1323 __ orr(v8.V8B(), v4.V8B(), v3.V8B());
1324 __ orr(v31.V8H(), 0xb0, 8);
1325 __ pmul(v11.V16B(), v18.V16B(), v23.V16B());
1326 __ pmul(v8.V8B(), v24.V8B(), v5.V8B());
1327 __ pmull(v24.V8H(), v18.V8B(), v22.V8B());
1328 __ pmull2(v13.V8H(), v3.V16B(), v21.V16B());
1329 __ raddhn(v22.V2S(), v10.V2D(), v21.V2D());
1330 __ raddhn(v5.V4H(), v13.V4S(), v13.V4S());
1331 __ raddhn(v10.V8B(), v17.V8H(), v26.V8H());
1332 __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H());
1333 __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D());
1334 __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S());
1335 __ rbit(v22.V16B(), v15.V16B());
1336 __ rbit(v30.V8B(), v3.V8B());
1337 __ rev16(v31.V16B(), v27.V16B());
1338 __ rev16(v12.V8B(), v26.V8B());
1339 __ rev32(v5.V16B(), v4.V16B());
1340 __ rev32(v16.V4H(), v26.V4H());
1341 __ rev32(v20.V8B(), v3.V8B());
1342 __ rev32(v20.V8H(), v28.V8H());
1343 __ rev64(v9.V16B(), v19.V16B());
1344 __ rev64(v5.V2S(), v16.V2S());
1345 __ rev64(v7.V4H(), v31.V4H());
1346 __ rev64(v15.V4S(), v26.V4S());
1347 __ rev64(v25.V8B(), v9.V8B());
1348 __ rev64(v11.V8H(), v5.V8H());
1349 __ rshrn(v18.V2S(), v13.V2D(), 1);
1350 __ rshrn(v25.V4H(), v30.V4S(), 2);
1351 __ rshrn(v13.V8B(), v9.V8H(), 8);
1352 __ rshrn2(v3.V16B(), v6.V8H(), 8);
1353 __ rshrn2(v0.V4S(), v29.V2D(), 25);
1354 __ rshrn2(v27.V8H(), v26.V4S(), 15);
1355 __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D());
1356 __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S());
1357 __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H());
1358 __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H());
1359 __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D());
1360 __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S());
1361 __ saba(v28.V16B(), v9.V16B(), v25.V16B());
1362 __ saba(v9.V2S(), v28.V2S(), v20.V2S());
1363 __ saba(v17.V4H(), v22.V4H(), v22.V4H());
1364 __ saba(v29.V4S(), v5.V4S(), v27.V4S());
1365 __ saba(v20.V8B(), v21.V8B(), v18.V8B());
1366 __ saba(v27.V8H(), v17.V8H(), v30.V8H());
1367 __ sabal(v20.V2D(), v13.V2S(), v7.V2S());
1368 __ sabal(v4.V4S(), v12.V4H(), v4.V4H());
1369 __ sabal(v23.V8H(), v24.V8B(), v20.V8B());
1370 __ sabal2(v26.V2D(), v21.V4S(), v18.V4S());
1371 __ sabal2(v27.V4S(), v28.V8H(), v8.V8H());
1372 __ sabal2(v12.V8H(), v16.V16B(), v21.V16B());
1373 __ sabd(v0.V16B(), v15.V16B(), v13.V16B());
1374 __ sabd(v15.V2S(), v7.V2S(), v30.V2S());
1375 __ sabd(v17.V4H(), v17.V4H(), v12.V4H());
1376 __ sabd(v7.V4S(), v4.V4S(), v22.V4S());
1377 __ sabd(v23.V8B(), v3.V8B(), v26.V8B());
1378 __ sabd(v20.V8H(), v28.V8H(), v5.V8H());
1379 __ sabdl(v27.V2D(), v22.V2S(), v20.V2S());
1380 __ sabdl(v31.V4S(), v20.V4H(), v23.V4H());
1381 __ sabdl(v0.V8H(), v20.V8B(), v27.V8B());
1382 __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S());
1383 __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H());
1384 __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B());
1385 __ sadalp(v8.V1D(), v26.V2S());
1386 __ sadalp(v12.V2D(), v26.V4S());
1387 __ sadalp(v12.V2S(), v26.V4H());
1388 __ sadalp(v4.V4H(), v1.V8B());
1389 __ sadalp(v15.V4S(), v17.V8H());
1390 __ sadalp(v21.V8H(), v25.V16B());
1391 __ saddl(v5.V2D(), v10.V2S(), v14.V2S());
1392 __ saddl(v18.V4S(), v3.V4H(), v15.V4H());
1393 __ saddl(v15.V8H(), v2.V8B(), v23.V8B());
1394 __ saddl2(v16.V2D(), v16.V4S(), v27.V4S());
1395 __ saddl2(v6.V4S(), v24.V8H(), v0.V8H());
1396 __ saddl2(v7.V8H(), v20.V16B(), v28.V16B());
1397 __ saddlp(v10.V1D(), v25.V2S());
1398 __ saddlp(v15.V2D(), v16.V4S());
1399 __ saddlp(v18.V2S(), v10.V4H());
1400 __ saddlp(v29.V4H(), v26.V8B());
1401 __ saddlp(v10.V4S(), v1.V8H());
1402 __ saddlp(v0.V8H(), v21.V16B());
1403 __ saddlv(d12, v7.V4S());
1404 __ saddlv(h14, v28.V16B());
1405 __ saddlv(h30, v30.V8B());
1406 __ saddlv(s27, v3.V4H());
1407 __ saddlv(s16, v16.V8H());
1408 __ saddw(v24.V2D(), v11.V2D(), v18.V2S());
1409 __ saddw(v13.V4S(), v12.V4S(), v6.V4H());
1410 __ saddw(v19.V8H(), v19.V8H(), v7.V8B());
1411 __ saddw2(v27.V2D(), v9.V2D(), v26.V4S());
1412 __ saddw2(v19.V4S(), v23.V4S(), v21.V8H());
1413 __ saddw2(v15.V8H(), v25.V8H(), v30.V16B());
1414 __ shadd(v7.V16B(), v4.V16B(), v9.V16B());
1415 __ shadd(v29.V2S(), v25.V2S(), v24.V2S());
1416 __ shadd(v31.V4H(), v10.V4H(), v13.V4H());
1417 __ shadd(v21.V4S(), v16.V4S(), v8.V4S());
1418 __ shadd(v14.V8B(), v29.V8B(), v22.V8B());
1419 __ shadd(v19.V8H(), v24.V8H(), v20.V8H());
1420 __ shl(d22, d25, 23);
1421 __ shl(v5.V16B(), v17.V16B(), 7);
1422 __ shl(v2.V2D(), v4.V2D(), 21);
1423 __ shl(v4.V2S(), v3.V2S(), 26);
1424 __ shl(v3.V4H(), v28.V4H(), 8);
1425 __ shl(v4.V4S(), v31.V4S(), 24);
1426 __ shl(v18.V8B(), v16.V8B(), 2);
1427 __ shl(v0.V8H(), v11.V8H(), 3);
1428 __ shll(v5.V2D(), v24.V2S(), 32);
1429 __ shll(v26.V4S(), v20.V4H(), 16);
1430 __ shll(v5.V8H(), v9.V8B(), 8);
1431 __ shll2(v21.V2D(), v28.V4S(), 32);
1432 __ shll2(v22.V4S(), v1.V8H(), 16);
1433 __ shll2(v30.V8H(), v25.V16B(), 8);
1434 __ shrn(v5.V2S(), v1.V2D(), 28);
1435 __ shrn(v29.V4H(), v18.V4S(), 7);
1436 __ shrn(v17.V8B(), v29.V8H(), 2);
1437 __ shrn2(v5.V16B(), v30.V8H(), 3);
1438 __ shrn2(v24.V4S(), v1.V2D(), 1);
1439 __ shrn2(v5.V8H(), v14.V4S(), 16);
1440 __ shsub(v30.V16B(), v22.V16B(), v23.V16B());
1441 __ shsub(v22.V2S(), v27.V2S(), v25.V2S());
1442 __ shsub(v13.V4H(), v22.V4H(), v1.V4H());
1443 __ shsub(v10.V4S(), v8.V4S(), v23.V4S());
1444 __ shsub(v6.V8B(), v9.V8B(), v31.V8B());
1445 __ shsub(v8.V8H(), v31.V8H(), v8.V8H());
1446 __ sli(d19, d29, 20);
1447 __ sli(v9.V16B(), v24.V16B(), 0);
1448 __ sli(v22.V2D(), v9.V2D(), 10);
1449 __ sli(v11.V2S(), v27.V2S(), 20);
1450 __ sli(v16.V4H(), v15.V4H(), 5);
1451 __ sli(v8.V4S(), v8.V4S(), 25);
1452 __ sli(v10.V8B(), v30.V8B(), 0);
1453 __ sli(v7.V8H(), v28.V8H(), 6);
1454 __ smax(v18.V16B(), v8.V16B(), v1.V16B());
1455 __ smax(v30.V2S(), v5.V2S(), v1.V2S());
1456 __ smax(v17.V4H(), v25.V4H(), v19.V4H());
1457 __ smax(v1.V4S(), v24.V4S(), v31.V4S());
1458 __ smax(v17.V8B(), v24.V8B(), v24.V8B());
1459 __ smax(v11.V8H(), v26.V8H(), v10.V8H());
1460 __ smaxp(v12.V16B(), v14.V16B(), v7.V16B());
1461 __ smaxp(v31.V2S(), v24.V2S(), v6.V2S());
1462 __ smaxp(v10.V4H(), v29.V4H(), v10.V4H());
1463 __ smaxp(v18.V4S(), v11.V4S(), v7.V4S());
1464 __ smaxp(v21.V8B(), v0.V8B(), v18.V8B());
1465 __ smaxp(v26.V8H(), v8.V8H(), v15.V8H());
1466 __ smaxv(b4, v5.V16B());
1467 __ smaxv(b23, v0.V8B());
1468 __ smaxv(h6, v0.V4H());
1469 __ smaxv(h24, v8.V8H());
1470 __ smaxv(s3, v16.V4S());
1471 __ smin(v24.V16B(), v8.V16B(), v18.V16B());
1472 __ smin(v29.V2S(), v8.V2S(), v23.V2S());
1473 __ smin(v6.V4H(), v11.V4H(), v21.V4H());
1474 __ smin(v24.V4S(), v23.V4S(), v15.V4S());
1475 __ smin(v8.V8B(), v16.V8B(), v4.V8B());
1476 __ smin(v12.V8H(), v1.V8H(), v10.V8H());
1477 __ sminp(v13.V16B(), v18.V16B(), v28.V16B());
1478 __ sminp(v22.V2S(), v28.V2S(), v16.V2S());
1479 __ sminp(v15.V4H(), v12.V4H(), v5.V4H());
1480 __ sminp(v15.V4S(), v17.V4S(), v8.V4S());
1481 __ sminp(v21.V8B(), v2.V8B(), v6.V8B());
1482 __ sminp(v21.V8H(), v12.V8H(), v6.V8H());
1483 __ sminv(b8, v6.V16B());
1484 __ sminv(b6, v18.V8B());
1485 __ sminv(h20, v1.V4H());
1486 __ sminv(h7, v17.V8H());
1487 __ sminv(s21, v4.V4S());
1488 __ smlal(v24.V2D(), v14.V2S(), v21.V2S());
1489 __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2);
1490 __ smlal(v7.V4S(), v20.V4H(), v21.V4H());
1491 __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3);
1492 __ smlal(v29.V8H(), v14.V8B(), v1.V8B());
1493 __ smlal2(v30.V2D(), v26.V4S(), v16.V4S());
1494 __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0);
1495 __ smlal2(v17.V4S(), v6.V8H(), v3.V8H());
1496 __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7);
1497 __ smlal2(v30.V8H(), v16.V16B(), v29.V16B());
1498 __ smlsl(v1.V2D(), v20.V2S(), v17.V2S());
1499 __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3);
1500 __ smlsl(v0.V4S(), v26.V4H(), v1.V4H());
1501 __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5);
1502 __ smlsl(v4.V8H(), v0.V8B(), v26.V8B());
1503 __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S());
1504 __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1);
1505 __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H());
1506 __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6);
1507 __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B());
1508 __ smov(w21, v6.B(), 3);
1509 __ smov(w13, v26.H(), 7);
1510 __ smov(x24, v16.B(), 7);
1511 __ smov(x7, v4.H(), 3);
1512 __ smov(x29, v7.S(), 1);
1513 __ smull(v4.V2D(), v29.V2S(), v17.V2S());
1514 __ smull(v30.V2D(), v21.V2S(), v6.S(), 2);
1515 __ smull(v23.V4S(), v5.V4H(), v23.V4H());
1516 __ smull(v8.V4S(), v9.V4H(), v2.H(), 1);
1517 __ smull(v31.V8H(), v17.V8B(), v1.V8B());
1518 __ smull2(v3.V2D(), v3.V4S(), v23.V4S());
1519 __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1);
1520 __ smull2(v19.V4S(), v20.V8H(), v30.V8H());
1521 __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4);
1522 __ smull2(v25.V8H(), v8.V16B(), v27.V16B());
1523 __ sqabs(b3, b15);
1524 __ sqabs(d14, d9);
1525 __ sqabs(h31, h28);
1526 __ sqabs(s8, s0);
1527 __ sqabs(v14.V16B(), v7.V16B());
1528 __ sqabs(v23.V2D(), v19.V2D());
1529 __ sqabs(v10.V2S(), v24.V2S());
1530 __ sqabs(v31.V4H(), v19.V4H());
1531 __ sqabs(v23.V4S(), v0.V4S());
1532 __ sqabs(v29.V8B(), v23.V8B());
1533 __ sqabs(v17.V8H(), v21.V8H());
1534 __ sqadd(b9, b23, b13);
1535 __ sqadd(d2, d25, d26);
1536 __ sqadd(h7, h29, h25);
1537 __ sqadd(s11, s7, s24);
1538 __ sqadd(v20.V16B(), v16.V16B(), v29.V16B());
1539 __ sqadd(v23.V2D(), v30.V2D(), v28.V2D());
1540 __ sqadd(v8.V2S(), v19.V2S(), v2.V2S());
1541 __ sqadd(v20.V4H(), v12.V4H(), v31.V4H());
1542 __ sqadd(v14.V4S(), v15.V4S(), v17.V4S());
1543 __ sqadd(v2.V8B(), v29.V8B(), v13.V8B());
1544 __ sqadd(v7.V8H(), v19.V8H(), v14.V8H());
1545 __ sqdmlal(d15, s5, s30);
1546 __ sqdmlal(d24, s10, v2.S(), 3);
1547 __ sqdmlal(s9, h19, h8);
1548 __ sqdmlal(s14, h1, v12.H(), 3);
1549 __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S());
1550 __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1);
1551 __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H());
1552 __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1);
1553 __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S());
1554 __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0);
1555 __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H());
1556 __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4);
1557 __ sqdmlsl(d10, s29, s20);
1558 __ sqdmlsl(d10, s9, v10.S(), 1);
1559 __ sqdmlsl(s30, h9, h24);
1560 __ sqdmlsl(s13, h24, v6.H(), 1);
1561 __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S());
1562 __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3);
1563 __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H());
1564 __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4);
1565 __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S());
1566 __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0);
1567 __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H());
1568 __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0);
1569 __ sqdmulh(h17, h27, h12);
1570 __ sqdmulh(h16, h5, v11.H(), 0);
1571 __ sqdmulh(s1, s19, s16);
1572 __ sqdmulh(s1, s16, v2.S(), 0);
1573 __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S());
1574 __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0);
1575 __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H());
1576 __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5);
1577 __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S());
1578 __ sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3);
1579 __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H());
1580 __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3);
1581 __ sqdmull(d25, s2, s26);
1582 __ sqdmull(d30, s14, v5.S(), 1);
1583 __ sqdmull(s29, h18, h11);
1584 __ sqdmull(s11, h13, v7.H(), 6);
1585 __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S());
1586 __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1);
1587 __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H());
1588 __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1);
1589 __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S());
1590 __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2);
1591 __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H());
1592 __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3);
1593 __ sqneg(b2, b0);
1594 __ sqneg(d24, d2);
1595 __ sqneg(h29, h3);
1596 __ sqneg(s4, s9);
1597 __ sqneg(v14.V16B(), v29.V16B());
1598 __ sqneg(v30.V2D(), v12.V2D());
1599 __ sqneg(v28.V2S(), v26.V2S());
1600 __ sqneg(v4.V4H(), v4.V4H());
1601 __ sqneg(v9.V4S(), v8.V4S());
1602 __ sqneg(v20.V8B(), v20.V8B());
1603 __ sqneg(v27.V8H(), v10.V8H());
1604 __ sqrdmulh(h7, h24, h0);
1605 __ sqrdmulh(h14, h3, v4.H(), 6);
1606 __ sqrdmulh(s27, s19, s24);
1607 __ sqrdmulh(s31, s21, v4.S(), 0);
1608 __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S());
1609 __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0);
1610 __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H());
1611 __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6);
1612 __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S());
1613 __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1);
1614 __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H());
1615 __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2);
1616 __ sqrshl(b8, b21, b13);
1617 __ sqrshl(d29, d7, d20);
1618 __ sqrshl(h28, h14, h10);
1619 __ sqrshl(s26, s18, s2);
1620 __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B());
1621 __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D());
1622 __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S());
1623 __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H());
1624 __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S());
1625 __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B());
1626 __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H());
1627 __ sqrshrn(b6, h21, 4);
1628 __ sqrshrn(h14, s17, 11);
1629 __ sqrshrn(s25, d27, 10);
1630 __ sqrshrn(v6.V2S(), v13.V2D(), 18);
1631 __ sqrshrn(v5.V4H(), v9.V4S(), 15);
1632 __ sqrshrn(v19.V8B(), v12.V8H(), 1);
1633 __ sqrshrn2(v19.V16B(), v21.V8H(), 7);
1634 __ sqrshrn2(v29.V4S(), v24.V2D(), 13);
1635 __ sqrshrn2(v12.V8H(), v2.V4S(), 10);
1636 __ sqrshrun(b16, h9, 5);
1637 __ sqrshrun(h3, s24, 15);
1638 __ sqrshrun(s16, d18, 8);
1639 __ sqrshrun(v28.V2S(), v23.V2D(), 8);
1640 __ sqrshrun(v31.V4H(), v25.V4S(), 10);
1641 __ sqrshrun(v19.V8B(), v23.V8H(), 2);
1642 __ sqrshrun2(v24.V16B(), v0.V8H(), 8);
1643 __ sqrshrun2(v22.V4S(), v1.V2D(), 23);
1644 __ sqrshrun2(v28.V8H(), v21.V4S(), 13);
1645 __ sqshl(b6, b21, b8);
1646 __ sqshl(b11, b26, 2);
1647 __ sqshl(d29, d0, d4);
1648 __ sqshl(d21, d7, 35);
1649 __ sqshl(h20, h25, h17);
1650 __ sqshl(h20, h0, 8);
1651 __ sqshl(s29, s13, s4);
1652 __ sqshl(s10, s11, 20);
1653 __ sqshl(v8.V16B(), v18.V16B(), v28.V16B());
1654 __ sqshl(v29.V16B(), v29.V16B(), 2);
1655 __ sqshl(v8.V2D(), v31.V2D(), v16.V2D());
1656 __ sqshl(v7.V2D(), v14.V2D(), 37);
1657 __ sqshl(v0.V2S(), v26.V2S(), v7.V2S());
1658 __ sqshl(v5.V2S(), v11.V2S(), 19);
1659 __ sqshl(v11.V4H(), v30.V4H(), v0.V4H());
1660 __ sqshl(v1.V4H(), v18.V4H(), 7);
1661 __ sqshl(v22.V4S(), v3.V4S(), v30.V4S());
1662 __ sqshl(v16.V4S(), v15.V4S(), 28);
1663 __ sqshl(v6.V8B(), v28.V8B(), v25.V8B());
1664 __ sqshl(v0.V8B(), v15.V8B(), 0);
1665 __ sqshl(v6.V8H(), v16.V8H(), v30.V8H());
1666 __ sqshl(v3.V8H(), v20.V8H(), 14);
1667 __ sqshlu(b13, b14, 6);
1668 __ sqshlu(d0, d16, 44);
1669 __ sqshlu(h5, h29, 15);
1670 __ sqshlu(s29, s8, 13);
1671 __ sqshlu(v27.V16B(), v20.V16B(), 2);
1672 __ sqshlu(v24.V2D(), v12.V2D(), 11);
1673 __ sqshlu(v12.V2S(), v19.V2S(), 22);
1674 __ sqshlu(v8.V4H(), v12.V4H(), 11);
1675 __ sqshlu(v18.V4S(), v3.V4S(), 8);
1676 __ sqshlu(v3.V8B(), v10.V8B(), 1);
1677 __ sqshlu(v30.V8H(), v24.V8H(), 4);
1678 __ sqshrn(b1, h28, 1);
1679 __ sqshrn(h31, s7, 10);
1680 __ sqshrn(s4, d10, 24);
1681 __ sqshrn(v10.V2S(), v1.V2D(), 29);
1682 __ sqshrn(v3.V4H(), v13.V4S(), 14);
1683 __ sqshrn(v27.V8B(), v6.V8H(), 7);
1684 __ sqshrn2(v14.V16B(), v23.V8H(), 1);
1685 __ sqshrn2(v25.V4S(), v22.V2D(), 27);
1686 __ sqshrn2(v31.V8H(), v12.V4S(), 10);
1687 __ sqshrun(b9, h0, 1);
1688 __ sqshrun(h11, s6, 7);
1689 __ sqshrun(s13, d12, 13);
1690 __ sqshrun(v10.V2S(), v30.V2D(), 1);
1691 __ sqshrun(v31.V4H(), v3.V4S(), 11);
1692 __ sqshrun(v28.V8B(), v30.V8H(), 8);
1693 __ sqshrun2(v16.V16B(), v27.V8H(), 3);
1694 __ sqshrun2(v27.V4S(), v14.V2D(), 18);
1695 __ sqshrun2(v23.V8H(), v14.V4S(), 1);
1696 __ sqsub(b19, b29, b11);
1697 __ sqsub(d21, d31, d6);
1698 __ sqsub(h18, h10, h19);
1699 __ sqsub(s6, s5, s0);
1700 __ sqsub(v21.V16B(), v22.V16B(), v0.V16B());
1701 __ sqsub(v22.V2D(), v10.V2D(), v17.V2D());
1702 __ sqsub(v8.V2S(), v21.V2S(), v2.V2S());
1703 __ sqsub(v18.V4H(), v25.V4H(), v27.V4H());
1704 __ sqsub(v13.V4S(), v3.V4S(), v6.V4S());
1705 __ sqsub(v28.V8B(), v29.V8B(), v16.V8B());
1706 __ sqsub(v17.V8H(), v6.V8H(), v10.V8H());
1707 __ sqxtn(b27, h26);
1708 __ sqxtn(h17, s11);
1709 __ sqxtn(s22, d31);
1710 __ sqxtn(v26.V2S(), v5.V2D());
1711 __ sqxtn(v13.V4H(), v7.V4S());
1712 __ sqxtn(v19.V8B(), v19.V8H());
1713 __ sqxtn2(v19.V16B(), v3.V8H());
1714 __ sqxtn2(v23.V4S(), v1.V2D());
1715 __ sqxtn2(v13.V8H(), v3.V4S());
1716 __ sqxtun(b26, h9);
1717 __ sqxtun(h19, s12);
1718 __ sqxtun(s3, d6);
1719 __ sqxtun(v29.V2S(), v26.V2D());
1720 __ sqxtun(v26.V4H(), v10.V4S());
1721 __ sqxtun(v7.V8B(), v29.V8H());
1722 __ sqxtun2(v21.V16B(), v14.V8H());
1723 __ sqxtun2(v24.V4S(), v15.V2D());
1724 __ sqxtun2(v30.V8H(), v1.V4S());
1725 __ srhadd(v21.V16B(), v17.V16B(), v15.V16B());
1726 __ srhadd(v28.V2S(), v21.V2S(), v29.V2S());
1727 __ srhadd(v9.V4H(), v1.V4H(), v30.V4H());
1728 __ srhadd(v24.V4S(), v0.V4S(), v2.V4S());
1729 __ srhadd(v6.V8B(), v17.V8B(), v15.V8B());
1730 __ srhadd(v5.V8H(), v7.V8H(), v21.V8H());
1731 __ sri(d14, d14, 49);
1732 __ sri(v23.V16B(), v8.V16B(), 4);
1733 __ sri(v20.V2D(), v13.V2D(), 20);
1734 __ sri(v16.V2S(), v2.V2S(), 24);
1735 __ sri(v5.V4H(), v23.V4H(), 11);
1736 __ sri(v27.V4S(), v15.V4S(), 23);
1737 __ sri(v19.V8B(), v29.V8B(), 4);
1738 __ sri(v7.V8H(), v29.V8H(), 3);
1739 __ srshl(d2, d9, d26);
1740 __ srshl(v29.V16B(), v17.V16B(), v11.V16B());
1741 __ srshl(v8.V2D(), v15.V2D(), v4.V2D());
1742 __ srshl(v25.V2S(), v17.V2S(), v8.V2S());
1743 __ srshl(v19.V4H(), v7.V4H(), v7.V4H());
1744 __ srshl(v13.V4S(), v2.V4S(), v17.V4S());
1745 __ srshl(v22.V8B(), v6.V8B(), v21.V8B());
1746 __ srshl(v10.V8H(), v17.V8H(), v4.V8H());
1747 __ srshr(d21, d18, 45);
1748 __ srshr(v3.V16B(), v11.V16B(), 7);
1749 __ srshr(v21.V2D(), v26.V2D(), 53);
1750 __ srshr(v11.V2S(), v5.V2S(), 28);
1751 __ srshr(v7.V4H(), v18.V4H(), 12);
1752 __ srshr(v7.V4S(), v3.V4S(), 30);
1753 __ srshr(v14.V8B(), v2.V8B(), 6);
1754 __ srshr(v21.V8H(), v20.V8H(), 3);
1755 __ srsra(d21, d30, 63);
1756 __ srsra(v27.V16B(), v30.V16B(), 6);
1757 __ srsra(v20.V2D(), v12.V2D(), 27);
1758 __ srsra(v0.V2S(), v17.V2S(), 5);
1759 __ srsra(v14.V4H(), v16.V4H(), 15);
1760 __ srsra(v18.V4S(), v3.V4S(), 20);
1761 __ srsra(v21.V8B(), v1.V8B(), 1);
1762 __ srsra(v31.V8H(), v25.V8H(), 2);
1763 __ sshl(d1, d13, d9);
1764 __ sshl(v17.V16B(), v31.V16B(), v15.V16B());
1765 __ sshl(v13.V2D(), v16.V2D(), v0.V2D());
1766 __ sshl(v0.V2S(), v7.V2S(), v22.V2S());
1767 __ sshl(v23.V4H(), v19.V4H(), v4.V4H());
1768 __ sshl(v5.V4S(), v5.V4S(), v11.V4S());
1769 __ sshl(v23.V8B(), v27.V8B(), v7.V8B());
1770 __ sshl(v29.V8H(), v10.V8H(), v5.V8H());
1771 __ sshll(v0.V2D(), v2.V2S(), 23);
1772 __ sshll(v11.V4S(), v8.V4H(), 8);
1773 __ sshll(v4.V8H(), v29.V8B(), 1);
1774 __ sshll2(v10.V2D(), v4.V4S(), 14);
1775 __ sshll2(v26.V4S(), v31.V8H(), 6);
1776 __ sshll2(v3.V8H(), v26.V16B(), 4);
1777 __ sshr(d19, d21, 20);
1778 __ sshr(v15.V16B(), v23.V16B(), 5);
1779 __ sshr(v17.V2D(), v14.V2D(), 38);
1780 __ sshr(v3.V2S(), v29.V2S(), 23);
1781 __ sshr(v23.V4H(), v27.V4H(), 4);
1782 __ sshr(v28.V4S(), v3.V4S(), 4);
1783 __ sshr(v14.V8B(), v2.V8B(), 6);
1784 __ sshr(v3.V8H(), v8.V8H(), 6);
1785 __ ssra(d12, d28, 44);
1786 __ ssra(v29.V16B(), v31.V16B(), 4);
1787 __ ssra(v3.V2D(), v0.V2D(), 24);
1788 __ ssra(v14.V2S(), v28.V2S(), 6);
1789 __ ssra(v18.V4H(), v8.V4H(), 7);
1790 __ ssra(v31.V4S(), v14.V4S(), 24);
1791 __ ssra(v28.V8B(), v26.V8B(), 5);
1792 __ ssra(v9.V8H(), v9.V8H(), 14);
1793 __ ssubl(v13.V2D(), v14.V2S(), v3.V2S());
1794 __ ssubl(v5.V4S(), v16.V4H(), v8.V4H());
1795 __ ssubl(v0.V8H(), v28.V8B(), v6.V8B());
1796 __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S());
1797 __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H());
1798 __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B());
1799 __ ssubw(v25.V2D(), v23.V2D(), v26.V2S());
1800 __ ssubw(v21.V4S(), v18.V4S(), v24.V4H());
1801 __ ssubw(v30.V8H(), v22.V8H(), v3.V8B());
1802 __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S());
1803 __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H());
1804 __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B());
1805 __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
1806 __ st1(v10.V16B(),
1807 v11.V16B(),
1808 v12.V16B(),
1809 v13.V16B(),
1810 MemOperand(x1, x2, PostIndex));
1811 __ st1(v27.V16B(),
1812 v28.V16B(),
1813 v29.V16B(),
1814 v30.V16B(),
1815 MemOperand(x1, 64, PostIndex));
1816 __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0));
1817 __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1818 __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex));
1819 __ st1(v7.V16B(), v8.V16B(), MemOperand(x0));
1820 __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex));
1821 __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex));
1822 __ st1(v23.V16B(), MemOperand(x0));
1823 __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex));
1824 __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex));
1825 __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0));
1826 __ st1(v12.V1D(),
1827 v13.V1D(),
1828 v14.V1D(),
1829 v15.V1D(),
1830 MemOperand(x1, x2, PostIndex));
1831 __ st1(v30.V1D(),
1832 v31.V1D(),
1833 v0.V1D(),
1834 v1.V1D(),
1835 MemOperand(x1, 32, PostIndex));
1836 __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0));
1837 __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex));
1838 __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex));
1839 __ st1(v18.V1D(), v19.V1D(), MemOperand(x0));
1840 __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex));
1841 __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex));
1842 __ st1(v4.V1D(), MemOperand(x0));
1843 __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex));
1844 __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex));
1845 __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0));
1846 __ st1(v22.V2D(),
1847 v23.V2D(),
1848 v24.V2D(),
1849 v25.V2D(),
1850 MemOperand(x1, x2, PostIndex));
1851 __ st1(v28.V2D(),
1852 v29.V2D(),
1853 v30.V2D(),
1854 v31.V2D(),
1855 MemOperand(x1, 64, PostIndex));
1856 __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1857 __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex));
1858 __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex));
1859 __ st1(v21.V2D(), v22.V2D(), MemOperand(x0));
1860 __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1861 __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, PostIndex));
1862 __ st1(v21.V2D(), MemOperand(x0));
1863 __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex));
1864 __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex));
1865 __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0));
1866 __ st1(v8.V2S(),
1867 v9.V2S(),
1868 v10.V2S(),
1869 v11.V2S(),
1870 MemOperand(x1, x2, PostIndex));
1871 __ st1(v15.V2S(),
1872 v16.V2S(),
1873 v17.V2S(),
1874 v18.V2S(),
1875 MemOperand(x1, 32, PostIndex));
1876 __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0));
1877 __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex));
1878 __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex));
1879 __ st1(v28.V2S(), v29.V2S(), MemOperand(x0));
1880 __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex));
1881 __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex));
1882 __ st1(v6.V2S(), MemOperand(x0));
1883 __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex));
1884 __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex));
1885 __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0));
1886 __ st1(v9.V4H(),
1887 v10.V4H(),
1888 v11.V4H(),
1889 v12.V4H(),
1890 MemOperand(x1, x2, PostIndex));
1891 __ st1(v25.V4H(),
1892 v26.V4H(),
1893 v27.V4H(),
1894 v28.V4H(),
1895 MemOperand(x1, 32, PostIndex));
1896 __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0));
1897 __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex));
1898 __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex));
1899 __ st1(v13.V4H(), v14.V4H(), MemOperand(x0));
1900 __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex));
1901 __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex));
1902 __ st1(v16.V4H(), MemOperand(x0));
1903 __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex));
1904 __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex));
1905 __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0));
1906 __ st1(v25.V4S(),
1907 v26.V4S(),
1908 v27.V4S(),
1909 v28.V4S(),
1910 MemOperand(x1, x2, PostIndex));
1911 __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex));
1912 __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0));
1913 __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1914 __ st1(v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 48, PostIndex));
1915 __ st1(v17.V4S(), v18.V4S(), MemOperand(x0));
1916 __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1917 __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex));
1918 __ st1(v26.V4S(), MemOperand(x0));
1919 __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex));
1920 __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex));
1921 __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
1922 __ st1(v10.V8B(),
1923 v11.V8B(),
1924 v12.V8B(),
1925 v13.V8B(),
1926 MemOperand(x1, x2, PostIndex));
1927 __ st1(v15.V8B(),
1928 v16.V8B(),
1929 v17.V8B(),
1930 v18.V8B(),
1931 MemOperand(x1, 32, PostIndex));
1932 __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0));
1933 __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1934 __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex));
1935 __ st1(v12.V8B(), v13.V8B(), MemOperand(x0));
1936 __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1937 __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex));
1938 __ st1(v16.V8B(), MemOperand(x0));
1939 __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex));
1940 __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex));
1941 __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0));
1942 __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex));
1943 __ st1(v26.V8H(),
1944 v27.V8H(),
1945 v28.V8H(),
1946 v29.V8H(),
1947 MemOperand(x1, 64, PostIndex));
1948 __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0));
1949 __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1950 __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
1951 __ st1(v26.V8H(), v27.V8H(), MemOperand(x0));
1952 __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex));
1953 __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex));
1954 __ st1(v29.V8H(), MemOperand(x0));
1955 __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex));
1956 __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex));
1957 __ st1(v19.B(), 15, MemOperand(x0));
1958 __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex));
1959 __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex));
1960 __ st1(v13.D(), 0, MemOperand(x0));
1961 __ st1(v30.D(), 0, MemOperand(x1, x2, PostIndex));
1962 __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex));
1963 __ st1(v22.H(), 0, MemOperand(x0));
1964 __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex));
1965 __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex));
1966 __ st1(v0.S(), 0, MemOperand(x0));
1967 __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex));
1968 __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex));
1969 __ st2(v7.V16B(), v8.V16B(), MemOperand(x0));
1970 __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex));
1971 __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex));
1972 __ st2(v14.V2D(), v15.V2D(), MemOperand(x0));
1973 __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex));
1974 __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex));
1975 __ st2(v22.V2S(), v23.V2S(), MemOperand(x0));
1976 __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex));
1977 __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex));
1978 __ st2(v23.V4H(), v24.V4H(), MemOperand(x0));
1979 __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex));
1980 __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex));
1981 __ st2(v17.V4S(), v18.V4S(), MemOperand(x0));
1982 __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex));
1983 __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex));
1984 __ st2(v31.V8B(), v0.V8B(), MemOperand(x0));
1985 __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1986 __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex));
1987 __ st2(v7.V8H(), v8.V8H(), MemOperand(x0));
1988 __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1989 __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
1990 __ st2(v8.B(), v9.B(), 15, MemOperand(x0));
1991 __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex));
1992 __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex));
1993 __ st2(v25.D(), v26.D(), 0, MemOperand(x0));
1994 __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1995 __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex));
1996 __ st2(v4.H(), v5.H(), 3, MemOperand(x0));
1997 __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex));
1998 __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex));
1999 __ st2(v14.S(), v15.S(), 3, MemOperand(x0));
2000 __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex));
2001 __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex));
2002 __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0));
2003 __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
2004 __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex));
2005 __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2006 __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex));
2007 __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex));
2008 __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0));
2009 __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex));
2010 __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex));
2011 __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0));
2012 __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
2013 __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex));
2014 __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0));
2015 __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex));
2016 __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex));
2017 __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2018 __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex));
2019 __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex));
2020 __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0));
2021 __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex));
2022 __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
2023 __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0));
2024 __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex));
2025 __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex));
2026 __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0));
2027 __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex));
2028 __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex));
2029 __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0));
2030 __ st3(v14.H(), v15.H(), v16.H(), 5, MemOperand(x1, x2, PostIndex));
2031 __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex));
2032 __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0));
2033 __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex));
2034 __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex));
2035 __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0));
2036 __ st4(v24.V16B(),
2037 v25.V16B(),
2038 v26.V16B(),
2039 v27.V16B(),
2040 MemOperand(x1, x2, PostIndex));
2041 __ st4(v15.V16B(),
2042 v16.V16B(),
2043 v17.V16B(),
2044 v18.V16B(),
2045 MemOperand(x1, 64, PostIndex));
2046 __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2047 __ st4(v17.V2D(),
2048 v18.V2D(),
2049 v19.V2D(),
2050 v20.V2D(),
2051 MemOperand(x1, x2, PostIndex));
2052 __ st4(v9.V2D(),
2053 v10.V2D(),
2054 v11.V2D(),
2055 v12.V2D(),
2056 MemOperand(x1, 64, PostIndex));
2057 __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0));
2058 __ st4(v15.V2S(),
2059 v16.V2S(),
2060 v17.V2S(),
2061 v18.V2S(),
2062 MemOperand(x1, x2, PostIndex));
2063 __ st4(v24.V2S(),
2064 v25.V2S(),
2065 v26.V2S(),
2066 v27.V2S(),
2067 MemOperand(x1, 32, PostIndex));
2068 __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0));
2069 __ st4(v18.V4H(),
2070 v19.V4H(),
2071 v20.V4H(),
2072 v21.V4H(),
2073 MemOperand(x1, x2, PostIndex));
2074 __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
2075 __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0));
2076 __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex));
2077 __ st4(v15.V4S(),
2078 v16.V4S(),
2079 v17.V4S(),
2080 v18.V4S(),
2081 MemOperand(x1, 64, PostIndex));
2082 __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2083 __ st4(v25.V8B(),
2084 v26.V8B(),
2085 v27.V8B(),
2086 v28.V8B(),
2087 MemOperand(x1, x2, PostIndex));
2088 __ st4(v19.V8B(),
2089 v20.V8B(),
2090 v21.V8B(),
2091 v22.V8B(),
2092 MemOperand(x1, 32, PostIndex));
2093 __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0));
2094 __ st4(v15.V8H(),
2095 v16.V8H(),
2096 v17.V8H(),
2097 v18.V8H(),
2098 MemOperand(x1, x2, PostIndex));
2099 __ st4(v31.V8H(),
2100 v0.V8H(),
2101 v1.V8H(),
2102 v2.V8H(),
2103 MemOperand(x1, 64, PostIndex));
2104 __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, MemOperand(x0));
2105 __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex));
2106 __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex));
2107 __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0));
2108 __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex));
2109 __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex));
2110 __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0));
2111 __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex));
2112 __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex));
2113 __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0));
2114 __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex));
2115 __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex));
2116 __ sub(d12, d17, d2);
2117 __ sub(v20.V16B(), v24.V16B(), v8.V16B());
2118 __ sub(v8.V2D(), v29.V2D(), v5.V2D());
2119 __ sub(v2.V2S(), v28.V2S(), v24.V2S());
2120 __ sub(v24.V4H(), v10.V4H(), v4.V4H());
2121 __ sub(v28.V4S(), v4.V4S(), v17.V4S());
2122 __ sub(v16.V8B(), v27.V8B(), v2.V8B());
2123 __ sub(v20.V8H(), v10.V8H(), v13.V8H());
2124 __ subhn(v5.V2S(), v14.V2D(), v13.V2D());
2125 __ subhn(v10.V4H(), v5.V4S(), v8.V4S());
2126 __ subhn(v6.V8B(), v10.V8H(), v22.V8H());
2127 __ subhn2(v11.V16B(), v6.V8H(), v9.V8H());
2128 __ subhn2(v25.V4S(), v18.V2D(), v24.V2D());
2129 __ subhn2(v20.V8H(), v21.V4S(), v1.V4S());
2130 __ suqadd(b25, b11);
2131 __ suqadd(d13, d1);
2132 __ suqadd(h0, h9);
2133 __ suqadd(s22, s8);
2134 __ suqadd(v24.V16B(), v27.V16B());
2135 __ suqadd(v26.V2D(), v14.V2D());
2136 __ suqadd(v7.V2S(), v10.V2S());
2137 __ suqadd(v25.V4H(), v12.V4H());
2138 __ suqadd(v4.V4S(), v3.V4S());
2139 __ suqadd(v14.V8B(), v18.V8B());
2140 __ suqadd(v31.V8H(), v8.V8H());
2141 __ sxtl(v16.V2D(), v20.V2S());
2142 __ sxtl(v27.V4S(), v28.V4H());
2143 __ sxtl(v0.V8H(), v22.V8B());
2144 __ sxtl2(v6.V2D(), v7.V4S());
2145 __ sxtl2(v9.V4S(), v27.V8H());
2146 __ sxtl2(v16.V8H(), v16.V16B());
2147 __ tbl(v25.V16B(),
2148 v17.V16B(),
2149 v18.V16B(),
2150 v19.V16B(),
2151 v20.V16B(),
2152 v22.V16B());
2153 __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), v4.V16B());
2154 __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
2155 __ tbl(v20.V16B(), v15.V16B(), v4.V16B());
2156 __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B());
2157 __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B());
2158 __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B());
2159 __ tbl(v11.V8B(), v19.V16B(), v30.V8B());
2160 __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B());
2161 __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B());
2162 __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B());
2163 __ tbx(v13.V16B(), v3.V16B(), v20.V16B());
2164 __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B());
2165 __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B());
2166 __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B());
2167 __ tbx(v16.V8B(), v11.V16B(), v29.V8B());
2168 __ trn1(v19.V16B(), v24.V16B(), v12.V16B());
2169 __ trn1(v2.V2D(), v7.V2D(), v10.V2D());
2170 __ trn1(v22.V2S(), v0.V2S(), v21.V2S());
2171 __ trn1(v12.V4H(), v15.V4H(), v20.V4H());
2172 __ trn1(v30.V4S(), v17.V4S(), v9.V4S());
2173 __ trn1(v12.V8B(), v19.V8B(), v29.V8B());
2174 __ trn1(v23.V8H(), v8.V8H(), v9.V8H());
2175 __ trn2(v28.V16B(), v30.V16B(), v25.V16B());
2176 __ trn2(v7.V2D(), v27.V2D(), v7.V2D());
2177 __ trn2(v30.V2S(), v16.V2S(), v19.V2S());
2178 __ trn2(v24.V4H(), v6.V4H(), v25.V4H());
2179 __ trn2(v2.V4S(), v19.V4S(), v11.V4S());
2180 __ trn2(v25.V8B(), v27.V8B(), v18.V8B());
2181 __ trn2(v12.V8H(), v4.V8H(), v15.V8H());
2182 __ uaba(v31.V16B(), v12.V16B(), v28.V16B());
2183 __ uaba(v18.V2S(), v5.V2S(), v14.V2S());
2184 __ uaba(v9.V4H(), v20.V4H(), v21.V4H());
2185 __ uaba(v6.V4S(), v20.V4S(), v2.V4S());
2186 __ uaba(v16.V8B(), v12.V8B(), v5.V8B());
2187 __ uaba(v15.V8H(), v26.V8H(), v30.V8H());
2188 __ uabal(v10.V2D(), v18.V2S(), v15.V2S());
2189 __ uabal(v30.V4S(), v19.V4H(), v7.V4H());
2190 __ uabal(v4.V8H(), v27.V8B(), v0.V8B());
2191 __ uabal2(v19.V2D(), v12.V4S(), v2.V4S());
2192 __ uabal2(v26.V4S(), v5.V8H(), v12.V8H());
2193 __ uabal2(v19.V8H(), v20.V16B(), v28.V16B());
2194 __ uabd(v18.V16B(), v4.V16B(), v21.V16B());
2195 __ uabd(v30.V2S(), v21.V2S(), v16.V2S());
2196 __ uabd(v8.V4H(), v28.V4H(), v25.V4H());
2197 __ uabd(v28.V4S(), v12.V4S(), v21.V4S());
2198 __ uabd(v19.V8B(), v16.V8B(), v28.V8B());
2199 __ uabd(v9.V8H(), v12.V8H(), v29.V8H());
2200 __ uabdl(v26.V2D(), v0.V2S(), v8.V2S());
2201 __ uabdl(v29.V4S(), v31.V4H(), v25.V4H());
2202 __ uabdl(v27.V8H(), v29.V8B(), v14.V8B());
2203 __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S());
2204 __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H());
2205 __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B());
2206 __ uadalp(v9.V1D(), v15.V2S());
2207 __ uadalp(v14.V2D(), v12.V4S());
2208 __ uadalp(v28.V2S(), v12.V4H());
2209 __ uadalp(v0.V4H(), v17.V8B());
2210 __ uadalp(v1.V4S(), v29.V8H());
2211 __ uadalp(v15.V8H(), v22.V16B());
2212 __ uaddl(v1.V2D(), v20.V2S(), v27.V2S());
2213 __ uaddl(v31.V4S(), v25.V4H(), v5.V4H());
2214 __ uaddl(v12.V8H(), v3.V8B(), v3.V8B());
2215 __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S());
2216 __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H());
2217 __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B());
2218 __ uaddlp(v7.V1D(), v9.V2S());
2219 __ uaddlp(v26.V2D(), v4.V4S());
2220 __ uaddlp(v28.V2S(), v1.V4H());
2221 __ uaddlp(v20.V4H(), v31.V8B());
2222 __ uaddlp(v16.V4S(), v17.V8H());
2223 __ uaddlp(v6.V8H(), v2.V16B());
2224 __ uaddlv(d28, v22.V4S());
2225 __ uaddlv(h0, v19.V16B());
2226 __ uaddlv(h30, v30.V8B());
2227 __ uaddlv(s24, v18.V4H());
2228 __ uaddlv(s10, v0.V8H());
2229 __ uaddw(v9.V2D(), v17.V2D(), v14.V2S());
2230 __ uaddw(v9.V4S(), v25.V4S(), v3.V4H());
2231 __ uaddw(v18.V8H(), v1.V8H(), v0.V8B());
2232 __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S());
2233 __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H());
2234 __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B());
2235 __ uhadd(v13.V16B(), v9.V16B(), v3.V16B());
2236 __ uhadd(v17.V2S(), v25.V2S(), v24.V2S());
2237 __ uhadd(v25.V4H(), v23.V4H(), v13.V4H());
2238 __ uhadd(v0.V4S(), v20.V4S(), v16.V4S());
2239 __ uhadd(v5.V8B(), v5.V8B(), v25.V8B());
2240 __ uhadd(v3.V8H(), v29.V8H(), v18.V8H());
2241 __ uhsub(v1.V16B(), v22.V16B(), v13.V16B());
2242 __ uhsub(v14.V2S(), v30.V2S(), v30.V2S());
2243 __ uhsub(v29.V4H(), v14.V4H(), v17.V4H());
2244 __ uhsub(v26.V4S(), v5.V4S(), v18.V4S());
2245 __ uhsub(v3.V8B(), v7.V8B(), v12.V8B());
2246 __ uhsub(v25.V8H(), v21.V8H(), v5.V8H());
2247 __ umax(v28.V16B(), v12.V16B(), v6.V16B());
2248 __ umax(v20.V2S(), v19.V2S(), v26.V2S());
2249 __ umax(v0.V4H(), v31.V4H(), v18.V4H());
2250 __ umax(v6.V4S(), v21.V4S(), v28.V4S());
2251 __ umax(v0.V8B(), v2.V8B(), v20.V8B());
2252 __ umax(v4.V8H(), v11.V8H(), v22.V8H());
2253 __ umaxp(v1.V16B(), v6.V16B(), v29.V16B());
2254 __ umaxp(v19.V2S(), v17.V2S(), v27.V2S());
2255 __ umaxp(v21.V4H(), v16.V4H(), v7.V4H());
2256 __ umaxp(v9.V4S(), v20.V4S(), v29.V4S());
2257 __ umaxp(v13.V8B(), v1.V8B(), v16.V8B());
2258 __ umaxp(v19.V8H(), v23.V8H(), v26.V8H());
2259 __ umaxv(b17, v30.V16B());
2260 __ umaxv(b23, v12.V8B());
2261 __ umaxv(h31, v15.V4H());
2262 __ umaxv(h15, v25.V8H());
2263 __ umaxv(s18, v21.V4S());
2264 __ umin(v22.V16B(), v0.V16B(), v18.V16B());
2265 __ umin(v1.V2S(), v21.V2S(), v16.V2S());
2266 __ umin(v17.V4H(), v4.V4H(), v25.V4H());
2267 __ umin(v24.V4S(), v26.V4S(), v13.V4S());
2268 __ umin(v20.V8B(), v1.V8B(), v5.V8B());
2269 __ umin(v26.V8H(), v25.V8H(), v23.V8H());
2270 __ uminp(v5.V16B(), v1.V16B(), v23.V16B());
2271 __ uminp(v7.V2S(), v26.V2S(), v30.V2S());
2272 __ uminp(v9.V4H(), v5.V4H(), v25.V4H());
2273 __ uminp(v23.V4S(), v10.V4S(), v1.V4S());
2274 __ uminp(v4.V8B(), v29.V8B(), v14.V8B());
2275 __ uminp(v21.V8H(), v0.V8H(), v14.V8H());
2276 __ uminv(b0, v17.V16B());
2277 __ uminv(b0, v31.V8B());
2278 __ uminv(h24, v0.V4H());
2279 __ uminv(h29, v14.V8H());
2280 __ uminv(s30, v3.V4S());
2281 __ umlal(v11.V2D(), v11.V2S(), v24.V2S());
2282 __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3);
2283 __ umlal(v0.V4S(), v9.V4H(), v26.V4H());
2284 __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4);
2285 __ umlal(v16.V8H(), v21.V8B(), v6.V8B());
2286 __ umlal2(v17.V2D(), v19.V4S(), v23.V4S());
2287 __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0);
2288 __ umlal2(v16.V4S(), v8.V8H(), v15.V8H());
2289 __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5);
2290 __ umlal2(v30.V8H(), v1.V16B(), v17.V16B());
2291 __ umlsl(v18.V2D(), v19.V2S(), v28.V2S());
2292 __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0);
2293 __ umlsl(v24.V4S(), v8.V4H(), v4.V4H());
2294 __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4);
2295 __ umlsl(v28.V8H(), v14.V8B(), v20.V8B());
2296 __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S());
2297 __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2);
2298 __ umlsl2(v3.V4S(), v11.V8H(), v9.V8H());
2299 __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4);
2300 __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B());
2301 __ umov(x30, v25.D(), 1);
2302 __ umull(v12.V2D(), v10.V2S(), v29.V2S());
2303 __ umull(v22.V2D(), v30.V2S(), v5.S(), 3);
2304 __ umull(v7.V4S(), v0.V4H(), v25.V4H());
2305 __ umull(v11.V4S(), v13.V4H(), v3.H(), 2);
2306 __ umull(v25.V8H(), v16.V8B(), v10.V8B());
2307 __ umull2(v17.V2D(), v3.V4S(), v26.V4S());
2308 __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3);
2309 __ umull2(v12.V4S(), v17.V8H(), v23.V8H());
2310 __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2);
2311 __ umull2(v5.V8H(), v12.V16B(), v17.V16B());
2312 __ uqadd(b30, b4, b28);
2313 __ uqadd(d27, d20, d16);
2314 __ uqadd(h7, h14, h28);
2315 __ uqadd(s28, s17, s4);
2316 __ uqadd(v19.V16B(), v22.V16B(), v21.V16B());
2317 __ uqadd(v16.V2D(), v4.V2D(), v11.V2D());
2318 __ uqadd(v20.V2S(), v14.V2S(), v4.V2S());
2319 __ uqadd(v5.V4H(), v0.V4H(), v16.V4H());
2320 __ uqadd(v21.V4S(), v31.V4S(), v9.V4S());
2321 __ uqadd(v23.V8B(), v24.V8B(), v3.V8B());
2322 __ uqadd(v17.V8H(), v27.V8H(), v11.V8H());
2323 __ uqrshl(b10, b22, b10);
2324 __ uqrshl(d29, d5, d11);
2325 __ uqrshl(h27, h24, h30);
2326 __ uqrshl(s10, s13, s8);
2327 __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B());
2328 __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D());
2329 __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S());
2330 __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H());
2331 __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S());
2332 __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B());
2333 __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H());
2334 __ uqrshrn(b11, h26, 4);
2335 __ uqrshrn(h7, s30, 5);
2336 __ uqrshrn(s10, d8, 21);
2337 __ uqrshrn(v15.V2S(), v6.V2D(), 11);
2338 __ uqrshrn(v5.V4H(), v26.V4S(), 12);
2339 __ uqrshrn(v28.V8B(), v25.V8H(), 5);
2340 __ uqrshrn2(v25.V16B(), v30.V8H(), 2);
2341 __ uqrshrn2(v21.V4S(), v14.V2D(), 32);
2342 __ uqrshrn2(v13.V8H(), v7.V4S(), 2);
2343 __ uqshl(b13, b0, b23);
2344 __ uqshl(b9, b17, 4);
2345 __ uqshl(d23, d6, d4);
2346 __ uqshl(d8, d11, 44);
2347 __ uqshl(h19, h13, h15);
2348 __ uqshl(h25, h26, 6);
2349 __ uqshl(s4, s24, s10);
2350 __ uqshl(s19, s14, 1);
2351 __ uqshl(v14.V16B(), v30.V16B(), v25.V16B());
2352 __ uqshl(v6.V16B(), v10.V16B(), 5);
2353 __ uqshl(v18.V2D(), v8.V2D(), v7.V2D());
2354 __ uqshl(v25.V2D(), v14.V2D(), 18);
2355 __ uqshl(v25.V2S(), v16.V2S(), v23.V2S());
2356 __ uqshl(v13.V2S(), v15.V2S(), 31);
2357 __ uqshl(v28.V4H(), v24.V4H(), v15.V4H());
2358 __ uqshl(v4.V4H(), v17.V4H(), 1);
2359 __ uqshl(v9.V4S(), v31.V4S(), v23.V4S());
2360 __ uqshl(v18.V4S(), v28.V4S(), 31);
2361 __ uqshl(v31.V8B(), v21.V8B(), v15.V8B());
2362 __ uqshl(v6.V8B(), v21.V8B(), 1);
2363 __ uqshl(v28.V8H(), v2.V8H(), v17.V8H());
2364 __ uqshl(v24.V8H(), v8.V8H(), 14);
2365 __ uqshrn(b21, h27, 7);
2366 __ uqshrn(h28, s26, 11);
2367 __ uqshrn(s13, d31, 17);
2368 __ uqshrn(v21.V2S(), v16.V2D(), 8);
2369 __ uqshrn(v24.V4H(), v24.V4S(), 2);
2370 __ uqshrn(v5.V8B(), v1.V8H(), 8);
2371 __ uqshrn2(v16.V16B(), v29.V8H(), 6);
2372 __ uqshrn2(v2.V4S(), v6.V2D(), 1);
2373 __ uqshrn2(v16.V8H(), v10.V4S(), 14);
2374 __ uqsub(b28, b20, b26);
2375 __ uqsub(d0, d7, d10);
2376 __ uqsub(h26, h24, h7);
2377 __ uqsub(s23, s23, s16);
2378 __ uqsub(v14.V16B(), v16.V16B(), v24.V16B());
2379 __ uqsub(v11.V2D(), v17.V2D(), v6.V2D());
2380 __ uqsub(v10.V2S(), v10.V2S(), v8.V2S());
2381 __ uqsub(v9.V4H(), v15.V4H(), v12.V4H());
2382 __ uqsub(v23.V4S(), v18.V4S(), v7.V4S());
2383 __ uqsub(v9.V8B(), v19.V8B(), v17.V8B());
2384 __ uqsub(v20.V8H(), v2.V8H(), v6.V8H());
2385 __ uqxtn(b29, h19);
2386 __ uqxtn(h0, s13);
2387 __ uqxtn(s26, d22);
2388 __ uqxtn(v5.V2S(), v31.V2D());
2389 __ uqxtn(v30.V4H(), v19.V4S());
2390 __ uqxtn(v15.V8B(), v2.V8H());
2391 __ uqxtn2(v29.V16B(), v3.V8H());
2392 __ uqxtn2(v13.V4S(), v17.V2D());
2393 __ uqxtn2(v28.V8H(), v11.V4S());
2394 __ urecpe(v23.V2S(), v15.V2S());
2395 __ urecpe(v27.V4S(), v7.V4S());
2396 __ urhadd(v2.V16B(), v15.V16B(), v27.V16B());
2397 __ urhadd(v15.V2S(), v1.V2S(), v18.V2S());
2398 __ urhadd(v17.V4H(), v4.V4H(), v26.V4H());
2399 __ urhadd(v2.V4S(), v27.V4S(), v14.V4S());
2400 __ urhadd(v5.V8B(), v17.V8B(), v14.V8B());
2401 __ urhadd(v30.V8H(), v2.V8H(), v25.V8H());
2402 __ urshl(d4, d28, d30);
2403 __ urshl(v13.V16B(), v31.V16B(), v19.V16B());
2404 __ urshl(v14.V2D(), v23.V2D(), v21.V2D());
2405 __ urshl(v10.V2S(), v7.V2S(), v8.V2S());
2406 __ urshl(v15.V4H(), v21.V4H(), v28.V4H());
2407 __ urshl(v30.V4S(), v8.V4S(), v23.V4S());
2408 __ urshl(v31.V8B(), v20.V8B(), v5.V8B());
2409 __ urshl(v30.V8H(), v27.V8H(), v30.V8H());
2410 __ urshr(d4, d13, 49);
2411 __ urshr(v2.V16B(), v20.V16B(), 1);
2412 __ urshr(v13.V2D(), v11.V2D(), 51);
2413 __ urshr(v21.V2S(), v31.V2S(), 10);
2414 __ urshr(v21.V4H(), v17.V4H(), 11);
2415 __ urshr(v4.V4S(), v22.V4S(), 1);
2416 __ urshr(v0.V8B(), v1.V8B(), 7);
2417 __ urshr(v13.V8H(), v20.V8H(), 1);
2418 __ ursqrte(v20.V2S(), v16.V2S());
2419 __ ursqrte(v28.V4S(), v8.V4S());
2420 __ ursra(d27, d16, 45);
2421 __ ursra(v18.V16B(), v17.V16B(), 3);
2422 __ ursra(v26.V2D(), v28.V2D(), 58);
2423 __ ursra(v8.V2S(), v22.V2S(), 31);
2424 __ ursra(v31.V4H(), v4.V4H(), 7);
2425 __ ursra(v31.V4S(), v15.V4S(), 2);
2426 __ ursra(v3.V8B(), v1.V8B(), 5);
2427 __ ursra(v18.V8H(), v14.V8H(), 13);
2428 __ ushl(d31, d0, d16);
2429 __ ushl(v0.V16B(), v6.V16B(), v2.V16B());
2430 __ ushl(v18.V2D(), v1.V2D(), v18.V2D());
2431 __ ushl(v27.V2S(), v7.V2S(), v29.V2S());
2432 __ ushl(v14.V4H(), v14.V4H(), v13.V4H());
2433 __ ushl(v22.V4S(), v4.V4S(), v9.V4S());
2434 __ ushl(v23.V8B(), v22.V8B(), v27.V8B());
2435 __ ushl(v21.V8H(), v25.V8H(), v8.V8H());
2436 __ ushll(v11.V2D(), v0.V2S(), 21);
2437 __ ushll(v2.V4S(), v17.V4H(), 8);
2438 __ ushll(v11.V8H(), v14.V8B(), 1);
2439 __ ushll2(v8.V2D(), v29.V4S(), 7);
2440 __ ushll2(v29.V4S(), v9.V8H(), 2);
2441 __ ushll2(v5.V8H(), v24.V16B(), 6);
2442 __ ushr(d28, d27, 53);
2443 __ ushr(v1.V16B(), v9.V16B(), 7);
2444 __ ushr(v2.V2D(), v24.V2D(), 43);
2445 __ ushr(v30.V2S(), v25.V2S(), 11);
2446 __ ushr(v10.V4H(), v26.V4H(), 12);
2447 __ ushr(v4.V4S(), v5.V4S(), 30);
2448 __ ushr(v30.V8B(), v2.V8B(), 1);
2449 __ ushr(v6.V8H(), v12.V8H(), 2);
2450 __ usqadd(b19, b5);
2451 __ usqadd(d9, d2);
2452 __ usqadd(h2, h16);
2453 __ usqadd(s16, s3);
2454 __ usqadd(v31.V16B(), v29.V16B());
2455 __ usqadd(v8.V2D(), v10.V2D());
2456 __ usqadd(v18.V2S(), v9.V2S());
2457 __ usqadd(v24.V4H(), v14.V4H());
2458 __ usqadd(v10.V4S(), v30.V4S());
2459 __ usqadd(v16.V8B(), v20.V8B());
2460 __ usqadd(v12.V8H(), v16.V8H());
2461 __ usra(d28, d27, 37);
2462 __ usra(v5.V16B(), v22.V16B(), 5);
2463 __ usra(v2.V2D(), v19.V2D(), 33);
2464 __ usra(v0.V2S(), v0.V2S(), 21);
2465 __ usra(v7.V4H(), v6.V4H(), 12);
2466 __ usra(v4.V4S(), v17.V4S(), 9);
2467 __ usra(v9.V8B(), v12.V8B(), 7);
2468 __ usra(v3.V8H(), v27.V8H(), 14);
2469 __ usubl(v29.V2D(), v12.V2S(), v30.V2S());
2470 __ usubl(v29.V4S(), v28.V4H(), v6.V4H());
2471 __ usubl(v12.V8H(), v4.V8B(), v14.V8B());
2472 __ usubl2(v1.V2D(), v24.V4S(), v17.V4S());
2473 __ usubl2(v4.V4S(), v1.V8H(), v3.V8H());
2474 __ usubl2(v23.V8H(), v4.V16B(), v7.V16B());
2475 __ usubw(v9.V2D(), v20.V2D(), v30.V2S());
2476 __ usubw(v20.V4S(), v16.V4S(), v23.V4H());
2477 __ usubw(v25.V8H(), v8.V8H(), v29.V8B());
2478 __ usubw2(v18.V2D(), v29.V2D(), v6.V4S());
2479 __ usubw2(v6.V4S(), v6.V4S(), v20.V8H());
2480 __ usubw2(v18.V8H(), v4.V8H(), v16.V16B());
2481 __ uxtl(v27.V2D(), v21.V2S());
2482 __ uxtl(v0.V4S(), v31.V4H());
2483 __ uxtl(v27.V8H(), v10.V8B());
2484 __ uxtl2(v6.V2D(), v16.V4S());
2485 __ uxtl2(v22.V4S(), v20.V8H());
2486 __ uxtl2(v20.V8H(), v21.V16B());
2487 __ uzp1(v30.V16B(), v9.V16B(), v17.V16B());
2488 __ uzp1(v7.V2D(), v26.V2D(), v28.V2D());
2489 __ uzp1(v26.V2S(), v16.V2S(), v22.V2S());
2490 __ uzp1(v14.V4H(), v19.V4H(), v6.V4H());
2491 __ uzp1(v17.V4S(), v23.V4S(), v30.V4S());
2492 __ uzp1(v28.V8B(), v27.V8B(), v13.V8B());
2493 __ uzp1(v17.V8H(), v1.V8H(), v12.V8H());
2494 __ uzp2(v8.V16B(), v18.V16B(), v26.V16B());
2495 __ uzp2(v21.V2D(), v22.V2D(), v24.V2D());
2496 __ uzp2(v20.V2S(), v21.V2S(), v2.V2S());
2497 __ uzp2(v16.V4H(), v31.V4H(), v6.V4H());
2498 __ uzp2(v25.V4S(), v11.V4S(), v8.V4S());
2499 __ uzp2(v31.V8B(), v31.V8B(), v13.V8B());
2500 __ uzp2(v8.V8H(), v17.V8H(), v1.V8H());
2501 __ xtn(v17.V2S(), v26.V2D());
2502 __ xtn(v3.V4H(), v0.V4S());
2503 __ xtn(v18.V8B(), v8.V8H());
2504 __ xtn2(v0.V16B(), v0.V8H());
2505 __ xtn2(v15.V4S(), v4.V2D());
2506 __ xtn2(v31.V8H(), v18.V4S());
2507 __ zip1(v22.V16B(), v9.V16B(), v6.V16B());
2508 __ zip1(v23.V2D(), v11.V2D(), v2.V2D());
2509 __ zip1(v26.V2S(), v16.V2S(), v9.V2S());
2510 __ zip1(v1.V4H(), v9.V4H(), v7.V4H());
2511 __ zip1(v0.V4S(), v30.V4S(), v20.V4S());
2512 __ zip1(v30.V8B(), v17.V8B(), v15.V8B());
2513 __ zip1(v17.V8H(), v8.V8H(), v2.V8H());
2514 __ zip2(v23.V16B(), v10.V16B(), v11.V16B());
2515 __ zip2(v30.V2D(), v6.V2D(), v14.V2D());
2516 __ zip2(v9.V2S(), v10.V2S(), v21.V2S());
2517 __ zip2(v8.V4H(), v24.V4H(), v29.V4H());
2518 __ zip2(v0.V4S(), v21.V4S(), v23.V4S());
2519 __ zip2(v25.V8B(), v23.V8B(), v30.V8B());
2520 __ zip2(v7.V8H(), v10.V8H(), v30.V8H());
2521 } // NOLINT(readability/fn_size)
2522
2523
// Emits a fixed sequence of NEON floating-point instructions through `masm`.
//
// The instructions are generated inside an ExactAssemblyScope that is given
// all of the bytes remaining in the assembler's buffer, with the
// kMaximumSize policy. This function is called by TraceTestHelper and
// PrintDisassemblerTestHelper, whose output is compared against reference
// files, so adding, removing or reordering instructions here changes the
// expected output.
//
// NOTE(review): `__` is defined outside this block; presumably it expands to
// `masm->` - confirm against the macro definition.
static void GenerateTestSequenceNEONFP(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  // NEON floating point instructions.
  __ fabd(v3.V2D(), v25.V2D(), v8.V2D());
  __ fabd(v14.V2S(), v27.V2S(), v11.V2S());
  __ fabd(v9.V4S(), v22.V4S(), v18.V4S());
  __ fabs(v1.V2D(), v29.V2D());
  __ fabs(v6.V2S(), v21.V2S());
  __ fabs(v12.V4S(), v25.V4S());
  __ facge(v18.V2D(), v5.V2D(), v0.V2D());
  __ facge(v15.V2S(), v11.V2S(), v6.V2S());
  __ facge(v30.V4S(), v10.V4S(), v25.V4S());
  __ facgt(v28.V2D(), v16.V2D(), v31.V2D());
  __ facgt(v15.V2S(), v1.V2S(), v4.V2S());
  __ facgt(v22.V4S(), v3.V4S(), v10.V4S());
  __ fadd(v7.V2D(), v10.V2D(), v24.V2D());
  __ fadd(v10.V2S(), v23.V2S(), v7.V2S());
  __ fadd(v16.V4S(), v22.V4S(), v11.V4S());
  __ faddp(d27, v28.V2D());
  __ faddp(s20, v23.V2S());
  __ faddp(v21.V2D(), v4.V2D(), v11.V2D());
  __ faddp(v31.V2S(), v26.V2S(), v1.V2S());
  __ faddp(v13.V4S(), v27.V4S(), v28.V4S());
  // Comparisons, in both register-register and compare-with-0.0 forms.
  __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D());
  __ fcmeq(v24.V2D(), v16.V2D(), 0.0);
  __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S());
  __ fcmeq(v24.V2S(), v4.V2S(), 0.0);
  __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S());
  __ fcmeq(v26.V4S(), v25.V4S(), 0.0);
  __ fcmge(v27.V2D(), v0.V2D(), v0.V2D());
  __ fcmge(v22.V2D(), v30.V2D(), 0.0);
  __ fcmge(v7.V2S(), v21.V2S(), v25.V2S());
  __ fcmge(v15.V2S(), v15.V2S(), 0.0);
  __ fcmge(v29.V4S(), v4.V4S(), v27.V4S());
  __ fcmge(v22.V4S(), v21.V4S(), 0.0);
  __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D());
  __ fcmgt(v15.V2D(), v23.V2D(), 0.0);
  __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S());
  __ fcmgt(v1.V2S(), v13.V2S(), 0.0);
  __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S());
  __ fcmgt(v13.V4S(), v8.V4S(), 0.0);
  __ fcmle(v4.V2D(), v6.V2D(), 0.0);
  __ fcmle(v24.V2S(), v31.V2S(), 0.0);
  __ fcmle(v8.V4S(), v23.V4S(), 0.0);
  __ fcmlt(v7.V2D(), v3.V2D(), 0.0);
  __ fcmlt(v15.V2S(), v21.V2S(), 0.0);
  __ fcmlt(v1.V4S(), v2.V4S(), 0.0);
  // The fcvt* family. Forms with a trailing integer argument (fcvtzs and
  // fcvtzu) take an extra immediate operand.
  __ fcvtas(v6.V2D(), v8.V2D());
  __ fcvtas(v1.V2S(), v9.V2S());
  __ fcvtas(v8.V4S(), v19.V4S());
  __ fcvtau(v5.V2D(), v31.V2D());
  __ fcvtau(v28.V2S(), v29.V2S());
  __ fcvtau(v11.V4S(), v26.V4S());
  __ fcvtl(v8.V2D(), v25.V2S());
  __ fcvtl(v27.V4S(), v14.V4H());
  __ fcvtl2(v1.V2D(), v6.V4S());
  __ fcvtl2(v24.V4S(), v9.V8H());
  __ fcvtms(v9.V2D(), v24.V2D());
  __ fcvtms(v7.V2S(), v11.V2S());
  __ fcvtms(v23.V4S(), v21.V4S());
  __ fcvtmu(v13.V2D(), v1.V2D());
  __ fcvtmu(v26.V2S(), v12.V2S());
  __ fcvtmu(v21.V4S(), v21.V4S());
  __ fcvtn(v11.V2S(), v1.V2D());
  __ fcvtn(v8.V4H(), v2.V4S());
  __ fcvtn2(v24.V4S(), v29.V2D());
  __ fcvtn2(v4.V8H(), v10.V4S());
  __ fcvtns(v25.V2D(), v10.V2D());
  __ fcvtns(v4.V2S(), v8.V2S());
  __ fcvtns(v29.V4S(), v27.V4S());
  __ fcvtnu(v18.V2D(), v27.V2D());
  __ fcvtnu(v11.V2S(), v14.V2S());
  __ fcvtnu(v27.V4S(), v21.V4S());
  __ fcvtps(v23.V2D(), v5.V2D());
  __ fcvtps(v24.V2S(), v15.V2S());
  __ fcvtps(v5.V4S(), v19.V4S());
  __ fcvtpu(v3.V2D(), v21.V2D());
  __ fcvtpu(v3.V2S(), v21.V2S());
  __ fcvtpu(v0.V4S(), v7.V4S());
  __ fcvtxn(v29.V2S(), v11.V2D());
  __ fcvtxn2(v31.V4S(), v25.V2D());
  __ fcvtzs(v19.V2D(), v17.V2D());
  __ fcvtzs(v12.V2D(), v24.V2D(), 64);
  __ fcvtzs(v9.V2S(), v2.V2S());
  __ fcvtzs(v5.V2S(), v20.V2S(), 29);
  __ fcvtzs(v21.V4S(), v25.V4S());
  __ fcvtzs(v26.V4S(), v1.V4S(), 6);
  __ fcvtzu(v13.V2D(), v25.V2D());
  __ fcvtzu(v28.V2D(), v13.V2D(), 32);
  __ fcvtzu(v26.V2S(), v6.V2S());
  __ fcvtzu(v9.V2S(), v10.V2S(), 15);
  __ fcvtzu(v30.V4S(), v6.V4S());
  __ fcvtzu(v19.V4S(), v22.V4S(), 18);
  __ fdiv(v15.V2D(), v8.V2D(), v15.V2D());
  __ fdiv(v12.V2S(), v9.V2S(), v26.V2S());
  __ fdiv(v19.V4S(), v22.V4S(), v19.V4S());
  __ fmax(v19.V2D(), v7.V2D(), v8.V2D());
  __ fmax(v25.V2S(), v12.V2S(), v29.V2S());
  __ fmax(v6.V4S(), v15.V4S(), v5.V4S());
  __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D());
  __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S());
  __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S());
  __ fmaxnmp(d6, v19.V2D());
  __ fmaxnmp(s27, v26.V2S());
  __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D());
  __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S());
  __ fmaxnmp(v15.V4S(), v11.V4S(), v17.V4S());
  __ fmaxnmv(s27, v19.V4S());
  __ fmaxp(d20, v14.V2D());
  __ fmaxp(s18, v2.V2S());
  __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D());
  __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S());
  __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S());
  __ fmaxv(s31, v29.V4S());
  __ fmin(v2.V2D(), v5.V2D(), v2.V2D());
  __ fmin(v31.V2S(), v17.V2S(), v10.V2S());
  __ fmin(v10.V4S(), v4.V4S(), v16.V4S());
  __ fminnm(v21.V2D(), v6.V2D(), v5.V2D());
  __ fminnm(v22.V2S(), v18.V2S(), v14.V2S());
  __ fminnm(v25.V4S(), v31.V4S(), v3.V4S());
  __ fminnmp(d9, v1.V2D());
  __ fminnmp(s21, v20.V2S());
  __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D());
  __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S());
  __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S());
  __ fminnmv(s3, v4.V4S());
  __ fminp(d24, v26.V2D());
  __ fminp(s7, v17.V2S());
  __ fminp(v23.V2D(), v19.V2D(), v3.V2D());
  __ fminp(v29.V2S(), v21.V2S(), v9.V2S());
  __ fminp(v0.V4S(), v24.V4S(), v21.V4S());
  __ fminv(s25, v8.V4S());
  // Multiply and multiply-accumulate forms. Calls with a trailing integer
  // argument select a lane of the last register operand.
  __ fmla(d23, d0, v9.D(), 1);
  __ fmla(s23, s15, v7.S(), 0);
  __ fmla(v17.V2D(), v11.V2D(), v6.V2D());
  __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0);
  __ fmla(v19.V2S(), v12.V2S(), v6.V2S());
  __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0);
  __ fmla(v16.V4S(), v11.V4S(), v11.V4S());
  __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2);
  __ fmls(d27, d30, v6.D(), 0);
  __ fmls(s21, s16, v2.S(), 0);
  __ fmls(v5.V2D(), v19.V2D(), v21.V2D());
  __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0);
  __ fmls(v5.V2S(), v16.V2S(), v7.V2S());
  __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1);
  __ fmls(v27.V4S(), v5.V4S(), v30.V4S());
  __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3);
  __ fmov(v14.V2D(), -0.34375);
  __ fmov(v26.V2S(), 0.90625f);
  __ fmov(v31.V4S(), -5.0000f);
  __ fmov(v28.D(), 1, x25);
  __ fmov(x18, v2.D(), 1);
  __ fmul(d12, d4, v1.D(), 1);
  __ fmul(s30, s1, v15.S(), 3);
  __ fmul(v25.V2D(), v0.V2D(), v21.V2D());
  __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1);
  __ fmul(v7.V2S(), v24.V2S(), v16.V2S());
  __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2);
  __ fmul(v5.V4S(), v28.V4S(), v25.V4S());
  __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0);
  __ fmulx(d28, d9, v3.D(), 1);
  __ fmulx(s25, s21, v15.S(), 1);
  __ fmulx(v31.V2D(), v28.V2D(), v8.V2D());
  __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0);
  __ fmulx(v9.V2S(), v1.V2S(), v0.V2S());
  __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0);
  __ fmulx(v2.V4S(), v4.V4S(), v5.V4S());
  __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0);
  __ fneg(v1.V2D(), v25.V2D());
  __ fneg(v14.V2S(), v31.V2S());
  __ fneg(v5.V4S(), v4.V4S());
  __ frecpe(v18.V2D(), v12.V2D());
  __ frecpe(v10.V2S(), v22.V2S());
  __ frecpe(v5.V4S(), v6.V4S());
  __ frecps(v22.V2D(), v7.V2D(), v26.V2D());
  __ frecps(v31.V2S(), v27.V2S(), v2.V2S());
  __ frecps(v18.V4S(), v6.V4S(), v27.V4S());
  __ frinta(v26.V2D(), v13.V2D());
  __ frinta(v15.V2S(), v26.V2S());
  __ frinta(v13.V4S(), v16.V4S());
  __ frinti(v9.V2D(), v12.V2D());
  __ frinti(v5.V2S(), v19.V2S());
  __ frinti(v15.V4S(), v11.V4S());
  __ frintm(v17.V2D(), v29.V2D());
  __ frintm(v30.V2S(), v11.V2S());
  __ frintm(v1.V4S(), v20.V4S());
  __ frintn(v24.V2D(), v6.V2D());
  __ frintn(v12.V2S(), v17.V2S());
  __ frintn(v29.V4S(), v11.V4S());
  __ frintp(v10.V2D(), v7.V2D());
  __ frintp(v12.V2S(), v18.V2S());
  __ frintp(v26.V4S(), v31.V4S());
  __ frintx(v24.V2D(), v13.V2D());
  __ frintx(v7.V2S(), v9.V2S());
  __ frintx(v18.V4S(), v21.V4S());
  __ frintz(v19.V2D(), v25.V2D());
  __ frintz(v15.V2S(), v8.V2S());
  __ frintz(v20.V4S(), v3.V4S());
  __ frsqrte(v23.V2D(), v5.V2D());
  __ frsqrte(v9.V2S(), v7.V2S());
  __ frsqrte(v3.V4S(), v9.V4S());
  __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D());
  __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S());
  __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S());
  __ fsqrt(v6.V2D(), v18.V2D());
  __ fsqrt(v6.V2S(), v18.V2S());
  __ fsqrt(v0.V4S(), v31.V4S());
  __ fsub(v31.V2D(), v30.V2D(), v31.V2D());
  __ fsub(v11.V2S(), v8.V2S(), v6.V2S());
  __ fsub(v16.V4S(), v0.V4S(), v31.V4S());
  // scvtf / ucvtf, each with and without a trailing immediate operand.
  __ scvtf(v25.V2D(), v31.V2D());
  __ scvtf(v10.V2D(), v13.V2D(), 45);
  __ scvtf(v10.V2S(), v15.V2S());
  __ scvtf(v18.V2S(), v4.V2S(), 27);
  __ scvtf(v17.V4S(), v5.V4S());
  __ scvtf(v11.V4S(), v25.V4S(), 24);
  __ ucvtf(v9.V2D(), v3.V2D());
  __ ucvtf(v26.V2D(), v30.V2D(), 46);
  __ ucvtf(v11.V2S(), v4.V2S());
  __ ucvtf(v29.V2S(), v3.V2S(), 25);
  __ ucvtf(v22.V4S(), v23.V4S());
  __ ucvtf(v18.V4S(), v9.V4S(), 25);
}
2751
2752
// Emits a fixed sequence of SVE load and store instructions through `masm`.
//
// The instructions are generated inside an ExactAssemblyScope that is given
// all of the bytes remaining in the assembler's buffer, and with
// CPUFeatures::kSVE enabled on `masm` for the duration of the function.
//
// Every access uses x0 as its base register; TraceTestHelper points x0 at a
// scratch buffer before running this code. The output of the tests that call
// this function is compared against reference files, so adding, removing or
// reordering instructions here changes the expected output.
static void GenerateTestSequenceSVE(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);
  CPUFeaturesScope feature_guard(masm, CPUFeatures::kSVE);

  // Simple, unpredicated loads and stores.
  __ str(p12.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p13.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p14.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p15.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p8.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p9.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p10.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p11.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  __ str(z0.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z1.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z2.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z3.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z20.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z21.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z22.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z23.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  // Structured accesses.
  __ st1b(z0.VnB(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  // NOTE(review): this is the only access in this function that uses x3 as the
  // register offset; every other register-offset form uses x2. Confirm that x3
  // is intended here.
  __ st1w(z2.VnS(), p1, SVEMemOperand(x0, x3, LSL, 2));
  __ st1d(z3.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z21.VnH(), p2.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
  __ ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, packed accesses.
  __ st1b(z2.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z3.VnS(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z4.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z0.VnS(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnD(), p1, SVEMemOperand(x0, x2, LSL, 1));
  __ st1w(z2.VnD(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnH(), p1.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1b(z21.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z22.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z23.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z24.VnD(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1w(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z21.VnH(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z23.VnD(), p2.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1sh(z24.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sh(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sw(z21.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, interleaved accesses.
  // Two-register forms.
  __ st2b(z0.VnB(), z1.VnB(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2h(z1.VnH(), z2.VnH(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2w(z2.VnS(), z3.VnS(), p3, SVEMemOperand(x0, x2, LSL, 2));
  __ st2d(z3.VnD(), z4.VnD(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), SVEMemOperand(x0, x2));
  __ ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));

  // Three-register forms.
  __ st3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, SVEMemOperand(x0, x2, LSL, 3));
  __ ld3b(z24.VnB(),
          z25.VnB(),
          z26.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3h(z25.VnH(),
          z26.VnH(),
          z27.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 1));
  __ ld3w(z26.VnS(),
          z27.VnS(),
          z28.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3d(z27.VnD(),
          z28.VnD(),
          z29.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Four-register forms. The first store's register list runs z31, z0, z1,
  // z2, i.e. it wraps around the end of the register file.
  __ st4b(z31.VnB(),
          z0.VnB(),
          z1.VnB(),
          z2.VnB(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4h(z0.VnH(),
          z1.VnH(),
          z2.VnH(),
          z3.VnH(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4w(z1.VnS(),
          z2.VnS(),
          z3.VnS(),
          z4.VnS(),
          p3,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4d(z2.VnD(),
          z3.VnD(),
          z4.VnD(),
          z5.VnD(),
          p4,
          SVEMemOperand(x0, x2, LSL, 3));
  __ ld4b(z25.VnB(),
          z26.VnB(),
          z27.VnB(),
          z28.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4h(z26.VnH(),
          z27.VnH(),
          z28.VnH(),
          z29.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4w(z27.VnS(),
          z28.VnS(),
          z29.VnS(),
          z30.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 2));
  __ ld4d(z28.VnD(),
          z29.VnD(),
          z30.VnD(),
          z31.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
}
2892
// Emits a fixed sequence of atomic memory instructions through `masm`.
//
// The instructions are generated inside an ExactAssemblyScope that is given
// all of the bytes remaining in the assembler's buffer, and with
// CPUFeatures::kAtomics enabled on `masm` for the duration of the function.
//
// All accesses target 16 bytes claimed on the stack at the start of the
// sequence and released again at the end; the first 8 of those bytes are
// initialised to 0x5555555555555555 before any atomic operation runs.
static void GenerateTestSequenceAtomics(MacroAssembler* masm) {
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);
  CPUFeaturesScope feature_guard(masm, CPUFeatures::kAtomics);
  __ sub(sp, sp, 16);  // Claim some working space on the stack.
  __ mov(x0, 0x5555555555555555);
  __ str(x0, MemOperand(sp));  // Initialise working space.

// For a given operation name OP, emit every `ld<OP>` and `st<OP>` variant used
// by this test: the b and h suffixed forms and the unsuffixed form with W
// registers, and the unsuffixed form with X registers. The loads additionally
// cover the `a`, `l` and `al` suffixes, and the stores the `l` suffix. Every
// access addresses the working space at [sp].
#define INST_LIST(OP)                       \
  __ ld##OP##b(w0, w0, MemOperand(sp));     \
  __ ld##OP##ab(w0, w1, MemOperand(sp));    \
  __ ld##OP##lb(w0, w2, MemOperand(sp));    \
  __ ld##OP##alb(w0, w3, MemOperand(sp));   \
  __ ld##OP##h(w0, w0, MemOperand(sp));     \
  __ ld##OP##ah(w0, w1, MemOperand(sp));    \
  __ ld##OP##lh(w0, w2, MemOperand(sp));    \
  __ ld##OP##alh(w0, w3, MemOperand(sp));   \
  __ ld##OP(w0, w0, MemOperand(sp));        \
  __ ld##OP##a(w0, w1, MemOperand(sp));     \
  __ ld##OP##l(w0, w2, MemOperand(sp));     \
  __ ld##OP##al(w0, w3, MemOperand(sp));    \
  __ ld##OP(x0, x0, MemOperand(sp));        \
  __ ld##OP##a(x0, x1, MemOperand(sp));     \
  __ ld##OP##l(x0, x2, MemOperand(sp));     \
  __ ld##OP##al(x0, x3, MemOperand(sp));    \
  __ st##OP##b(w0, MemOperand(sp));         \
  __ st##OP##lb(w0, MemOperand(sp));        \
  __ st##OP##h(w0, MemOperand(sp));         \
  __ st##OP##lh(w0, MemOperand(sp));        \
  __ st##OP(w0, MemOperand(sp));            \
  __ st##OP##l(w0, MemOperand(sp));         \
  __ st##OP(x0, MemOperand(sp));            \
  __ st##OP##l(x0, MemOperand(sp));

  // One expansion per atomic operation.
  INST_LIST(add);
  INST_LIST(set);
  INST_LIST(eor);
  INST_LIST(smin);
  INST_LIST(smax);
  INST_LIST(umin);
  INST_LIST(umax);
  INST_LIST(clr);

#undef INST_LIST

  __ add(sp, sp, 16);  // Restore stack pointer.
}
2941
// Rewrites the trace file at path `trace` in place, replacing hexadecimal
// values that change from run to run (addresses) with a fixed
// "~~~~~~~~~~~~~~~~" placeholder so that the trace can be compared against a
// reference file.
//
// Every line is passed through each pattern in turn. The text captured by
// group 1 (which identifies what is being masked) is kept, and the hexadecimal
// digits that follow it are replaced. Every line written back is terminated
// with "\n", including the last one.
//
// If the file cannot be opened for reading, a message is printed to stderr and
// the file is left untouched.
static void MaskAddresses(const char* trace) {
#define VIXL_COLOUR "(\x1b\\[[01];([0-9][0-9])?m)?"
  // All patterns are replaced with "$1~~~~~~~~~~~~~~~~".
  const std::regex patterns[] =
      {// Mask registers that hold addresses that change from run to run.
       std::regex("((x0|x1|x2|sp): " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask accessed memory addresses.
       std::regex("((<-|->) " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask instruction addresses.
       std::regex("^(0x)[0-9a-f]{16}"),
       // Mask branch targets.
       std::regex("(Branch" VIXL_COLOUR " to 0x)[0-9a-f]{16}"),
       // Mask explicit address annotations.
       std::regex("(addr 0x)[0-9a-f]+")};
#undef VIXL_COLOUR

  std::ifstream in(trace);
  if (!in.is_open()) {
    // Looping on `!in.eof()` would never terminate for a stream that failed to
    // open, so bail out explicitly.
    fprintf(stderr, "MaskAddresses: unable to open '%s'\n", trace);
    return;
  }

  // Use the result of `getline` as the loop condition: this stops on
  // end-of-file and on read errors, and does not produce a spurious empty line
  // after a terminal "\n".
  std::vector<std::string> lines;
  std::string line;
  while (std::getline(in, line)) {
    for (const std::regex& pattern : patterns) {
      line = std::regex_replace(line, pattern, "$1~~~~~~~~~~~~~~~~");
    }
    lines.push_back(line);
  }
  in.close();

  std::ofstream out(trace, std::ofstream::trunc);
  for (const std::string& masked : lines) {
    out << masked << "\n";
  }
}
2978
// Copies the contents of the file called `name` to stdout.
//
// If the file cannot be opened, the reason is reported on stderr and nothing
// is printed.
static void PrintFile(const char* name) {
  FILE* file = fopen(name, "r");
  if (file == NULL) {
    // Passing a NULL stream to fgets/fclose is undefined behaviour.
    perror(name);
    return;
  }
  char buffer[1024];  // The buffer size is arbitrary.
  while (fgets(buffer, sizeof(buffer), file) != NULL) fputs(buffer, stdout);
  fclose(file);
}
2985
CheckOrGenerateTrace(const char* filename, const char* ref_file)2986 static bool CheckOrGenerateTrace(const char* filename, const char* ref_file) {
2987 bool trace_matched_reference;
2988 if (Test::generate_test_trace()) {
2989 // Copy trace_stream to stdout.
2990 FILE* trace_stream = fopen(filename, "r");
2991 VIXL_ASSERT(trace_stream != NULL);
2992 fseek(trace_stream, 0, SEEK_SET);
2993 int c;
2994 while (1) {
2995 c = getc(trace_stream);
2996 if (c == EOF) break;
2997 putc(c, stdout);
2998 }
2999 fclose(trace_stream);
3000 trace_matched_reference = true;
3001 } else {
3002 // Check trace_stream against ref_file.
3003 char command[1024];
3004 size_t length =
3005 snprintf(command, sizeof(command), "diff -u %s %s", ref_file, filename);
3006 VIXL_CHECK(length < sizeof(command));
3007 trace_matched_reference = (system(command) == 0);
3008 }
3009 return trace_matched_reference;
3010 }
3011
3012
3013 // Trace tests can only work with the simulator.
3014 #ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
3015
3016 static void TraceTestHelper(bool coloured_trace,
3017 TraceParameters trace_parameters,
3018 const char* ref_file) {
3019 MacroAssembler masm(12 * KBytes);
3020
3021 char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3022 FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3023
3024 Decoder decoder;
3025 Simulator simulator(&decoder, trace_stream);
3026 simulator.SetColouredTrace(coloured_trace);
3027 simulator.SetTraceParameters(trace_parameters);
3028 simulator.SilenceExclusiveAccessWarning();
3029
3030 const int vl_in_bytes = 5 * kZRegMinSizeInBytes;
3031 const int vl_in_bits = vl_in_bytes * kBitsPerByte;
3032 const int pl_in_bits = vl_in_bits / kZRegBitsPerPRegBit;
3033 simulator.SetVectorLengthInBits(vl_in_bits);
3034
3035 // Set up a scratch buffer so we can test loads and stores.
3036 const int kScratchSize = vl_in_bytes * 1024;
3037 const int kScratchGuardSize = vl_in_bytes;
3038 char scratch_buffer[kScratchSize + kScratchGuardSize];
3039 for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0]));
3040 i++) {
3041 scratch_buffer[i] = i & 0xff;
3042 }
3043 // Used for offset addressing.
3044 simulator.WriteXRegister(0, reinterpret_cast<uintptr_t>(scratch_buffer));
3045 // Used for pre-/post-index addressing.
3046 simulator.WriteXRegister(1, reinterpret_cast<uintptr_t>(scratch_buffer));
3047
3048 const int kPostIndexRegisterStep = 13; // Arbitrary interesting value.
3049 // Used for post-index offsets.
3050 simulator.WriteXRegister(2, kPostIndexRegisterStep);
3051
3052 // Initialize the other registers with unique values.
3053 uint64_t initial_base_u64 = 0x0100001000100101;
3054 for (unsigned i = 3; i < kNumberOfRegisters; i++) {
3055 if (i == kLinkRegCode) continue;
3056 if (i == kZeroRegCode) continue;
3057 // NoRegLog suppresses the log now, but the registers will still be logged
3058 // before the first instruction is executed since they have been written but
3059 // not printed.
3060 simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog);
3061 }
3062 for (unsigned r = 0; r < kNumberOfVRegisters; r++) {
3063 LogicVRegister reg(simulator.ReadVRegister(r));
3064 // Try to initialise Z registers with reasonable FP values. We prioritise
3065 // setting double values, then floats and half-precision values. The lanes
3066 // overlap, so this is a compromise, but d0, s0 and h0 views all see similar
3067 // arithmetic values.
3068 //
3069 // The exponent of each value is set to the (biased) register number. We set
3070 // the double, float and half-precision exponents where we can.
3071 uint64_t base = 0x3ff000003f803c00 + (0x0010000000800400 * (0x7f + r));
3072 for (unsigned lane = 0; lane < (vl_in_bytes / kDRegSizeInBytes); lane++) {
3073 uint64_t mantissas = 0x0000000100010001 * (lane & 0x7f);
3074 reg.SetUint(kFormatVnD, lane, base | mantissas);
3075 }
3076 }
3077 for (unsigned r = 0; r < kNumberOfPRegisters; r++) {
3078 LogicPRegister reg(simulator.ReadPRegister(r));
3079 // Set `r` active lanes between each inactive lane.
3080 for (unsigned bit = 0; bit < pl_in_bits; bit++) {
3081 reg.SetActive(kFormatVnB, bit, ((bit + 1) % (r + 2)) != 0);
3082 }
3083 // Completely clear some Q-sized blocks. The trace will completely omit
3084 // these for stores.
3085 for (unsigned chunk = 0; chunk < (vl_in_bits / kQRegSize); chunk++) {
3086 if (((chunk + 1) % (r + 2)) == 0) {
3087 reg.SetActiveMask(chunk, static_cast<uint16_t>(0));
3088 }
3089 }
3090 }
3091
3092 GenerateTestSequenceBase(&masm);
3093 GenerateTestSequenceFP(&masm);
3094 GenerateTestSequenceNEON(&masm);
3095 GenerateTestSequenceNEONFP(&masm);
3096 GenerateTestSequenceSVE(&masm);
3097 GenerateTestSequenceAtomics(&masm);
3098 masm.Ret();
3099 masm.FinalizeCode();
3100
3101 if (Test::disassemble()) {
3102 PrintDisassembler disasm(stdout);
3103 Instruction* start = masm.GetBuffer()->GetStartAddress<Instruction*>();
3104 Instruction* end = masm.GetBuffer()->GetEndAddress<Instruction*>();
3105 disasm.DisassembleBuffer(start, end);
3106 }
3107
3108 simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>());
3109
3110 fclose(trace_stream);
3111
3112 // We already traced into the temporary file, so just print the file.
3113 // Note that these tests need to control the trace flags, so we ignore all
3114 // --trace-* options here except for --trace-sim.
3115 if (Test::trace_sim()) PrintFile(trace_stream_filename);
3116
3117 MaskAddresses(trace_stream_filename);
3118
3119 bool trace_matched_reference =
3120 CheckOrGenerateTrace(trace_stream_filename, ref_file);
3121 remove(trace_stream_filename); // Clean up before checking the result.
3122 VIXL_CHECK(trace_matched_reference);
3123
3124 uint64_t offset_base = simulator.ReadRegister<uint64_t>(0);
3125 uint64_t index_base = simulator.ReadRegister<uint64_t>(1);
3126
3127 VIXL_CHECK(index_base >= offset_base);
3128 VIXL_CHECK((index_base - offset_base) <= kScratchSize);
3129 }
3130
3131
// Test individual options.
// Each test runs TraceTestHelper without colour, with a single trace option
// enabled, and compares the trace against the named reference file.
TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); }
TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); }
TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); }
TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); }
TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); }
// NOTE(review): `branch` passes LOG_WRITE, exactly like `write` above, although
// it is checked against the "log-branch" reference. A branch-specific trace
// option may have been intended - confirm. Changing the option would also
// require regenerating the reference file.
TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); }

// Test standard combinations.
TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); }
TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); }
TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); }
3144
3145
// Test individual options (with colour).
// These mirror the uncoloured tests, but enable coloured trace output and use
// the "-colour" reference files.
TEST(disasm_colour) {
  TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour"));
}
TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); }
TEST(vregs_colour) {
  TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour"));
}
TEST(sysregs_colour) {
  TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour"));
}
TEST(write_colour) {
  TraceTestHelper(true, LOG_WRITE, REF("log-write-colour"));
}
// NOTE(review): `branch_colour` passes LOG_WRITE, exactly like `write_colour`
// above, although it is checked against the "log-branch-colour" reference. A
// branch-specific trace option may have been intended - confirm. Changing the
// option would also require regenerating the reference file.
TEST(branch_colour) {
  TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour"));
}

// Test standard combinations (with colour).
TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); }
TEST(state_colour) {
  TraceTestHelper(true, LOG_STATE, REF("log-state-colour"));
}
TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); }
3170
3171 #endif // VIXL_INCLUDE_SIMULATOR_AARCH64
3172
3173 static void PrintDisassemblerTestHelper(const char* prefix,
3174 const char* suffix,
3175 const char* ref_file) {
3176 MacroAssembler masm(12 * KBytes);
3177
3178 char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3179 FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3180
3181 // We don't need to execute this code so there's no need for the execution
3182 // environment setup from TraceTestHelper.
3183
3184 GenerateTestSequenceBase(&masm);
3185 GenerateTestSequenceFP(&masm);
3186 GenerateTestSequenceNEON(&masm);
3187 GenerateTestSequenceNEONFP(&masm);
3188 GenerateTestSequenceSVE(&masm);
3189 GenerateTestSequenceAtomics(&masm);
3190 masm.FinalizeCode();
3191
3192 Decoder decoder;
3193 CPUFeaturesAuditor auditor(&decoder);
3194 PrintDisassembler disasm(trace_stream);
3195 if (prefix != NULL) disasm.SetCPUFeaturesPrefix(prefix);
3196 if (suffix != NULL) disasm.SetCPUFeaturesSuffix(suffix);
3197 disasm.RegisterCPUFeaturesAuditor(&auditor);
3198 decoder.AppendVisitor(&disasm);
3199
3200 Instruction* instruction = masm.GetBuffer()->GetStartAddress<Instruction*>();
3201 Instruction* end = masm.GetCursorAddress<Instruction*>();
3202 while (instruction != end) {
3203 decoder.Decode(instruction);
3204 instruction += kInstructionSize;
3205 }
3206
3207 fclose(trace_stream);
3208
3209 // We already disassembled into the temporary file, so just print the file.
3210 if (Test::disassemble()) PrintFile(trace_stream_filename);
3211
3212 MaskAddresses(trace_stream_filename);
3213
3214 bool trace_matched_reference =
3215 CheckOrGenerateTrace(trace_stream_filename, ref_file);
3216 remove(trace_stream_filename); // Clean up before checking the result.
3217 VIXL_CHECK(trace_matched_reference);
3218 }
3219
3220
// Test CPUFeatures disassembly annotations.
// Passing NULL for the prefix or suffix means the corresponding setter is not
// called on the disassembler.
TEST(cpufeatures) {
  PrintDisassemblerTestHelper(NULL, NULL, REF("log-cpufeatures"));
}
// Custom textual prefix and suffix around the annotations.
TEST(cpufeatures_custom) {
  PrintDisassemblerTestHelper("### {", "} ###", REF("log-cpufeatures-custom"));
}
// Escape sequences as prefix and suffix, to colour the annotations.
TEST(cpufeatures_colour) {
  // The colour chosen is arbitrary.
  PrintDisassemblerTestHelper("\033[1;35m",  // Prefix: Bold magenta.
                              "\033[0;m",    // Suffix: Reset colour.
                              REF("log-cpufeatures-colour"));
}
3234 } // namespace aarch64
3235 } // namespace vixl
3236