1// Copyright 2016, VIXL authors
2// All rights reserved.
3//
4// Redistribution and use in source and binary forms, with or without
5// modification, are permitted provided that the following conditions are met:
6//
7//   * Redistributions of source code must retain the above copyright notice,
8//     this list of conditions and the following disclaimer.
9//   * Redistributions in binary form must reproduce the above copyright notice,
10//     this list of conditions and the following disclaimer in the documentation
11//     and/or other materials provided with the distribution.
12//   * Neither the name of ARM Limited nor the names of its contributors may be
13//     used to endorse or promote products derived from this software without
14//     specific prior written permission.
15//
16// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS CONTRIBUTORS "AS IS" AND
17// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
20// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26
27#include <cfloat>
28#include <cmath>
29#include <cstdio>
30#include <cstdlib>
31#include <cstring>
32#include <fstream>
33#include <regex>
34
35#include "test-runner.h"
36
37#include "aarch64/cpu-aarch64.h"
38#include "aarch64/disasm-aarch64.h"
39#include "aarch64/macro-assembler-aarch64.h"
40#include "aarch64/simulator-aarch64.h"
41#include "test-utils-aarch64.h"
42
43namespace vixl {
44namespace aarch64 {
45
// Shorthand used by the generators below: `__ insn(...)` expands to
// `masm->insn(...)`, so a variable named `masm` must be in scope wherever it is
// used.
46#define __ masm->
// Declares a test whose name is prefixed with `TRACE_` by token-pasting onto
// TEST_. TEST_ is not defined in this file (presumably it comes from
// test-runner.h - confirm).
47#define TEST(name) TEST_(TRACE_##name)
48
// Prepends the trace reference directory to `name`. This relies on adjacent
// string literal concatenation, so `name` has to be a string literal.
49#define REF(name) "test/test-trace-reference/" name
50
// Emits a fixed sequence of integer, system and load/store instructions
// through `masm`.
//
// The main body is listed alphabetically by mnemonic and is followed by a set
// of xzr/sp load-store regression cases and two small branch tests. In the main
// body x0 and x1 are used only as address registers: x0 for every access
// without writeback (plain, offset, exclusive, acquire/release, dc/ic, prfm)
// and x1 for every pre-/post-indexed access.
//
// NOTE(review): nothing here initialises x0, x1 or sp, so the caller presumably
// points them at valid memory before the sequence is executed - confirm.
// NOTE(review): the order of the emitted instructions is presumably what the
// reference traces under REF(...) are compared against, so statements should
// not be reordered or removed without regenerating them - confirm.
51static void GenerateTestSequenceBase(MacroAssembler* masm) {
  // Open an ExactAssemblyScope spanning all the bytes left in the buffer, with
  // the kMaximumSize policy, for the lifetime of this function.
  // NOTE(review): presumably this is what allows the lower-case assembler
  // methods below to be called directly on a MacroAssembler - confirm.
52  ExactAssemblyScope guard(masm,
53                           masm->GetBuffer()->GetRemainingBytes(),
54                           ExactAssemblyScope::kMaximumSize);
55
  // Main body: one W-register and one X-register form of each instruction
  // where both are accepted, with several condition codes for the conditional
  // instructions.
56  __ adc(w3, w4, w5);
57  __ adc(x6, x7, x8);
58  __ adcs(w9, w10, w11);
59  __ adcs(x12, x13, x14);
60  __ add(w15, w16, w17);
61  __ add(x18, x19, x20);
62  __ adds(w21, w22, w23);
63  __ adds(x24, x25, x26);
64  __ and_(w27, w28, w29);
65  __ and_(x2, x3, x4);
66  __ ands(w5, w6, w7);
67  __ ands(x8, x9, x10);
68  __ asr(w11, w12, 0);
69  __ asr(x13, x14, 1);
70  __ asrv(w15, w16, w17);
71  __ asrv(x18, x19, x20);
72  __ bfm(w21, w22, 5, 6);
73  __ bfm(x23, x24, 7, 8);
74  __ bic(w25, w26, w27);
75  __ bic(x28, x29, x2);
76  __ bics(w3, w4, w5);
77  __ bics(x6, x7, x8);
78  __ ccmn(w9, w10, NoFlag, al);
79  __ ccmn(w9, w10, NoFlag, eq);
80  __ ccmn(w9, w10, NoFlag, ne);
81  __ ccmn(x11, x12, CFlag, al);
82  __ ccmn(x11, x12, CFlag, cc);
83  __ ccmn(x11, x12, CFlag, cs);
84  __ ccmp(w13, w14, VFlag, al);
85  __ ccmp(w13, w14, VFlag, hi);
86  __ ccmp(w13, w14, VFlag, ls);
87  __ ccmp(x15, x16, CVFlag, al);
88  __ ccmp(x15, x16, CVFlag, eq);
89  __ ccmp(x15, x16, CVFlag, ne);
90  __ cinc(w17, w18, cc);
91  __ cinc(w17, w18, cs);
92  __ cinc(x19, x20, hi);
93  __ cinc(x19, x20, ls);
94  __ cinv(w21, w22, eq);
95  __ cinv(w21, w22, ne);
96  __ cinv(x23, x24, cc);
97  __ cinv(x23, x24, cs);
98  __ clrex();
99  __ cls(w25, w26);
100  __ cls(x27, x28);
101  __ clz(w29, w2);
102  __ clz(x3, x4);
103  __ cmn(w5, w6);
104  __ cmn(x7, x8);
105  __ cmp(w9, w10);
106  __ cmp(x11, x12);
107  __ cneg(w13, w14, hi);
108  __ cneg(w13, w14, ls);
109  __ cneg(x15, x16, eq);
110  __ cneg(x15, x16, ne);
111  __ crc32b(w17, w18, w19);
112  __ crc32cb(w20, w21, w22);
113  __ crc32ch(w23, w24, w25);
114  __ crc32cw(w26, w27, w28);
115  __ crc32h(w4, w5, w6);
116  __ crc32w(w7, w8, w9);
117  __ csel(w13, w14, w15, cc);
118  __ csel(w13, w14, w15, cs);
119  __ csel(x16, x17, x18, hi);
120  __ csel(x16, x17, x18, ls);
121  __ cset(w19, eq);
122  __ cset(w19, ne);
123  __ cset(x20, cc);
124  __ cset(x20, cs);
125  __ csetm(w21, hi);
126  __ csetm(w21, ls);
127  __ csetm(x22, eq);
128  __ csetm(x22, ne);
129  __ csinc(w23, w24, w25, cc);
130  __ csinc(w23, w24, w25, cs);
131  __ csinc(x26, x27, x28, hi);
132  __ csinc(x26, x27, x28, ls);
133  __ csinv(w29, w2, w3, eq);
134  __ csinv(w29, w2, w3, ne);
135  __ csinv(x4, x5, x6, cc);
136  __ csinv(x4, x5, x6, cs);
137  __ csneg(w7, w8, w9, hi);
138  __ csneg(w7, w8, w9, ls);
139  __ csneg(x10, x11, x12, eq);
140  __ csneg(x10, x11, x12, ne);
141  __ dc(CVAC, x0);
142  __ dmb(InnerShareable, BarrierAll);
143  __ dsb(InnerShareable, BarrierAll);
144  __ eon(w13, w14, w15);
145  __ eon(x16, x17, x18);
146  __ eor(w19, w20, w21);
147  __ eor(x22, x23, x24);
148  __ extr(w25, w26, w27, 9);
149  __ extr(x28, x29, x2, 10);
150  __ hint(NOP);
151  __ ic(IVAU, x0);
152  __ isb();
  // Loads: x0 is the base for accesses that leave the base untouched, x1 is
  // the base for the PreIndex/PostIndex forms.
153  __ ldar(w3, MemOperand(x0));
154  __ ldar(x4, MemOperand(x0));
155  __ ldarb(w5, MemOperand(x0));
156  __ ldarb(x6, MemOperand(x0));
157  __ ldarh(w7, MemOperand(x0));
158  __ ldarh(x8, MemOperand(x0));
159  __ ldaxp(w9, w10, MemOperand(x0));
160  __ ldaxp(x11, x12, MemOperand(x0));
161  __ ldaxr(w13, MemOperand(x0));
162  __ ldaxr(x14, MemOperand(x0));
163  __ ldaxrb(w15, MemOperand(x0));
164  __ ldaxrb(x16, MemOperand(x0));
165  __ ldaxrh(w17, MemOperand(x0));
166  __ ldaxrh(x18, MemOperand(x0));
167  __ ldnp(w19, w20, MemOperand(x0));
168  __ ldnp(x21, x22, MemOperand(x0));
169  __ ldp(w23, w24, MemOperand(x0));
170  __ ldp(w23, w24, MemOperand(x1, 8, PostIndex));
171  __ ldp(w23, w24, MemOperand(x1, 8, PreIndex));
172  __ ldp(x25, x26, MemOperand(x0));
173  __ ldp(x25, x26, MemOperand(x1, 16, PostIndex));
174  __ ldp(x25, x26, MemOperand(x1, 16, PreIndex));
175  __ ldpsw(x27, x28, MemOperand(x0));
176  __ ldpsw(x27, x28, MemOperand(x1, 8, PostIndex));
177  __ ldpsw(x27, x28, MemOperand(x1, 8, PreIndex));
178  __ ldr(w29, MemOperand(x0));
179  __ ldr(w29, MemOperand(x1, 4, PostIndex));
180  __ ldr(w29, MemOperand(x1, 4, PreIndex));
181  __ ldr(x2, MemOperand(x0));
182  __ ldr(x2, MemOperand(x1, 8, PostIndex));
183  __ ldr(x2, MemOperand(x1, 8, PreIndex));
184  __ ldrb(w3, MemOperand(x0));
185  __ ldrb(w3, MemOperand(x1, 1, PostIndex));
186  __ ldrb(w3, MemOperand(x1, 1, PreIndex));
187  __ ldrb(x4, MemOperand(x0));
188  __ ldrb(x4, MemOperand(x1, 1, PostIndex));
189  __ ldrb(x4, MemOperand(x1, 1, PreIndex));
190  __ ldrh(w5, MemOperand(x0));
191  __ ldrh(w5, MemOperand(x1, 2, PostIndex));
192  __ ldrh(w5, MemOperand(x1, 2, PreIndex));
193  __ ldrh(x6, MemOperand(x0));
194  __ ldrh(x6, MemOperand(x1, 2, PostIndex));
195  __ ldrh(x6, MemOperand(x1, 2, PreIndex));
196  __ ldrsb(w7, MemOperand(x0));
197  __ ldrsb(w7, MemOperand(x1, 1, PostIndex));
198  __ ldrsb(w7, MemOperand(x1, 1, PreIndex));
199  __ ldrsb(x8, MemOperand(x0));
200  __ ldrsb(x8, MemOperand(x1, 1, PostIndex));
201  __ ldrsb(x8, MemOperand(x1, 1, PreIndex));
202  __ ldrsh(w9, MemOperand(x0));
203  __ ldrsh(w9, MemOperand(x1, 2, PostIndex));
204  __ ldrsh(w9, MemOperand(x1, 2, PreIndex));
205  __ ldrsh(x10, MemOperand(x0));
206  __ ldrsh(x10, MemOperand(x1, 2, PostIndex));
207  __ ldrsh(x10, MemOperand(x1, 2, PreIndex));
208  __ ldrsw(x11, MemOperand(x0));
209  __ ldrsw(x11, MemOperand(x1, 4, PostIndex));
210  __ ldrsw(x11, MemOperand(x1, 4, PreIndex));
211  __ ldur(w12, MemOperand(x0, 7));
212  __ ldur(x13, MemOperand(x0, 15));
213  __ ldurb(w14, MemOperand(x0, 1));
214  __ ldurb(x15, MemOperand(x0, 1));
215  __ ldurh(w16, MemOperand(x0, 3));
216  __ ldurh(x17, MemOperand(x0, 3));
217  __ ldursb(w18, MemOperand(x0, 1));
218  __ ldursb(x19, MemOperand(x0, 1));
219  __ ldursh(w20, MemOperand(x0, 3));
220  __ ldursh(x21, MemOperand(x0, 3));
221  __ ldursw(x22, MemOperand(x0, 7));
222  __ ldxp(w23, w24, MemOperand(x0));
223  __ ldxp(x25, x26, MemOperand(x0));
224  __ ldxr(w27, MemOperand(x0));
225  __ ldxr(x28, MemOperand(x0));
226  __ ldxrb(w29, MemOperand(x0));
227  __ ldxrb(x2, MemOperand(x0));
228  __ ldxrh(w3, MemOperand(x0));
229  __ ldxrh(x4, MemOperand(x0));
230  __ lsl(w5, w6, 2);
231  __ lsl(x7, x8, 3);
232  __ lslv(w9, w10, w11);
233  __ lslv(x12, x13, x14);
234  __ lsr(w15, w16, 4);
235  __ lsr(x17, x18, 5);
236  __ lsrv(w19, w20, w21);
237  __ lsrv(x22, x23, x24);
238  __ madd(w25, w26, w27, w28);
239  __ madd(x29, x2, x3, x4);
240  __ mneg(w5, w6, w7);
241  __ mneg(x8, x9, x10);
242  __ mov(w11, w12);
243  __ mov(x13, x14);
244  __ movk(w15, 130);
245  __ movk(x16, 131);
246  __ movn(w17, 132);
247  __ movn(x18, 133);
248  __ movz(w19, 134);
249  __ movz(x20, 135);
250  __ msub(w22, w23, w24, w25);
251  __ msub(x26, x27, x28, x29);
252  __ mul(w2, w3, w4);
253  __ mul(x5, x6, x7);
254  __ mvn(w8, w9);
255  __ mvn(x10, x11);
256  __ neg(w12, w13);
257  __ neg(x14, x15);
258  __ negs(w16, w17);
259  __ negs(x18, x19);
260  __ ngc(w20, w21);
261  __ ngc(x22, x23);
262  __ ngcs(w24, w25);
263  __ ngcs(x26, x27);
264  __ nop();
265  __ orn(w28, w29, w2);
266  __ orn(x3, x4, x5);
267  __ orr(w6, w7, w8);
268  __ orr(x9, x10, x11);
269  __ prfm(PLDL1KEEP, MemOperand(x0, 4));
270  __ prfum(PLDL1KEEP, MemOperand(x0, 1));
271  __ rbit(w12, w13);
272  __ rbit(x14, x15);
273  __ rev(w16, w17);
274  __ rev(x18, x19);
275  __ rev16(w20, w21);
276  __ rev16(x22, x23);
277  __ rev32(x24, x25);
278  __ rorv(w26, w27, w28);
279  __ rorv(x29, x2, x3);
280  __ sbc(w4, w5, w6);
281  __ sbc(x7, x8, x9);
282  __ sbcs(w10, w11, w12);
283  __ sbcs(x13, x14, x15);
284  __ sbfiz(w16, w17, 2, 3);
285  __ sbfiz(x18, x19, 4, 5);
286  __ sbfx(w22, w23, 6, 7);
287  __ sbfx(x24, x25, 8, 9);
288  __ sdiv(w26, w27, w28);
289  __ sdiv(x29, x2, x3);
290  __ smulh(x12, x13, x14);
  // Stores: same base-register convention as the loads above (x0 without
  // writeback, x1 with PreIndex/PostIndex).
291  __ stlr(w18, MemOperand(x0));
292  __ stlr(x19, MemOperand(x0));
293  __ stlrb(w20, MemOperand(x0));
294  __ stlrb(x21, MemOperand(x0));
295  __ stlrh(w22, MemOperand(x0));
296  __ stlrh(x23, MemOperand(x0));
297  __ stlxp(w24, w25, w26, MemOperand(x0));
298  __ stlxp(x27, x28, x29, MemOperand(x0));
299  __ stlxr(w2, w3, MemOperand(x0));
300  __ stlxr(x4, x5, MemOperand(x0));
301  __ stlxrb(w6, w7, MemOperand(x0));
302  __ stlxrb(x8, x9, MemOperand(x0));
303  __ stlxrh(w10, w11, MemOperand(x0));
304  __ stlxrh(x12, x13, MemOperand(x0));
305  __ stnp(w14, w15, MemOperand(x0));
306  __ stnp(x16, x17, MemOperand(x0));
307  __ stp(w18, w19, MemOperand(x0));
308  __ stp(w18, w19, MemOperand(x1, 8, PostIndex));
309  __ stp(w18, w19, MemOperand(x1, 8, PreIndex));
310  __ stp(x20, x21, MemOperand(x0));
311  __ stp(x20, x21, MemOperand(x1, 16, PostIndex));
312  __ stp(x20, x21, MemOperand(x1, 16, PreIndex));
313  __ str(w22, MemOperand(x0));
314  __ str(w22, MemOperand(x1, 4, PostIndex));
315  __ str(w22, MemOperand(x1, 4, PreIndex));
316  __ str(x23, MemOperand(x0));
317  __ str(x23, MemOperand(x1, 8, PostIndex));
318  __ str(x23, MemOperand(x1, 8, PreIndex));
319  __ strb(w24, MemOperand(x0));
320  __ strb(w24, MemOperand(x1, 1, PostIndex));
321  __ strb(w24, MemOperand(x1, 1, PreIndex));
322  __ strb(x25, MemOperand(x0));
323  __ strb(x25, MemOperand(x1, 1, PostIndex));
324  __ strb(x25, MemOperand(x1, 1, PreIndex));
325  __ strh(w26, MemOperand(x0));
326  __ strh(w26, MemOperand(x1, 2, PostIndex));
327  __ strh(w26, MemOperand(x1, 2, PreIndex));
328  __ strh(x27, MemOperand(x0));
329  __ strh(x27, MemOperand(x1, 2, PostIndex));
330  __ strh(x27, MemOperand(x1, 2, PreIndex));
331  __ stur(w28, MemOperand(x0, 7));
332  __ stur(x29, MemOperand(x0, 15));
333  __ sturb(w2, MemOperand(x0, 1));
334  __ sturb(x3, MemOperand(x0, 1));
335  __ sturh(w4, MemOperand(x0, 3));
336  __ sturh(x5, MemOperand(x0, 3));
337  __ stxp(w6, w7, w8, MemOperand(x0));
338  __ stxp(x9, x10, x11, MemOperand(x0));
339  __ stxr(w12, w13, MemOperand(x0));
340  __ stxr(x14, x15, MemOperand(x0));
341  __ stxrb(w16, w17, MemOperand(x0));
342  __ stxrb(x18, x19, MemOperand(x0));
343  __ stxrh(w20, w21, MemOperand(x0));
344  __ stxrh(x22, x23, MemOperand(x0));
345  __ sub(w24, w25, w26);
346  __ sub(x27, x28, x29);
347  __ subs(w2, w3, w4);
348  __ subs(x5, x6, x7);
349  __ sxtb(w8, w9);
350  __ sxtb(x10, x11);
351  __ sxth(w12, w13);
352  __ sxth(x14, x15);
353  __ sxtw(w16, w17);
354  __ sxtw(x18, x19);
355  __ tst(w20, w21);
356  __ tst(x22, x23);
357  __ ubfiz(w24, w25, 10, 11);
358  __ ubfiz(x26, x27, 12, 13);
359  __ ubfm(w28, w29, 14, 15);
360  __ ubfm(x2, x3, 1, 2);
361  __ ubfx(w4, w5, 3, 4);
362  __ ubfx(x6, x7, 5, 6);
363  __ udiv(w8, w9, w10);
364  __ udiv(x11, x12, x13);
365  __ umulh(x22, x23, x24);
366  __ uxtb(w28, w29);
367  __ uxtb(x2, x3);
368  __ uxth(w4, w5);
369  __ uxth(x6, x7);
370  __ uxtw(w8, w9);
371  __ uxtw(x10, x11);
372
  // Each pair below stores 16 bytes below sp with a pre-indexed store and then
  // reads them back with a post-indexed load of +16, so sp is left unchanged
  // overall. All of them transfer xzr with sp as the base register.
  // NOTE(review): the original failures these cases guard against are not
  // recorded here.
373  // Regression tests.
374  __ stp(x10, xzr, MemOperand(sp, -16, PreIndex));
375  __ ldp(x10, xzr, MemOperand(sp, 16, PostIndex));
376  __ str(xzr, MemOperand(sp, -16, PreIndex));
377  __ ldrsb(xzr, MemOperand(sp, 16, PostIndex));
378  __ str(xzr, MemOperand(sp, -16, PreIndex));
379  __ ldrsh(xzr, MemOperand(sp, 16, PostIndex));
380  __ str(xzr, MemOperand(sp, -16, PreIndex));
381  __ ldrsw(xzr, MemOperand(sp, 16, PostIndex));
382
383  // Branch tests.
384  {
385    Label end;
386    // Branch to the next instruction.
387    __ b(&end);
388    __ bind(&end);
389  }
390  {
391    Label loop, end;
    // x3 - x3 is zero, so this sets the Z flag and the first `b.ne` below
    // falls through.
392    __ subs(x3, x3, x3);
393    __ bind(&loop);
394    // Not-taken branch (the first time).
395    // Taken branch (the second time).
396    __ b(&end, ne);
    // x3 is zero here, so comparing it with 1 clears Z; after looping back the
    // `b.ne` is taken and the loop exits.
397    __ cmp(x3, 1);
398    // Backwards branch.
399    __ b(&loop);
400    __ bind(&end);
401  }
402}
403
404
// Emits a fixed sequence of scalar floating-point instructions through `masm`,
// listed alphabetically by mnemonic.
//
// The sequence covers arithmetic, comparisons (register and 0.0 forms),
// conditional compare/select, fcvt between h, s and d registers, conversions
// between FP and integer values (into FP registers as well as W/X registers),
// rounding, reciprocal and square-root estimate/step instructions, and fmov
// between FP registers, general-purpose registers and immediates. No memory
// operands are used, so unlike the base sequence it does not depend on any
// address register being set up.
//
// NOTE(review): the order of the emitted instructions is presumably what the
// reference traces under REF(...) are compared against, so statements should
// not be reordered or removed without regenerating them - confirm.
405static void GenerateTestSequenceFP(MacroAssembler* masm) {
  // Open an ExactAssemblyScope spanning all the bytes left in the buffer, with
  // the kMaximumSize policy, for the lifetime of this function.
  // NOTE(review): presumably this is what allows the lower-case assembler
  // methods below to be called directly on a MacroAssembler - confirm.
406  ExactAssemblyScope guard(masm,
407                           masm->GetBuffer()->GetRemainingBytes(),
408                           ExactAssemblyScope::kMaximumSize);
409
410  // Scalar floating point instructions.
411  __ fabd(d13, d2, d19);
412  __ fabd(s8, s10, s30);
413  __ fabs(d1, d1);
414  __ fabs(s25, s7);
415  __ facge(d1, d23, d16);
416  __ facge(s4, s17, s1);
417  __ facgt(d2, d21, d24);
418  __ facgt(s12, s26, s12);
419  __ fadd(d13, d11, d22);
420  __ fadd(s27, s19, s8);
421  __ fccmp(d6, d10, NoFlag, hs);
422  __ fccmp(s29, s20, NZVFlag, ne);
423  __ fccmpe(d10, d2, NZCFlag, al);
424  __ fccmpe(s3, s3, NZVFlag, pl);
425  __ fcmeq(d19, d8, d10);
426  __ fcmeq(d0, d18, 0.0);
427  __ fcmeq(s1, s4, s30);
428  __ fcmeq(s22, s29, 0.0);
429  __ fcmge(d27, d18, d1);
430  __ fcmge(d31, d28, 0.0);
431  __ fcmge(s31, s19, s9);
432  __ fcmge(s1, s25, 0.0);
433  __ fcmgt(d18, d1, d15);
434  __ fcmgt(d3, d31, 0.0);
435  __ fcmgt(s11, s25, s2);
436  __ fcmgt(s17, s16, 0.0);
437  __ fcmle(d24, d17, 0.0);
438  __ fcmle(s11, s8, 0.0);
439  __ fcmlt(d5, d31, 0.0);
440  __ fcmlt(s18, s23, 0.0);
441  __ fcmp(d10, d24);
442  __ fcmp(d13, 0.0);
443  __ fcmp(s18, s6);
444  __ fcmp(s16, 0.0);
445  __ fcmpe(d9, d17);
446  __ fcmpe(d29, 0.0);
447  __ fcmpe(s16, s17);
448  __ fcmpe(s22, 0.0);
449  __ fcsel(d10, d14, d19, gt);
450  __ fcsel(s22, s18, s2, ge);
451  __ fcvt(d4, h24);
452  __ fcvt(d11, s2);
453  __ fcvt(h8, d9);
454  __ fcvt(h12, s1);
455  __ fcvt(s12, d31);
456  __ fcvt(s27, h25);
  // FP-to-integer conversions: each mnemonic is emitted with D and S
  // destinations as well as W and X destinations.
457  __ fcvtas(d28, d16);
458  __ fcvtas(s3, s5);
459  __ fcvtas(w18, d31);
460  __ fcvtas(w29, s24);
461  __ fcvtas(x9, d1);
462  __ fcvtas(x30, s2);
463  __ fcvtau(d14, d0);
464  __ fcvtau(s31, s14);
465  __ fcvtau(w16, d2);
466  __ fcvtau(w18, s0);
467  __ fcvtau(x26, d7);
468  __ fcvtau(x25, s19);
469  __ fcvtms(d30, d25);
470  __ fcvtms(s12, s15);
471  __ fcvtms(w9, d7);
472  __ fcvtms(w19, s6);
473  __ fcvtms(x6, d6);
474  __ fcvtms(x22, s7);
475  __ fcvtmu(d27, d0);
476  __ fcvtmu(s8, s22);
477  __ fcvtmu(w29, d19);
478  __ fcvtmu(w26, s0);
479  __ fcvtmu(x13, d5);
480  __ fcvtmu(x5, s18);
481  __ fcvtns(d30, d15);
482  __ fcvtns(s10, s11);
483  __ fcvtns(w21, d15);
484  __ fcvtns(w18, s10);
485  __ fcvtns(x8, d17);
486  __ fcvtns(x17, s12);
487  __ fcvtnu(d0, d21);
488  __ fcvtnu(s6, s25);
489  __ fcvtnu(w29, d11);
490  __ fcvtnu(w25, s31);
491  __ fcvtnu(x30, d11);
492  __ fcvtnu(x27, s18);
493  __ fcvtps(d11, d22);
494  __ fcvtps(s29, s20);
495  __ fcvtps(w15, d25);
496  __ fcvtps(w16, s7);
497  __ fcvtps(x13, d20);
498  __ fcvtps(x3, s23);
499  __ fcvtpu(d24, d1);
500  __ fcvtpu(s14, s24);
501  __ fcvtpu(w26, d29);
502  __ fcvtpu(wzr, s26);
503  __ fcvtpu(x27, d6);
504  __ fcvtpu(x29, s14);
505  __ fcvtxn(s12, d12);
  // fcvtzs/fcvtzu are emitted both with and without a trailing integer
  // operand (presumably the number of fractional bits - confirm against the
  // assembler declaration).
506  __ fcvtzs(d15, d0);
507  __ fcvtzs(d13, d4, 42);
508  __ fcvtzs(s8, s11);
509  __ fcvtzs(s31, s6, 25);
510  __ fcvtzs(w6, d9);
511  __ fcvtzs(w25, d10, 20);
512  __ fcvtzs(w9, s1);
513  __ fcvtzs(w17, s29, 30);
514  __ fcvtzs(x19, d2);
515  __ fcvtzs(x22, d14, 1);
516  __ fcvtzs(x14, s20);
517  __ fcvtzs(x3, s30, 33);
518  __ fcvtzu(d28, d15);
519  __ fcvtzu(d0, d4, 3);
520  __ fcvtzu(s2, s5);
521  __ fcvtzu(s4, s0, 30);
522  __ fcvtzu(w11, d4);
523  __ fcvtzu(w7, d24, 32);
524  __ fcvtzu(w18, s24);
525  __ fcvtzu(w14, s27, 4);
526  __ fcvtzu(x22, d11);
527  __ fcvtzu(x8, d27, 52);
528  __ fcvtzu(x7, s20);
529  __ fcvtzu(x22, s7, 44);
530  __ fdiv(d6, d14, d15);
531  __ fdiv(s26, s5, s25);
532  __ fmadd(d18, d26, d12, d30);
533  __ fmadd(s13, s9, s28, s4);
534  __ fmax(d12, d5, d5);
535  __ fmax(s12, s28, s6);
536  __ fmaxnm(d28, d4, d2);
537  __ fmaxnm(s6, s10, s8);
538  __ fmin(d20, d20, d18);
539  __ fmin(s7, s13, s16);
540  __ fminnm(d19, d14, d30);
541  __ fminnm(s0, s1, s1);
542  __ fmov(d13, d6);
543  __ fmov(d2, x17);
544  __ fmov(d8, -2.5000);
545  __ fmov(s5, s3);
546  __ fmov(s25, w20);
547  __ fmov(s21, 2.8750f);
548  __ fmov(w18, s24);
549  __ fmov(x18, d2);
550  __ fmsub(d20, d30, d3, d19);
551  __ fmsub(s5, s19, s4, s12);
552  __ fmul(d30, d27, d23);
553  __ fmul(s25, s17, s15);
554  __ fmulx(d4, d17, d1);
555  __ fmulx(s14, s25, s4);
556  __ fneg(d15, d0);
557  __ fneg(s14, s15);
558  __ fnmadd(d0, d16, d22, d31);
559  __ fnmadd(s0, s18, s26, s18);
560  __ fnmsub(d19, d12, d15, d21);
561  __ fnmsub(s29, s0, s11, s26);
562  __ fnmul(d31, d19, d1);
563  __ fnmul(s18, s3, s17);
564  __ frecpe(d7, d21);
565  __ frecpe(s29, s17);
566  __ frecps(d11, d26, d17);
567  __ frecps(s18, s27, s1);
568  __ frecpx(d15, d18);
569  __ frecpx(s5, s10);
570  __ frinta(d16, d30);
571  __ frinta(s1, s22);
572  __ frinti(d19, d29);
573  __ frinti(s14, s21);
574  __ frintm(d20, d30);
575  __ frintm(s1, s16);
576  __ frintn(d30, d1);
577  __ frintn(s24, s10);
578  __ frintp(d4, d20);
579  __ frintp(s13, s3);
580  __ frintx(d13, d20);
581  __ frintx(s17, s7);
582  __ frintz(d0, d8);
583  __ frintz(s15, s29);
584  __ frsqrte(d21, d10);
585  __ frsqrte(s17, s25);
586  __ frsqrts(d4, d29, d17);
587  __ frsqrts(s14, s3, s24);
588  __ fsqrt(d14, d17);
589  __ fsqrt(s4, s14);
590  __ fsub(d13, d19, d7);
591  __ fsub(s3, s21, s27);
  // Integer-to-FP conversions: FP, W and X sources, each with and without the
  // trailing integer operand.
592  __ scvtf(d31, d16);
593  __ scvtf(d26, d31, 24);
594  __ scvtf(d6, w16);
595  __ scvtf(d5, w20, 6);
596  __ scvtf(d16, x8);
597  __ scvtf(d15, x8, 10);
598  __ scvtf(s7, s4);
599  __ scvtf(s8, s15, 14);
600  __ scvtf(s29, w10);
601  __ scvtf(s15, w21, 11);
602  __ scvtf(s27, x26);
603  __ scvtf(s26, x12, 38);
604  __ ucvtf(d0, d9);
605  __ ucvtf(d5, d22, 47);
606  __ ucvtf(d30, w27);
607  __ ucvtf(d3, w19, 1);
608  __ ucvtf(d28, x21);
609  __ ucvtf(d27, x30, 35);
610  __ ucvtf(s11, s5);
611  __ ucvtf(s0, s23, 14);
612  __ ucvtf(s20, w19);
613  __ ucvtf(s21, w22, 18);
614  __ ucvtf(s6, x13);
615  __ ucvtf(s7, x2, 21);
616}
617
618
619static void GenerateTestSequenceNEON(MacroAssembler* masm) {
620  ExactAssemblyScope guard(masm,
621                           masm->GetBuffer()->GetRemainingBytes(),
622                           ExactAssemblyScope::kMaximumSize);
623
624  // NEON integer instructions.
625  __ abs(d19, d0);
626  __ abs(v16.V16B(), v11.V16B());
627  __ abs(v0.V2D(), v31.V2D());
628  __ abs(v27.V2S(), v25.V2S());
629  __ abs(v21.V4H(), v27.V4H());
630  __ abs(v16.V4S(), v1.V4S());
631  __ abs(v31.V8B(), v5.V8B());
632  __ abs(v29.V8H(), v13.V8H());
633  __ add(d10, d5, d17);
634  __ add(v31.V16B(), v15.V16B(), v23.V16B());
635  __ add(v10.V2D(), v31.V2D(), v14.V2D());
636  __ add(v15.V2S(), v14.V2S(), v19.V2S());
637  __ add(v27.V4H(), v23.V4H(), v17.V4H());
638  __ add(v25.V4S(), v28.V4S(), v29.V4S());
639  __ add(v13.V8B(), v7.V8B(), v18.V8B());
640  __ add(v4.V8H(), v2.V8H(), v1.V8H());
641  __ addhn(v10.V2S(), v14.V2D(), v15.V2D());
642  __ addhn(v10.V4H(), v30.V4S(), v26.V4S());
643  __ addhn(v31.V8B(), v12.V8H(), v22.V8H());
644  __ addhn2(v16.V16B(), v21.V8H(), v20.V8H());
645  __ addhn2(v0.V4S(), v2.V2D(), v17.V2D());
646  __ addhn2(v31.V8H(), v7.V4S(), v17.V4S());
647  __ addp(d14, v19.V2D());
648  __ addp(v3.V16B(), v8.V16B(), v28.V16B());
649  __ addp(v8.V2D(), v5.V2D(), v17.V2D());
650  __ addp(v22.V2S(), v30.V2S(), v26.V2S());
651  __ addp(v29.V4H(), v24.V4H(), v14.V4H());
652  __ addp(v30.V4S(), v26.V4S(), v24.V4S());
653  __ addp(v12.V8B(), v26.V8B(), v7.V8B());
654  __ addp(v17.V8H(), v8.V8H(), v12.V8H());
655  __ addv(b27, v23.V16B());
656  __ addv(b12, v20.V8B());
657  __ addv(h27, v30.V4H());
658  __ addv(h19, v14.V8H());
659  __ addv(s14, v27.V4S());
660  __ and_(v10.V16B(), v8.V16B(), v27.V16B());
661  __ and_(v5.V8B(), v1.V8B(), v16.V8B());
662  __ bic(v26.V16B(), v3.V16B(), v24.V16B());
663  __ bic(v7.V2S(), 0xe4, 16);
664  __ bic(v28.V4H(), 0x23, 8);
665  __ bic(v29.V4S(), 0xac);
666  __ bic(v12.V8B(), v31.V8B(), v21.V8B());
667  __ bic(v18.V8H(), 0x98);
668  __ bif(v12.V16B(), v26.V16B(), v8.V16B());
669  __ bif(v2.V8B(), v23.V8B(), v27.V8B());
670  __ bit(v8.V16B(), v3.V16B(), v13.V16B());
671  __ bit(v5.V8B(), v5.V8B(), v23.V8B());
672  __ bsl(v9.V16B(), v31.V16B(), v23.V16B());
673  __ bsl(v14.V8B(), v7.V8B(), v3.V8B());
674  __ cls(v29.V16B(), v5.V16B());
675  __ cls(v21.V2S(), v0.V2S());
676  __ cls(v1.V4H(), v12.V4H());
677  __ cls(v27.V4S(), v10.V4S());
678  __ cls(v19.V8B(), v4.V8B());
679  __ cls(v15.V8H(), v14.V8H());
680  __ clz(v1.V16B(), v4.V16B());
681  __ clz(v27.V2S(), v17.V2S());
682  __ clz(v9.V4H(), v9.V4H());
683  __ clz(v31.V4S(), v15.V4S());
684  __ clz(v14.V8B(), v19.V8B());
685  __ clz(v6.V8H(), v11.V8H());
686  __ cmeq(d18, d5, d29);
687  __ cmeq(d14, d31, 0);
688  __ cmeq(v19.V16B(), v3.V16B(), v22.V16B());
689  __ cmeq(v15.V16B(), v9.V16B(), 0);
690  __ cmeq(v12.V2D(), v16.V2D(), v10.V2D());
691  __ cmeq(v8.V2D(), v22.V2D(), 0);
692  __ cmeq(v2.V2S(), v3.V2S(), v9.V2S());
693  __ cmeq(v16.V2S(), v25.V2S(), 0);
694  __ cmeq(v6.V4H(), v23.V4H(), v20.V4H());
695  __ cmeq(v16.V4H(), v13.V4H(), 0);
696  __ cmeq(v21.V4S(), v17.V4S(), v2.V4S());
697  __ cmeq(v6.V4S(), v25.V4S(), 0);
698  __ cmeq(v16.V8B(), v13.V8B(), v2.V8B());
699  __ cmeq(v21.V8B(), v16.V8B(), 0);
700  __ cmeq(v20.V8H(), v7.V8H(), v25.V8H());
701  __ cmeq(v26.V8H(), v8.V8H(), 0);
702  __ cmge(d16, d13, d31);
703  __ cmge(d25, d24, 0);
704  __ cmge(v17.V16B(), v19.V16B(), v17.V16B());
705  __ cmge(v22.V16B(), v30.V16B(), 0);
706  __ cmge(v28.V2D(), v20.V2D(), v26.V2D());
707  __ cmge(v6.V2D(), v23.V2D(), 0);
708  __ cmge(v25.V2S(), v22.V2S(), v3.V2S());
709  __ cmge(v21.V2S(), v11.V2S(), 0);
710  __ cmge(v16.V4H(), v3.V4H(), v12.V4H());
711  __ cmge(v23.V4H(), v9.V4H(), 0);
712  __ cmge(v7.V4S(), v2.V4S(), v11.V4S());
713  __ cmge(v0.V4S(), v22.V4S(), 0);
714  __ cmge(v10.V8B(), v30.V8B(), v9.V8B());
715  __ cmge(v21.V8B(), v8.V8B(), 0);
716  __ cmge(v2.V8H(), v7.V8H(), v26.V8H());
717  __ cmge(v19.V8H(), v10.V8H(), 0);
718  __ cmgt(d6, d13, d1);
719  __ cmgt(d30, d24, 0);
720  __ cmgt(v20.V16B(), v25.V16B(), v27.V16B());
721  __ cmgt(v0.V16B(), v25.V16B(), 0);
722  __ cmgt(v22.V2D(), v25.V2D(), v1.V2D());
723  __ cmgt(v16.V2D(), v16.V2D(), 0);
724  __ cmgt(v5.V2S(), v9.V2S(), v15.V2S());
725  __ cmgt(v12.V2S(), v18.V2S(), 0);
726  __ cmgt(v28.V4H(), v18.V4H(), v11.V4H());
727  __ cmgt(v22.V4H(), v3.V4H(), 0);
728  __ cmgt(v5.V4S(), v11.V4S(), v27.V4S());
729  __ cmgt(v13.V4S(), v20.V4S(), 0);
730  __ cmgt(v27.V8B(), v31.V8B(), v7.V8B());
731  __ cmgt(v5.V8B(), v0.V8B(), 0);
732  __ cmgt(v22.V8H(), v28.V8H(), v13.V8H());
733  __ cmgt(v6.V8H(), v2.V8H(), 0);
734  __ cmhi(d21, d8, d22);
735  __ cmhi(v18.V16B(), v19.V16B(), v19.V16B());
736  __ cmhi(v7.V2D(), v0.V2D(), v21.V2D());
737  __ cmhi(v15.V2S(), v19.V2S(), v0.V2S());
738  __ cmhi(v31.V4H(), v7.V4H(), v12.V4H());
739  __ cmhi(v9.V4S(), v16.V4S(), v22.V4S());
740  __ cmhi(v7.V8B(), v24.V8B(), v28.V8B());
741  __ cmhi(v11.V8H(), v10.V8H(), v25.V8H());
742  __ cmhs(d1, d12, d17);
743  __ cmhs(v21.V16B(), v25.V16B(), v30.V16B());
744  __ cmhs(v8.V2D(), v2.V2D(), v26.V2D());
745  __ cmhs(v1.V2S(), v22.V2S(), v29.V2S());
746  __ cmhs(v26.V4H(), v30.V4H(), v30.V4H());
747  __ cmhs(v19.V4S(), v20.V4S(), v16.V4S());
748  __ cmhs(v1.V8B(), v3.V8B(), v26.V8B());
749  __ cmhs(v20.V8H(), v28.V8H(), v8.V8H());
750  __ cmle(d30, d24, 0);
751  __ cmle(v0.V16B(), v3.V16B(), 0);
752  __ cmle(v2.V2D(), v30.V2D(), 0);
753  __ cmle(v7.V2S(), v10.V2S(), 0);
754  __ cmle(v9.V4H(), v31.V4H(), 0);
755  __ cmle(v9.V4S(), v18.V4S(), 0);
756  __ cmle(v21.V8B(), v31.V8B(), 0);
757  __ cmle(v29.V8H(), v21.V8H(), 0);
758  __ cmlt(d25, d23, 0);
759  __ cmlt(v7.V16B(), v21.V16B(), 0);
760  __ cmlt(v7.V2D(), v30.V2D(), 0);
761  __ cmlt(v25.V2S(), v28.V2S(), 0);
762  __ cmlt(v0.V4H(), v11.V4H(), 0);
763  __ cmlt(v24.V4S(), v5.V4S(), 0);
764  __ cmlt(v26.V8B(), v11.V8B(), 0);
765  __ cmlt(v1.V8H(), v21.V8H(), 0);
766  __ cmtst(d28, d23, d30);
767  __ cmtst(v26.V16B(), v6.V16B(), v31.V16B());
768  __ cmtst(v1.V2D(), v21.V2D(), v4.V2D());
769  __ cmtst(v27.V2S(), v26.V2S(), v20.V2S());
770  __ cmtst(v26.V4H(), v0.V4H(), v18.V4H());
771  __ cmtst(v25.V4S(), v16.V4S(), v4.V4S());
772  __ cmtst(v11.V8B(), v10.V8B(), v9.V8B());
773  __ cmtst(v0.V8H(), v2.V8H(), v1.V8H());
774  __ cnt(v25.V16B(), v15.V16B());
775  __ cnt(v28.V8B(), v6.V8B());
776  __ dup(v6.V16B(), v7.B(), 7);
777  __ dup(v9.V16B(), w20);
778  __ dup(v12.V2D(), v13.D(), 1);
779  __ dup(v9.V2D(), xzr);
780  __ dup(v4.V2S(), v26.S(), 2);
781  __ dup(v3.V2S(), w12);
782  __ dup(v22.V4H(), v5.H(), 7);
783  __ dup(v16.V4H(), w25);
784  __ dup(v20.V4S(), v10.S(), 2);
785  __ dup(v10.V4S(), w7);
786  __ dup(v30.V8B(), v30.B(), 2);
787  __ dup(v31.V8B(), w15);
788  __ dup(v28.V8H(), v17.H(), 4);
789  __ dup(v2.V8H(), w3);
790  __ eor(v29.V16B(), v25.V16B(), v3.V16B());
791  __ eor(v3.V8B(), v16.V8B(), v28.V8B());
792  __ ext(v1.V16B(), v26.V16B(), v6.V16B(), 1);
793  __ ext(v2.V8B(), v30.V8B(), v1.V8B(), 1);
794  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
795  __ ld1(v23.V16B(),
796         v24.V16B(),
797         v25.V16B(),
798         v26.V16B(),
799         MemOperand(x1, x2, PostIndex));
800  __ ld1(v5.V16B(),
801         v6.V16B(),
802         v7.V16B(),
803         v8.V16B(),
804         MemOperand(x1, 64, PostIndex));
805  __ ld1(v18.V16B(), v19.V16B(), v20.V16B(), MemOperand(x0));
806  __ ld1(v13.V16B(), v14.V16B(), v15.V16B(), MemOperand(x1, x2, PostIndex));
807  __ ld1(v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x1, 48, PostIndex));
808  __ ld1(v17.V16B(), v18.V16B(), MemOperand(x0));
809  __ ld1(v20.V16B(), v21.V16B(), MemOperand(x1, x2, PostIndex));
810  __ ld1(v28.V16B(), v29.V16B(), MemOperand(x1, 32, PostIndex));
811  __ ld1(v29.V16B(), MemOperand(x0));
812  __ ld1(v21.V16B(), MemOperand(x1, x2, PostIndex));
813  __ ld1(v4.V16B(), MemOperand(x1, 16, PostIndex));
814  __ ld1(v4.V1D(), v5.V1D(), v6.V1D(), v7.V1D(), MemOperand(x0));
815  __ ld1(v17.V1D(),
816         v18.V1D(),
817         v19.V1D(),
818         v20.V1D(),
819         MemOperand(x1, x2, PostIndex));
820  __ ld1(v28.V1D(),
821         v29.V1D(),
822         v30.V1D(),
823         v31.V1D(),
824         MemOperand(x1, 32, PostIndex));
825  __ ld1(v20.V1D(), v21.V1D(), v22.V1D(), MemOperand(x0));
826  __ ld1(v19.V1D(), v20.V1D(), v21.V1D(), MemOperand(x1, x2, PostIndex));
827  __ ld1(v12.V1D(), v13.V1D(), v14.V1D(), MemOperand(x1, 24, PostIndex));
828  __ ld1(v29.V1D(), v30.V1D(), MemOperand(x0));
829  __ ld1(v31.V1D(), v0.V1D(), MemOperand(x1, x2, PostIndex));
830  __ ld1(v3.V1D(), v4.V1D(), MemOperand(x1, 16, PostIndex));
831  __ ld1(v28.V1D(), MemOperand(x0));
832  __ ld1(v11.V1D(), MemOperand(x1, x2, PostIndex));
833  __ ld1(v29.V1D(), MemOperand(x1, 8, PostIndex));
834  __ ld1(v28.V2D(), v29.V2D(), v30.V2D(), v31.V2D(), MemOperand(x0));
835  __ ld1(v8.V2D(),
836         v9.V2D(),
837         v10.V2D(),
838         v11.V2D(),
839         MemOperand(x1, x2, PostIndex));
840  __ ld1(v14.V2D(),
841         v15.V2D(),
842         v16.V2D(),
843         v17.V2D(),
844         MemOperand(x1, 64, PostIndex));
845  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x0));
846  __ ld1(v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
847  __ ld1(v26.V2D(), v27.V2D(), v28.V2D(), MemOperand(x1, 48, PostIndex));
848  __ ld1(v18.V2D(), v19.V2D(), MemOperand(x0));
849  __ ld1(v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
850  __ ld1(v17.V2D(), v18.V2D(), MemOperand(x1, 32, PostIndex));
851  __ ld1(v5.V2D(), MemOperand(x0));
852  __ ld1(v6.V2D(), MemOperand(x1, x2, PostIndex));
853  __ ld1(v15.V2D(), MemOperand(x1, 16, PostIndex));
854  __ ld1(v30.V2S(), v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x0));
855  __ ld1(v24.V2S(),
856         v25.V2S(),
857         v26.V2S(),
858         v27.V2S(),
859         MemOperand(x1, x2, PostIndex));
860  __ ld1(v27.V2S(),
861         v28.V2S(),
862         v29.V2S(),
863         v30.V2S(),
864         MemOperand(x1, 32, PostIndex));
865  __ ld1(v11.V2S(), v12.V2S(), v13.V2S(), MemOperand(x0));
866  __ ld1(v8.V2S(), v9.V2S(), v10.V2S(), MemOperand(x1, x2, PostIndex));
867  __ ld1(v31.V2S(), v0.V2S(), v1.V2S(), MemOperand(x1, 24, PostIndex));
868  __ ld1(v0.V2S(), v1.V2S(), MemOperand(x0));
869  __ ld1(v13.V2S(), v14.V2S(), MemOperand(x1, x2, PostIndex));
870  __ ld1(v3.V2S(), v4.V2S(), MemOperand(x1, 16, PostIndex));
871  __ ld1(v26.V2S(), MemOperand(x0));
872  __ ld1(v0.V2S(), MemOperand(x1, x2, PostIndex));
873  __ ld1(v11.V2S(), MemOperand(x1, 8, PostIndex));
874  __ ld1(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
875  __ ld1(v24.V4H(),
876         v25.V4H(),
877         v26.V4H(),
878         v27.V4H(),
879         MemOperand(x1, x2, PostIndex));
880  __ ld1(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
881  __ ld1(v30.V4H(), v31.V4H(), v0.V4H(), MemOperand(x0));
882  __ ld1(v25.V4H(), v26.V4H(), v27.V4H(), MemOperand(x1, x2, PostIndex));
883  __ ld1(v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 24, PostIndex));
884  __ ld1(v3.V4H(), v4.V4H(), MemOperand(x0));
885  __ ld1(v3.V4H(), v4.V4H(), MemOperand(x1, x2, PostIndex));
886  __ ld1(v23.V4H(), v24.V4H(), MemOperand(x1, 16, PostIndex));
887  __ ld1(v26.V4H(), MemOperand(x0));
888  __ ld1(v1.V4H(), MemOperand(x1, x2, PostIndex));
889  __ ld1(v14.V4H(), MemOperand(x1, 8, PostIndex));
890  __ ld1(v26.V4S(), v27.V4S(), v28.V4S(), v29.V4S(), MemOperand(x0));
891  __ ld1(v28.V4S(),
892         v29.V4S(),
893         v30.V4S(),
894         v31.V4S(),
895         MemOperand(x1, x2, PostIndex));
896  __ ld1(v4.V4S(), v5.V4S(), v6.V4S(), v7.V4S(), MemOperand(x1, 64, PostIndex));
897  __ ld1(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
898  __ ld1(v22.V4S(), v23.V4S(), v24.V4S(), MemOperand(x1, x2, PostIndex));
899  __ ld1(v15.V4S(), v16.V4S(), v17.V4S(), MemOperand(x1, 48, PostIndex));
900  __ ld1(v20.V4S(), v21.V4S(), MemOperand(x0));
901  __ ld1(v30.V4S(), v31.V4S(), MemOperand(x1, x2, PostIndex));
902  __ ld1(v11.V4S(), v12.V4S(), MemOperand(x1, 32, PostIndex));
903  __ ld1(v15.V4S(), MemOperand(x0));
904  __ ld1(v12.V4S(), MemOperand(x1, x2, PostIndex));
905  __ ld1(v0.V4S(), MemOperand(x1, 16, PostIndex));
906  __ ld1(v17.V8B(), v18.V8B(), v19.V8B(), v20.V8B(), MemOperand(x0));
907  __ ld1(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, x2, PostIndex));
908  __ ld1(v9.V8B(),
909         v10.V8B(),
910         v11.V8B(),
911         v12.V8B(),
912         MemOperand(x1, 32, PostIndex));
913  __ ld1(v4.V8B(), v5.V8B(), v6.V8B(), MemOperand(x0));
914  __ ld1(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x1, x2, PostIndex));
915  __ ld1(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
916  __ ld1(v10.V8B(), v11.V8B(), MemOperand(x0));
917  __ ld1(v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
918  __ ld1(v27.V8B(), v28.V8B(), MemOperand(x1, 16, PostIndex));
919  __ ld1(v31.V8B(), MemOperand(x0));
920  __ ld1(v10.V8B(), MemOperand(x1, x2, PostIndex));
921  __ ld1(v28.V8B(), MemOperand(x1, 8, PostIndex));
922  __ ld1(v5.V8H(), v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
923  __ ld1(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
924  __ ld1(v10.V8H(),
925         v11.V8H(),
926         v12.V8H(),
927         v13.V8H(),
928         MemOperand(x1, 64, PostIndex));
929  __ ld1(v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
930  __ ld1(v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
931  __ ld1(v17.V8H(), v18.V8H(), v19.V8H(), MemOperand(x1, 48, PostIndex));
932  __ ld1(v4.V8H(), v5.V8H(), MemOperand(x0));
933  __ ld1(v21.V8H(), v22.V8H(), MemOperand(x1, x2, PostIndex));
934  __ ld1(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
935  __ ld1(v9.V8H(), MemOperand(x0));
936  __ ld1(v27.V8H(), MemOperand(x1, x2, PostIndex));
937  __ ld1(v26.V8H(), MemOperand(x1, 16, PostIndex));
938  __ ld1(v19.B(), 1, MemOperand(x0));
939  __ ld1(v12.B(), 3, MemOperand(x1, x2, PostIndex));
940  __ ld1(v27.B(), 12, MemOperand(x1, 1, PostIndex));
941  __ ld1(v10.D(), 1, MemOperand(x0));
942  __ ld1(v26.D(), 1, MemOperand(x1, x2, PostIndex));
943  __ ld1(v7.D(), 1, MemOperand(x1, 8, PostIndex));
944  __ ld1(v19.H(), 5, MemOperand(x0));
945  __ ld1(v10.H(), 1, MemOperand(x1, x2, PostIndex));
946  __ ld1(v5.H(), 4, MemOperand(x1, 2, PostIndex));
947  __ ld1(v21.S(), 2, MemOperand(x0));
948  __ ld1(v13.S(), 2, MemOperand(x1, x2, PostIndex));
949  __ ld1(v1.S(), 2, MemOperand(x1, 4, PostIndex));
950  __ ld1r(v2.V16B(), MemOperand(x0));
951  __ ld1r(v2.V16B(), MemOperand(x1, x2, PostIndex));
952  __ ld1r(v22.V16B(), MemOperand(x1, 1, PostIndex));
953  __ ld1r(v25.V1D(), MemOperand(x0));
954  __ ld1r(v9.V1D(), MemOperand(x1, x2, PostIndex));
955  __ ld1r(v23.V1D(), MemOperand(x1, 8, PostIndex));
956  __ ld1r(v19.V2D(), MemOperand(x0));
957  __ ld1r(v21.V2D(), MemOperand(x1, x2, PostIndex));
958  __ ld1r(v30.V2D(), MemOperand(x1, 8, PostIndex));
959  __ ld1r(v24.V2S(), MemOperand(x0));
960  __ ld1r(v26.V2S(), MemOperand(x1, x2, PostIndex));
961  __ ld1r(v28.V2S(), MemOperand(x1, 4, PostIndex));
962  __ ld1r(v19.V4H(), MemOperand(x0));
963  __ ld1r(v1.V4H(), MemOperand(x1, x2, PostIndex));
964  __ ld1r(v21.V4H(), MemOperand(x1, 2, PostIndex));
965  __ ld1r(v15.V4S(), MemOperand(x0));
966  __ ld1r(v21.V4S(), MemOperand(x1, x2, PostIndex));
967  __ ld1r(v23.V4S(), MemOperand(x1, 4, PostIndex));
968  __ ld1r(v26.V8B(), MemOperand(x0));
969  __ ld1r(v14.V8B(), MemOperand(x1, x2, PostIndex));
970  __ ld1r(v19.V8B(), MemOperand(x1, 1, PostIndex));
971  __ ld1r(v13.V8H(), MemOperand(x0));
972  __ ld1r(v30.V8H(), MemOperand(x1, x2, PostIndex));
973  __ ld1r(v27.V8H(), MemOperand(x1, 2, PostIndex));
974  __ ld2(v21.V16B(), v22.V16B(), MemOperand(x0));
975  __ ld2(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
976  __ ld2(v12.V16B(), v13.V16B(), MemOperand(x1, 32, PostIndex));
977  __ ld2(v14.V2D(), v15.V2D(), MemOperand(x0));
978  __ ld2(v0.V2D(), v1.V2D(), MemOperand(x1, x2, PostIndex));
979  __ ld2(v12.V2D(), v13.V2D(), MemOperand(x1, 32, PostIndex));
980  __ ld2(v27.V2S(), v28.V2S(), MemOperand(x0));
981  __ ld2(v2.V2S(), v3.V2S(), MemOperand(x1, x2, PostIndex));
982  __ ld2(v12.V2S(), v13.V2S(), MemOperand(x1, 16, PostIndex));
983  __ ld2(v9.V4H(), v10.V4H(), MemOperand(x0));
984  __ ld2(v23.V4H(), v24.V4H(), MemOperand(x1, x2, PostIndex));
985  __ ld2(v1.V4H(), v2.V4H(), MemOperand(x1, 16, PostIndex));
986  __ ld2(v20.V4S(), v21.V4S(), MemOperand(x0));
987  __ ld2(v10.V4S(), v11.V4S(), MemOperand(x1, x2, PostIndex));
988  __ ld2(v24.V4S(), v25.V4S(), MemOperand(x1, 32, PostIndex));
989  __ ld2(v17.V8B(), v18.V8B(), MemOperand(x0));
990  __ ld2(v13.V8B(), v14.V8B(), MemOperand(x1, x2, PostIndex));
991  __ ld2(v7.V8B(), v8.V8B(), MemOperand(x1, 16, PostIndex));
992  __ ld2(v30.V8H(), v31.V8H(), MemOperand(x0));
993  __ ld2(v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
994  __ ld2(v13.V8H(), v14.V8H(), MemOperand(x1, 32, PostIndex));
995  __ ld2(v5.B(), v6.B(), 12, MemOperand(x0));
996  __ ld2(v16.B(), v17.B(), 7, MemOperand(x1, x2, PostIndex));
997  __ ld2(v29.B(), v30.B(), 2, MemOperand(x1, 2, PostIndex));
998  __ ld2(v11.D(), v12.D(), 1, MemOperand(x0));
999  __ ld2(v26.D(), v27.D(), 0, MemOperand(x1, x2, PostIndex));
1000  __ ld2(v25.D(), v26.D(), 0, MemOperand(x1, 16, PostIndex));
1001  __ ld2(v18.H(), v19.H(), 7, MemOperand(x0));
1002  __ ld2(v17.H(), v18.H(), 5, MemOperand(x1, x2, PostIndex));
1003  __ ld2(v30.H(), v31.H(), 2, MemOperand(x1, 4, PostIndex));
1004  __ ld2(v29.S(), v30.S(), 3, MemOperand(x0));
1005  __ ld2(v28.S(), v29.S(), 0, MemOperand(x1, x2, PostIndex));
1006  __ ld2(v6.S(), v7.S(), 1, MemOperand(x1, 8, PostIndex));
1007  __ ld2r(v26.V16B(), v27.V16B(), MemOperand(x0));
1008  __ ld2r(v21.V16B(), v22.V16B(), MemOperand(x1, x2, PostIndex));
1009  __ ld2r(v5.V16B(), v6.V16B(), MemOperand(x1, 2, PostIndex));
1010  __ ld2r(v26.V1D(), v27.V1D(), MemOperand(x0));
1011  __ ld2r(v14.V1D(), v15.V1D(), MemOperand(x1, x2, PostIndex));
1012  __ ld2r(v23.V1D(), v24.V1D(), MemOperand(x1, 16, PostIndex));
1013  __ ld2r(v11.V2D(), v12.V2D(), MemOperand(x0));
1014  __ ld2r(v29.V2D(), v30.V2D(), MemOperand(x1, x2, PostIndex));
1015  __ ld2r(v15.V2D(), v16.V2D(), MemOperand(x1, 16, PostIndex));
1016  __ ld2r(v26.V2S(), v27.V2S(), MemOperand(x0));
1017  __ ld2r(v22.V2S(), v23.V2S(), MemOperand(x1, x2, PostIndex));
1018  __ ld2r(v2.V2S(), v3.V2S(), MemOperand(x1, 8, PostIndex));
1019  __ ld2r(v2.V4H(), v3.V4H(), MemOperand(x0));
1020  __ ld2r(v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
1021  __ ld2r(v6.V4H(), v7.V4H(), MemOperand(x1, 4, PostIndex));
1022  __ ld2r(v7.V4S(), v8.V4S(), MemOperand(x0));
1023  __ ld2r(v19.V4S(), v20.V4S(), MemOperand(x1, x2, PostIndex));
1024  __ ld2r(v21.V4S(), v22.V4S(), MemOperand(x1, 8, PostIndex));
1025  __ ld2r(v26.V8B(), v27.V8B(), MemOperand(x0));
1026  __ ld2r(v20.V8B(), v21.V8B(), MemOperand(x1, x2, PostIndex));
1027  __ ld2r(v11.V8B(), v12.V8B(), MemOperand(x1, 2, PostIndex));
1028  __ ld2r(v12.V8H(), v13.V8H(), MemOperand(x0));
1029  __ ld2r(v6.V8H(), v7.V8H(), MemOperand(x1, x2, PostIndex));
1030  __ ld2r(v25.V8H(), v26.V8H(), MemOperand(x1, 4, PostIndex));
1031  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x0));
1032  __ ld3(v28.V16B(), v29.V16B(), v30.V16B(), MemOperand(x1, x2, PostIndex));
1033  __ ld3(v20.V16B(), v21.V16B(), v22.V16B(), MemOperand(x1, 48, PostIndex));
1034  __ ld3(v21.V2D(), v22.V2D(), v23.V2D(), MemOperand(x0));
1035  __ ld3(v18.V2D(), v19.V2D(), v20.V2D(), MemOperand(x1, x2, PostIndex));
1036  __ ld3(v27.V2D(), v28.V2D(), v29.V2D(), MemOperand(x1, 48, PostIndex));
1037  __ ld3(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x0));
1038  __ ld3(v20.V2S(), v21.V2S(), v22.V2S(), MemOperand(x1, x2, PostIndex));
1039  __ ld3(v26.V2S(), v27.V2S(), v28.V2S(), MemOperand(x1, 24, PostIndex));
1040  __ ld3(v27.V4H(), v28.V4H(), v29.V4H(), MemOperand(x0));
1041  __ ld3(v28.V4H(), v29.V4H(), v30.V4H(), MemOperand(x1, x2, PostIndex));
1042  __ ld3(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 24, PostIndex));
1043  __ ld3(v2.V4S(), v3.V4S(), v4.V4S(), MemOperand(x0));
1044  __ ld3(v24.V4S(), v25.V4S(), v26.V4S(), MemOperand(x1, x2, PostIndex));
1045  __ ld3(v11.V4S(), v12.V4S(), v13.V4S(), MemOperand(x1, 48, PostIndex));
1046  __ ld3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x0));
1047  __ ld3(v1.V8B(), v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1048  __ ld3(v12.V8B(), v13.V8B(), v14.V8B(), MemOperand(x1, 24, PostIndex));
1049  __ ld3(v22.V8H(), v23.V8H(), v24.V8H(), MemOperand(x0));
1050  __ ld3(v13.V8H(), v14.V8H(), v15.V8H(), MemOperand(x1, x2, PostIndex));
1051  __ ld3(v28.V8H(), v29.V8H(), v30.V8H(), MemOperand(x1, 48, PostIndex));
1052  __ ld3(v21.B(), v22.B(), v23.B(), 11, MemOperand(x0));
1053  __ ld3(v5.B(), v6.B(), v7.B(), 9, MemOperand(x1, x2, PostIndex));
1054  __ ld3(v23.B(), v24.B(), v25.B(), 0, MemOperand(x1, 3, PostIndex));
1055  __ ld3(v16.D(), v17.D(), v18.D(), 0, MemOperand(x0));
1056  __ ld3(v30.D(), v31.D(), v0.D(), 0, MemOperand(x1, x2, PostIndex));
1057  __ ld3(v28.D(), v29.D(), v30.D(), 1, MemOperand(x1, 24, PostIndex));
1058  __ ld3(v13.H(), v14.H(), v15.H(), 2, MemOperand(x0));
1059  __ ld3(v22.H(), v23.H(), v24.H(), 7, MemOperand(x1, x2, PostIndex));
1060  __ ld3(v14.H(), v15.H(), v16.H(), 3, MemOperand(x1, 6, PostIndex));
1061  __ ld3(v22.S(), v23.S(), v24.S(), 3, MemOperand(x0));
1062  __ ld3(v30.S(), v31.S(), v0.S(), 2, MemOperand(x1, x2, PostIndex));
1063  __ ld3(v12.S(), v13.S(), v14.S(), 1, MemOperand(x1, 12, PostIndex));
1064  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x0));
1065  __ ld3r(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, x2, PostIndex));
1066  __ ld3r(v3.V16B(), v4.V16B(), v5.V16B(), MemOperand(x1, 3, PostIndex));
1067  __ ld3r(v4.V1D(), v5.V1D(), v6.V1D(), MemOperand(x0));
1068  __ ld3r(v7.V1D(), v8.V1D(), v9.V1D(), MemOperand(x1, x2, PostIndex));
1069  __ ld3r(v17.V1D(), v18.V1D(), v19.V1D(), MemOperand(x1, 24, PostIndex));
1070  __ ld3r(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x0));
1071  __ ld3r(v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x1, x2, PostIndex));
1072  __ ld3r(v14.V2D(), v15.V2D(), v16.V2D(), MemOperand(x1, 24, PostIndex));
1073  __ ld3r(v10.V2S(), v11.V2S(), v12.V2S(), MemOperand(x0));
1074  __ ld3r(v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x1, x2, PostIndex));
1075  __ ld3r(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, 12, PostIndex));
1076  __ ld3r(v22.V4H(), v23.V4H(), v24.V4H(), MemOperand(x0));
1077  __ ld3r(v6.V4H(), v7.V4H(), v8.V4H(), MemOperand(x1, x2, PostIndex));
1078  __ ld3r(v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x1, 6, PostIndex));
1079  __ ld3r(v26.V4S(), v27.V4S(), v28.V4S(), MemOperand(x0));
1080  __ ld3r(v0.V4S(), v1.V4S(), v2.V4S(), MemOperand(x1, x2, PostIndex));
1081  __ ld3r(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, 12, PostIndex));
1082  __ ld3r(v2.V8B(), v3.V8B(), v4.V8B(), MemOperand(x0));
1083  __ ld3r(v10.V8B(), v11.V8B(), v12.V8B(), MemOperand(x1, x2, PostIndex));
1084  __ ld3r(v28.V8B(), v29.V8B(), v30.V8B(), MemOperand(x1, 3, PostIndex));
1085  __ ld3r(v6.V8H(), v7.V8H(), v8.V8H(), MemOperand(x0));
1086  __ ld3r(v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x1, x2, PostIndex));
1087  __ ld3r(v7.V8H(), v8.V8H(), v9.V8H(), MemOperand(x1, 6, PostIndex));
1088  __ ld4(v3.V16B(), v4.V16B(), v5.V16B(), v6.V16B(), MemOperand(x0));
1089  __ ld4(v2.V16B(),
1090         v3.V16B(),
1091         v4.V16B(),
1092         v5.V16B(),
1093         MemOperand(x1, x2, PostIndex));
1094  __ ld4(v5.V16B(),
1095         v6.V16B(),
1096         v7.V16B(),
1097         v8.V16B(),
1098         MemOperand(x1, 64, PostIndex));
1099  __ ld4(v18.V2D(), v19.V2D(), v20.V2D(), v21.V2D(), MemOperand(x0));
1100  __ ld4(v4.V2D(), v5.V2D(), v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1101  __ ld4(v29.V2D(),
1102         v30.V2D(),
1103         v31.V2D(),
1104         v0.V2D(),
1105         MemOperand(x1, 64, PostIndex));
1106  __ ld4(v27.V2S(), v28.V2S(), v29.V2S(), v30.V2S(), MemOperand(x0));
1107  __ ld4(v24.V2S(),
1108         v25.V2S(),
1109         v26.V2S(),
1110         v27.V2S(),
1111         MemOperand(x1, x2, PostIndex));
1112  __ ld4(v4.V2S(), v5.V2S(), v6.V2S(), v7.V2S(), MemOperand(x1, 32, PostIndex));
1113  __ ld4(v16.V4H(), v17.V4H(), v18.V4H(), v19.V4H(), MemOperand(x0));
1114  __ ld4(v23.V4H(),
1115         v24.V4H(),
1116         v25.V4H(),
1117         v26.V4H(),
1118         MemOperand(x1, x2, PostIndex));
1119  __ ld4(v2.V4H(), v3.V4H(), v4.V4H(), v5.V4H(), MemOperand(x1, 32, PostIndex));
1120  __ ld4(v7.V4S(), v8.V4S(), v9.V4S(), v10.V4S(), MemOperand(x0));
1121  __ ld4(v28.V4S(),
1122         v29.V4S(),
1123         v30.V4S(),
1124         v31.V4S(),
1125         MemOperand(x1, x2, PostIndex));
1126  __ ld4(v29.V4S(),
1127         v30.V4S(),
1128         v31.V4S(),
1129         v0.V4S(),
1130         MemOperand(x1, 64, PostIndex));
1131  __ ld4(v15.V8B(), v16.V8B(), v17.V8B(), v18.V8B(), MemOperand(x0));
1132  __ ld4(v27.V8B(),
1133         v28.V8B(),
1134         v29.V8B(),
1135         v30.V8B(),
1136         MemOperand(x1, x2, PostIndex));
1137  __ ld4(v5.V8B(), v6.V8B(), v7.V8B(), v8.V8B(), MemOperand(x1, 32, PostIndex));
1138  __ ld4(v25.V8H(), v26.V8H(), v27.V8H(), v28.V8H(), MemOperand(x0));
1139  __ ld4(v2.V8H(), v3.V8H(), v4.V8H(), v5.V8H(), MemOperand(x1, x2, PostIndex));
1140  __ ld4(v20.V8H(),
1141         v21.V8H(),
1142         v22.V8H(),
1143         v23.V8H(),
1144         MemOperand(x1, 64, PostIndex));
1145  __ ld4(v20.B(), v21.B(), v22.B(), v23.B(), 3, MemOperand(x0));
1146  __ ld4(v12.B(), v13.B(), v14.B(), v15.B(), 3, MemOperand(x1, x2, PostIndex));
1147  __ ld4(v27.B(), v28.B(), v29.B(), v30.B(), 6, MemOperand(x1, 4, PostIndex));
1148  __ ld4(v28.D(), v29.D(), v30.D(), v31.D(), 1, MemOperand(x0));
1149  __ ld4(v15.D(), v16.D(), v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1150  __ ld4(v16.D(), v17.D(), v18.D(), v19.D(), 1, MemOperand(x1, 32, PostIndex));
1151  __ ld4(v2.H(), v3.H(), v4.H(), v5.H(), 6, MemOperand(x0));
1152  __ ld4(v5.H(), v6.H(), v7.H(), v8.H(), 3, MemOperand(x1, x2, PostIndex));
1153  __ ld4(v7.H(), v8.H(), v9.H(), v10.H(), 6, MemOperand(x1, 8, PostIndex));
1154  __ ld4(v6.S(), v7.S(), v8.S(), v9.S(), 1, MemOperand(x0));
1155  __ ld4(v25.S(), v26.S(), v27.S(), v28.S(), 2, MemOperand(x1, x2, PostIndex));
1156  __ ld4(v8.S(), v9.S(), v10.S(), v11.S(), 3, MemOperand(x1, 16, PostIndex));
1157  __ ld4r(v14.V16B(), v15.V16B(), v16.V16B(), v17.V16B(), MemOperand(x0));
1158  __ ld4r(v13.V16B(),
1159          v14.V16B(),
1160          v15.V16B(),
1161          v16.V16B(),
1162          MemOperand(x1, x2, PostIndex));
1163  __ ld4r(v9.V16B(),
1164          v10.V16B(),
1165          v11.V16B(),
1166          v12.V16B(),
1167          MemOperand(x1, 4, PostIndex));
1168  __ ld4r(v8.V1D(), v9.V1D(), v10.V1D(), v11.V1D(), MemOperand(x0));
1169  __ ld4r(v4.V1D(),
1170          v5.V1D(),
1171          v6.V1D(),
1172          v7.V1D(),
1173          MemOperand(x1, x2, PostIndex));
1174  __ ld4r(v26.V1D(),
1175          v27.V1D(),
1176          v28.V1D(),
1177          v29.V1D(),
1178          MemOperand(x1, 32, PostIndex));
1179  __ ld4r(v19.V2D(), v20.V2D(), v21.V2D(), v22.V2D(), MemOperand(x0));
1180  __ ld4r(v28.V2D(),
1181          v29.V2D(),
1182          v30.V2D(),
1183          v31.V2D(),
1184          MemOperand(x1, x2, PostIndex));
1185  __ ld4r(v15.V2D(),
1186          v16.V2D(),
1187          v17.V2D(),
1188          v18.V2D(),
1189          MemOperand(x1, 32, PostIndex));
1190  __ ld4r(v31.V2S(), v0.V2S(), v1.V2S(), v2.V2S(), MemOperand(x0));
1191  __ ld4r(v28.V2S(),
1192          v29.V2S(),
1193          v30.V2S(),
1194          v31.V2S(),
1195          MemOperand(x1, x2, PostIndex));
1196  __ ld4r(v11.V2S(),
1197          v12.V2S(),
1198          v13.V2S(),
1199          v14.V2S(),
1200          MemOperand(x1, 16, PostIndex));
1201  __ ld4r(v19.V4H(), v20.V4H(), v21.V4H(), v22.V4H(), MemOperand(x0));
1202  __ ld4r(v22.V4H(),
1203          v23.V4H(),
1204          v24.V4H(),
1205          v25.V4H(),
1206          MemOperand(x1, x2, PostIndex));
1207  __ ld4r(v20.V4H(),
1208          v21.V4H(),
1209          v22.V4H(),
1210          v23.V4H(),
1211          MemOperand(x1, 8, PostIndex));
1212  __ ld4r(v16.V4S(), v17.V4S(), v18.V4S(), v19.V4S(), MemOperand(x0));
1213  __ ld4r(v25.V4S(),
1214          v26.V4S(),
1215          v27.V4S(),
1216          v28.V4S(),
1217          MemOperand(x1, x2, PostIndex));
1218  __ ld4r(v23.V4S(),
1219          v24.V4S(),
1220          v25.V4S(),
1221          v26.V4S(),
1222          MemOperand(x1, 16, PostIndex));
1223  __ ld4r(v22.V8B(), v23.V8B(), v24.V8B(), v25.V8B(), MemOperand(x0));
1224  __ ld4r(v27.V8B(),
1225          v28.V8B(),
1226          v29.V8B(),
1227          v30.V8B(),
1228          MemOperand(x1, x2, PostIndex));
1229  __ ld4r(v29.V8B(),
1230          v30.V8B(),
1231          v31.V8B(),
1232          v0.V8B(),
1233          MemOperand(x1, 4, PostIndex));
1234  __ ld4r(v28.V8H(), v29.V8H(), v30.V8H(), v31.V8H(), MemOperand(x0));
1235  __ ld4r(v25.V8H(),
1236          v26.V8H(),
1237          v27.V8H(),
1238          v28.V8H(),
1239          MemOperand(x1, x2, PostIndex));
1240  __ ld4r(v22.V8H(),
1241          v23.V8H(),
1242          v24.V8H(),
1243          v25.V8H(),
1244          MemOperand(x1, 8, PostIndex));
1245  __ mla(v29.V16B(), v7.V16B(), v26.V16B());
1246  __ mla(v6.V2S(), v4.V2S(), v14.V2S());
1247  __ mla(v9.V2S(), v11.V2S(), v0.S(), 2);
1248  __ mla(v5.V4H(), v17.V4H(), v25.V4H());
1249  __ mla(v24.V4H(), v7.V4H(), v11.H(), 3);
1250  __ mla(v12.V4S(), v3.V4S(), v4.V4S());
1251  __ mla(v10.V4S(), v7.V4S(), v7.S(), 3);
1252  __ mla(v3.V8B(), v16.V8B(), v9.V8B());
1253  __ mla(v19.V8H(), v22.V8H(), v18.V8H());
1254  __ mla(v6.V8H(), v2.V8H(), v0.H(), 0);
1255  __ mls(v23.V16B(), v10.V16B(), v11.V16B());
1256  __ mls(v14.V2S(), v31.V2S(), v22.V2S());
1257  __ mls(v28.V2S(), v13.V2S(), v1.S(), 3);
1258  __ mls(v2.V4H(), v19.V4H(), v13.V4H());
1259  __ mls(v18.V4H(), v15.V4H(), v12.H(), 6);
1260  __ mls(v6.V4S(), v11.V4S(), v16.V4S());
1261  __ mls(v23.V4S(), v16.V4S(), v10.S(), 2);
1262  __ mls(v26.V8B(), v13.V8B(), v23.V8B());
1263  __ mls(v10.V8H(), v10.V8H(), v12.V8H());
1264  __ mls(v14.V8H(), v0.V8H(), v14.H(), 7);
1265  __ mov(b22, v1.B(), 3);
1266  __ mov(d7, v13.D(), 1);
1267  __ mov(h26, v21.H(), 2);
1268  __ mov(s26, v19.S(), 0);
1269  __ mov(v26.V16B(), v11.V16B());
1270  __ mov(v20.V8B(), v0.V8B());
1271  __ mov(v19.B(), 13, v6.B(), 4);
1272  __ mov(v4.B(), 13, w19);
1273  __ mov(v11.D(), 1, v8.D(), 0);
1274  __ mov(v3.D(), 0, x30);
1275  __ mov(v29.H(), 4, v11.H(), 7);
1276  __ mov(v2.H(), 6, w6);
1277  __ mov(v22.S(), 0, v5.S(), 2);
1278  __ mov(v24.S(), 3, w8);
1279  __ mov(w18, v1.S(), 3);
1280  __ mov(x28, v21.D(), 0);
1281  __ movi(d24, 0xffff0000ffffff);
1282  __ movi(v29.V16B(), 0x80);
1283  __ movi(v12.V2D(), 0xffff00ff00ffff00);
1284  __ movi(v12.V2S(), 0xec, LSL, 24);
1285  __ movi(v10.V2S(), 0x4c, MSL, 16);
1286  __ movi(v26.V4H(), 0xc0, LSL);
1287  __ movi(v24.V4S(), 0x98, LSL, 16);
1288  __ movi(v1.V4S(), 0xde, MSL, 16);
1289  __ movi(v21.V8B(), 0x4d);
1290  __ movi(v29.V8H(), 0x69, LSL);
1291  __ mul(v1.V16B(), v15.V16B(), v17.V16B());
1292  __ mul(v21.V2S(), v19.V2S(), v29.V2S());
1293  __ mul(v19.V2S(), v5.V2S(), v3.S(), 0);
1294  __ mul(v29.V4H(), v11.V4H(), v2.V4H());
1295  __ mul(v2.V4H(), v7.V4H(), v0.H(), 0);
1296  __ mul(v25.V4S(), v26.V4S(), v16.V4S());
1297  __ mul(v26.V4S(), v6.V4S(), v15.S(), 2);
1298  __ mul(v11.V8B(), v15.V8B(), v31.V8B());
1299  __ mul(v20.V8H(), v31.V8H(), v15.V8H());
1300  __ mul(v29.V8H(), v5.V8H(), v9.H(), 4);
1301  __ mvn(v13.V16B(), v21.V16B());
1302  __ mvn(v28.V8B(), v19.V8B());
1303  __ mvni(v25.V2S(), 0xb8, LSL, 8);
1304  __ mvni(v17.V2S(), 0x6c, MSL, 16);
1305  __ mvni(v29.V4H(), 0x48, LSL);
1306  __ mvni(v20.V4S(), 0x7a, LSL, 16);
1307  __ mvni(v0.V4S(), 0x1e, MSL, 8);
1308  __ mvni(v31.V8H(), 0x3e, LSL);
1309  __ neg(d25, d11);
1310  __ neg(v4.V16B(), v9.V16B());
1311  __ neg(v11.V2D(), v25.V2D());
1312  __ neg(v7.V2S(), v18.V2S());
1313  __ neg(v7.V4H(), v15.V4H());
1314  __ neg(v17.V4S(), v18.V4S());
1315  __ neg(v20.V8B(), v17.V8B());
1316  __ neg(v0.V8H(), v11.V8H());
1317  __ orn(v13.V16B(), v11.V16B(), v31.V16B());
1318  __ orn(v22.V8B(), v16.V8B(), v22.V8B());
1319  __ orr(v17.V16B(), v17.V16B(), v23.V16B());
1320  __ orr(v8.V2S(), 0xe3);
1321  __ orr(v11.V4H(), 0x97, 8);
1322  __ orr(v7.V4S(), 0xab);
1323  __ orr(v8.V8B(), v4.V8B(), v3.V8B());
1324  __ orr(v31.V8H(), 0xb0, 8);
1325  __ pmul(v11.V16B(), v18.V16B(), v23.V16B());
1326  __ pmul(v8.V8B(), v24.V8B(), v5.V8B());
1327  __ pmull(v24.V8H(), v18.V8B(), v22.V8B());
1328  __ pmull2(v13.V8H(), v3.V16B(), v21.V16B());
1329  __ raddhn(v22.V2S(), v10.V2D(), v21.V2D());
1330  __ raddhn(v5.V4H(), v13.V4S(), v13.V4S());
1331  __ raddhn(v10.V8B(), v17.V8H(), v26.V8H());
1332  __ raddhn2(v9.V16B(), v29.V8H(), v13.V8H());
1333  __ raddhn2(v27.V4S(), v23.V2D(), v26.V2D());
1334  __ raddhn2(v0.V8H(), v29.V4S(), v7.V4S());
1335  __ rbit(v22.V16B(), v15.V16B());
1336  __ rbit(v30.V8B(), v3.V8B());
1337  __ rev16(v31.V16B(), v27.V16B());
1338  __ rev16(v12.V8B(), v26.V8B());
1339  __ rev32(v5.V16B(), v4.V16B());
1340  __ rev32(v16.V4H(), v26.V4H());
1341  __ rev32(v20.V8B(), v3.V8B());
1342  __ rev32(v20.V8H(), v28.V8H());
1343  __ rev64(v9.V16B(), v19.V16B());
1344  __ rev64(v5.V2S(), v16.V2S());
1345  __ rev64(v7.V4H(), v31.V4H());
1346  __ rev64(v15.V4S(), v26.V4S());
1347  __ rev64(v25.V8B(), v9.V8B());
1348  __ rev64(v11.V8H(), v5.V8H());
1349  __ rshrn(v18.V2S(), v13.V2D(), 1);
1350  __ rshrn(v25.V4H(), v30.V4S(), 2);
1351  __ rshrn(v13.V8B(), v9.V8H(), 8);
1352  __ rshrn2(v3.V16B(), v6.V8H(), 8);
1353  __ rshrn2(v0.V4S(), v29.V2D(), 25);
1354  __ rshrn2(v27.V8H(), v26.V4S(), 15);
1355  __ rsubhn(v15.V2S(), v25.V2D(), v4.V2D());
1356  __ rsubhn(v23.V4H(), v9.V4S(), v3.V4S());
1357  __ rsubhn(v6.V8B(), v30.V8H(), v24.V8H());
1358  __ rsubhn2(v4.V16B(), v24.V8H(), v20.V8H());
1359  __ rsubhn2(v1.V4S(), v23.V2D(), v22.V2D());
1360  __ rsubhn2(v19.V8H(), v2.V4S(), v20.V4S());
1361  __ saba(v28.V16B(), v9.V16B(), v25.V16B());
1362  __ saba(v9.V2S(), v28.V2S(), v20.V2S());
1363  __ saba(v17.V4H(), v22.V4H(), v22.V4H());
1364  __ saba(v29.V4S(), v5.V4S(), v27.V4S());
1365  __ saba(v20.V8B(), v21.V8B(), v18.V8B());
1366  __ saba(v27.V8H(), v17.V8H(), v30.V8H());
1367  __ sabal(v20.V2D(), v13.V2S(), v7.V2S());
1368  __ sabal(v4.V4S(), v12.V4H(), v4.V4H());
1369  __ sabal(v23.V8H(), v24.V8B(), v20.V8B());
1370  __ sabal2(v26.V2D(), v21.V4S(), v18.V4S());
1371  __ sabal2(v27.V4S(), v28.V8H(), v8.V8H());
1372  __ sabal2(v12.V8H(), v16.V16B(), v21.V16B());
1373  __ sabd(v0.V16B(), v15.V16B(), v13.V16B());
1374  __ sabd(v15.V2S(), v7.V2S(), v30.V2S());
1375  __ sabd(v17.V4H(), v17.V4H(), v12.V4H());
1376  __ sabd(v7.V4S(), v4.V4S(), v22.V4S());
1377  __ sabd(v23.V8B(), v3.V8B(), v26.V8B());
1378  __ sabd(v20.V8H(), v28.V8H(), v5.V8H());
1379  __ sabdl(v27.V2D(), v22.V2S(), v20.V2S());
1380  __ sabdl(v31.V4S(), v20.V4H(), v23.V4H());
1381  __ sabdl(v0.V8H(), v20.V8B(), v27.V8B());
1382  __ sabdl2(v31.V2D(), v11.V4S(), v3.V4S());
1383  __ sabdl2(v26.V4S(), v11.V8H(), v27.V8H());
1384  __ sabdl2(v6.V8H(), v8.V16B(), v18.V16B());
1385  __ sadalp(v8.V1D(), v26.V2S());
1386  __ sadalp(v12.V2D(), v26.V4S());
1387  __ sadalp(v12.V2S(), v26.V4H());
1388  __ sadalp(v4.V4H(), v1.V8B());
1389  __ sadalp(v15.V4S(), v17.V8H());
1390  __ sadalp(v21.V8H(), v25.V16B());
1391  __ saddl(v5.V2D(), v10.V2S(), v14.V2S());
1392  __ saddl(v18.V4S(), v3.V4H(), v15.V4H());
1393  __ saddl(v15.V8H(), v2.V8B(), v23.V8B());
1394  __ saddl2(v16.V2D(), v16.V4S(), v27.V4S());
1395  __ saddl2(v6.V4S(), v24.V8H(), v0.V8H());
1396  __ saddl2(v7.V8H(), v20.V16B(), v28.V16B());
1397  __ saddlp(v10.V1D(), v25.V2S());
1398  __ saddlp(v15.V2D(), v16.V4S());
1399  __ saddlp(v18.V2S(), v10.V4H());
1400  __ saddlp(v29.V4H(), v26.V8B());
1401  __ saddlp(v10.V4S(), v1.V8H());
1402  __ saddlp(v0.V8H(), v21.V16B());
1403  __ saddlv(d12, v7.V4S());
1404  __ saddlv(h14, v28.V16B());
1405  __ saddlv(h30, v30.V8B());
1406  __ saddlv(s27, v3.V4H());
1407  __ saddlv(s16, v16.V8H());
1408  __ saddw(v24.V2D(), v11.V2D(), v18.V2S());
1409  __ saddw(v13.V4S(), v12.V4S(), v6.V4H());
1410  __ saddw(v19.V8H(), v19.V8H(), v7.V8B());
1411  __ saddw2(v27.V2D(), v9.V2D(), v26.V4S());
1412  __ saddw2(v19.V4S(), v23.V4S(), v21.V8H());
1413  __ saddw2(v15.V8H(), v25.V8H(), v30.V16B());
1414  __ shadd(v7.V16B(), v4.V16B(), v9.V16B());
1415  __ shadd(v29.V2S(), v25.V2S(), v24.V2S());
1416  __ shadd(v31.V4H(), v10.V4H(), v13.V4H());
1417  __ shadd(v21.V4S(), v16.V4S(), v8.V4S());
1418  __ shadd(v14.V8B(), v29.V8B(), v22.V8B());
1419  __ shadd(v19.V8H(), v24.V8H(), v20.V8H());
1420  __ shl(d22, d25, 23);
1421  __ shl(v5.V16B(), v17.V16B(), 7);
1422  __ shl(v2.V2D(), v4.V2D(), 21);
1423  __ shl(v4.V2S(), v3.V2S(), 26);
1424  __ shl(v3.V4H(), v28.V4H(), 8);
1425  __ shl(v4.V4S(), v31.V4S(), 24);
1426  __ shl(v18.V8B(), v16.V8B(), 2);
1427  __ shl(v0.V8H(), v11.V8H(), 3);
1428  __ shll(v5.V2D(), v24.V2S(), 32);
1429  __ shll(v26.V4S(), v20.V4H(), 16);
1430  __ shll(v5.V8H(), v9.V8B(), 8);
1431  __ shll2(v21.V2D(), v28.V4S(), 32);
1432  __ shll2(v22.V4S(), v1.V8H(), 16);
1433  __ shll2(v30.V8H(), v25.V16B(), 8);
1434  __ shrn(v5.V2S(), v1.V2D(), 28);
1435  __ shrn(v29.V4H(), v18.V4S(), 7);
1436  __ shrn(v17.V8B(), v29.V8H(), 2);
1437  __ shrn2(v5.V16B(), v30.V8H(), 3);
1438  __ shrn2(v24.V4S(), v1.V2D(), 1);
1439  __ shrn2(v5.V8H(), v14.V4S(), 16);
1440  __ shsub(v30.V16B(), v22.V16B(), v23.V16B());
1441  __ shsub(v22.V2S(), v27.V2S(), v25.V2S());
1442  __ shsub(v13.V4H(), v22.V4H(), v1.V4H());
1443  __ shsub(v10.V4S(), v8.V4S(), v23.V4S());
1444  __ shsub(v6.V8B(), v9.V8B(), v31.V8B());
1445  __ shsub(v8.V8H(), v31.V8H(), v8.V8H());
1446  __ sli(d19, d29, 20);
1447  __ sli(v9.V16B(), v24.V16B(), 0);
1448  __ sli(v22.V2D(), v9.V2D(), 10);
1449  __ sli(v11.V2S(), v27.V2S(), 20);
1450  __ sli(v16.V4H(), v15.V4H(), 5);
1451  __ sli(v8.V4S(), v8.V4S(), 25);
1452  __ sli(v10.V8B(), v30.V8B(), 0);
1453  __ sli(v7.V8H(), v28.V8H(), 6);
1454  __ smax(v18.V16B(), v8.V16B(), v1.V16B());
1455  __ smax(v30.V2S(), v5.V2S(), v1.V2S());
1456  __ smax(v17.V4H(), v25.V4H(), v19.V4H());
1457  __ smax(v1.V4S(), v24.V4S(), v31.V4S());
1458  __ smax(v17.V8B(), v24.V8B(), v24.V8B());
1459  __ smax(v11.V8H(), v26.V8H(), v10.V8H());
1460  __ smaxp(v12.V16B(), v14.V16B(), v7.V16B());
1461  __ smaxp(v31.V2S(), v24.V2S(), v6.V2S());
1462  __ smaxp(v10.V4H(), v29.V4H(), v10.V4H());
1463  __ smaxp(v18.V4S(), v11.V4S(), v7.V4S());
1464  __ smaxp(v21.V8B(), v0.V8B(), v18.V8B());
1465  __ smaxp(v26.V8H(), v8.V8H(), v15.V8H());
1466  __ smaxv(b4, v5.V16B());
1467  __ smaxv(b23, v0.V8B());
1468  __ smaxv(h6, v0.V4H());
1469  __ smaxv(h24, v8.V8H());
1470  __ smaxv(s3, v16.V4S());
1471  __ smin(v24.V16B(), v8.V16B(), v18.V16B());
1472  __ smin(v29.V2S(), v8.V2S(), v23.V2S());
1473  __ smin(v6.V4H(), v11.V4H(), v21.V4H());
1474  __ smin(v24.V4S(), v23.V4S(), v15.V4S());
1475  __ smin(v8.V8B(), v16.V8B(), v4.V8B());
1476  __ smin(v12.V8H(), v1.V8H(), v10.V8H());
1477  __ sminp(v13.V16B(), v18.V16B(), v28.V16B());
1478  __ sminp(v22.V2S(), v28.V2S(), v16.V2S());
1479  __ sminp(v15.V4H(), v12.V4H(), v5.V4H());
1480  __ sminp(v15.V4S(), v17.V4S(), v8.V4S());
1481  __ sminp(v21.V8B(), v2.V8B(), v6.V8B());
1482  __ sminp(v21.V8H(), v12.V8H(), v6.V8H());
1483  __ sminv(b8, v6.V16B());
1484  __ sminv(b6, v18.V8B());
1485  __ sminv(h20, v1.V4H());
1486  __ sminv(h7, v17.V8H());
1487  __ sminv(s21, v4.V4S());
1488  __ smlal(v24.V2D(), v14.V2S(), v21.V2S());
1489  __ smlal(v31.V2D(), v3.V2S(), v14.S(), 2);
1490  __ smlal(v7.V4S(), v20.V4H(), v21.V4H());
1491  __ smlal(v19.V4S(), v16.V4H(), v9.H(), 3);
1492  __ smlal(v29.V8H(), v14.V8B(), v1.V8B());
1493  __ smlal2(v30.V2D(), v26.V4S(), v16.V4S());
1494  __ smlal2(v31.V2D(), v30.V4S(), v1.S(), 0);
1495  __ smlal2(v17.V4S(), v6.V8H(), v3.V8H());
1496  __ smlal2(v11.V4S(), v31.V8H(), v5.H(), 7);
1497  __ smlal2(v30.V8H(), v16.V16B(), v29.V16B());
1498  __ smlsl(v1.V2D(), v20.V2S(), v17.V2S());
1499  __ smlsl(v29.V2D(), v12.V2S(), v5.S(), 3);
1500  __ smlsl(v0.V4S(), v26.V4H(), v1.V4H());
1501  __ smlsl(v3.V4S(), v5.V4H(), v6.H(), 5);
1502  __ smlsl(v4.V8H(), v0.V8B(), v26.V8B());
1503  __ smlsl2(v14.V2D(), v14.V4S(), v5.V4S());
1504  __ smlsl2(v15.V2D(), v5.V4S(), v0.S(), 1);
1505  __ smlsl2(v29.V4S(), v17.V8H(), v31.V8H());
1506  __ smlsl2(v6.V4S(), v15.V8H(), v9.H(), 6);
1507  __ smlsl2(v30.V8H(), v15.V16B(), v15.V16B());
1508  __ smov(w21, v6.B(), 3);
1509  __ smov(w13, v26.H(), 7);
1510  __ smov(x24, v16.B(), 7);
1511  __ smov(x7, v4.H(), 3);
1512  __ smov(x29, v7.S(), 1);
1513  __ smull(v4.V2D(), v29.V2S(), v17.V2S());
1514  __ smull(v30.V2D(), v21.V2S(), v6.S(), 2);
1515  __ smull(v23.V4S(), v5.V4H(), v23.V4H());
1516  __ smull(v8.V4S(), v9.V4H(), v2.H(), 1);
1517  __ smull(v31.V8H(), v17.V8B(), v1.V8B());
1518  __ smull2(v3.V2D(), v3.V4S(), v23.V4S());
1519  __ smull2(v15.V2D(), v29.V4S(), v6.S(), 1);
1520  __ smull2(v19.V4S(), v20.V8H(), v30.V8H());
1521  __ smull2(v6.V4S(), v10.V8H(), v7.H(), 4);
1522  __ smull2(v25.V8H(), v8.V16B(), v27.V16B());
1523  __ sqabs(b3, b15);
1524  __ sqabs(d14, d9);
1525  __ sqabs(h31, h28);
1526  __ sqabs(s8, s0);
1527  __ sqabs(v14.V16B(), v7.V16B());
1528  __ sqabs(v23.V2D(), v19.V2D());
1529  __ sqabs(v10.V2S(), v24.V2S());
1530  __ sqabs(v31.V4H(), v19.V4H());
1531  __ sqabs(v23.V4S(), v0.V4S());
1532  __ sqabs(v29.V8B(), v23.V8B());
1533  __ sqabs(v17.V8H(), v21.V8H());
1534  __ sqadd(b9, b23, b13);
1535  __ sqadd(d2, d25, d26);
1536  __ sqadd(h7, h29, h25);
1537  __ sqadd(s11, s7, s24);
1538  __ sqadd(v20.V16B(), v16.V16B(), v29.V16B());
1539  __ sqadd(v23.V2D(), v30.V2D(), v28.V2D());
1540  __ sqadd(v8.V2S(), v19.V2S(), v2.V2S());
1541  __ sqadd(v20.V4H(), v12.V4H(), v31.V4H());
1542  __ sqadd(v14.V4S(), v15.V4S(), v17.V4S());
1543  __ sqadd(v2.V8B(), v29.V8B(), v13.V8B());
1544  __ sqadd(v7.V8H(), v19.V8H(), v14.V8H());
1545  __ sqdmlal(d15, s5, s30);
1546  __ sqdmlal(d24, s10, v2.S(), 3);
1547  __ sqdmlal(s9, h19, h8);
1548  __ sqdmlal(s14, h1, v12.H(), 3);
1549  __ sqdmlal(v30.V2D(), v5.V2S(), v31.V2S());
1550  __ sqdmlal(v25.V2D(), v14.V2S(), v10.S(), 1);
1551  __ sqdmlal(v19.V4S(), v17.V4H(), v16.V4H());
1552  __ sqdmlal(v8.V4S(), v5.V4H(), v8.H(), 1);
1553  __ sqdmlal2(v1.V2D(), v23.V4S(), v3.V4S());
1554  __ sqdmlal2(v19.V2D(), v0.V4S(), v9.S(), 0);
1555  __ sqdmlal2(v26.V4S(), v22.V8H(), v11.V8H());
1556  __ sqdmlal2(v6.V4S(), v28.V8H(), v13.H(), 4);
1557  __ sqdmlsl(d10, s29, s20);
1558  __ sqdmlsl(d10, s9, v10.S(), 1);
1559  __ sqdmlsl(s30, h9, h24);
1560  __ sqdmlsl(s13, h24, v6.H(), 1);
1561  __ sqdmlsl(v27.V2D(), v10.V2S(), v20.V2S());
1562  __ sqdmlsl(v23.V2D(), v23.V2S(), v3.S(), 3);
1563  __ sqdmlsl(v7.V4S(), v17.V4H(), v29.V4H());
1564  __ sqdmlsl(v22.V4S(), v21.V4H(), v3.H(), 4);
1565  __ sqdmlsl2(v12.V2D(), v7.V4S(), v22.V4S());
1566  __ sqdmlsl2(v20.V2D(), v25.V4S(), v8.S(), 0);
1567  __ sqdmlsl2(v25.V4S(), v26.V8H(), v18.V8H());
1568  __ sqdmlsl2(v25.V4S(), v19.V8H(), v5.H(), 0);
1569  __ sqdmulh(h17, h27, h12);
1570  __ sqdmulh(h16, h5, v11.H(), 0);
1571  __ sqdmulh(s1, s19, s16);
1572  __ sqdmulh(s1, s16, v2.S(), 0);
1573  __ sqdmulh(v28.V2S(), v1.V2S(), v8.V2S());
1574  __ sqdmulh(v28.V2S(), v8.V2S(), v3.S(), 0);
1575  __ sqdmulh(v11.V4H(), v25.V4H(), v5.V4H());
1576  __ sqdmulh(v30.V4H(), v14.V4H(), v8.H(), 5);
1577  __ sqdmulh(v25.V4S(), v21.V4S(), v13.V4S());
1578  __ sqdmulh(v23.V4S(), v2.V4S(), v10.S(), 3);
1579  __ sqdmulh(v26.V8H(), v5.V8H(), v23.V8H());
1580  __ sqdmulh(v4.V8H(), v22.V8H(), v4.H(), 3);
1581  __ sqdmull(d25, s2, s26);
1582  __ sqdmull(d30, s14, v5.S(), 1);
1583  __ sqdmull(s29, h18, h11);
1584  __ sqdmull(s11, h13, v7.H(), 6);
1585  __ sqdmull(v23.V2D(), v9.V2S(), v8.V2S());
1586  __ sqdmull(v18.V2D(), v29.V2S(), v4.S(), 1);
1587  __ sqdmull(v17.V4S(), v24.V4H(), v7.V4H());
1588  __ sqdmull(v8.V4S(), v15.V4H(), v5.H(), 1);
1589  __ sqdmull2(v28.V2D(), v14.V4S(), v2.V4S());
1590  __ sqdmull2(v1.V2D(), v24.V4S(), v13.S(), 2);
1591  __ sqdmull2(v11.V4S(), v17.V8H(), v31.V8H());
1592  __ sqdmull2(v1.V4S(), v20.V8H(), v11.H(), 3);
1593  __ sqneg(b2, b0);
1594  __ sqneg(d24, d2);
1595  __ sqneg(h29, h3);
1596  __ sqneg(s4, s9);
1597  __ sqneg(v14.V16B(), v29.V16B());
1598  __ sqneg(v30.V2D(), v12.V2D());
1599  __ sqneg(v28.V2S(), v26.V2S());
1600  __ sqneg(v4.V4H(), v4.V4H());
1601  __ sqneg(v9.V4S(), v8.V4S());
1602  __ sqneg(v20.V8B(), v20.V8B());
1603  __ sqneg(v27.V8H(), v10.V8H());
1604  __ sqrdmulh(h7, h24, h0);
1605  __ sqrdmulh(h14, h3, v4.H(), 6);
1606  __ sqrdmulh(s27, s19, s24);
1607  __ sqrdmulh(s31, s21, v4.S(), 0);
1608  __ sqrdmulh(v18.V2S(), v25.V2S(), v1.V2S());
1609  __ sqrdmulh(v22.V2S(), v5.V2S(), v13.S(), 0);
1610  __ sqrdmulh(v22.V4H(), v24.V4H(), v9.V4H());
1611  __ sqrdmulh(v13.V4H(), v2.V4H(), v12.H(), 6);
1612  __ sqrdmulh(v9.V4S(), v27.V4S(), v2.V4S());
1613  __ sqrdmulh(v3.V4S(), v23.V4S(), v7.S(), 1);
1614  __ sqrdmulh(v2.V8H(), v0.V8H(), v7.V8H());
1615  __ sqrdmulh(v16.V8H(), v9.V8H(), v8.H(), 2);
1616  __ sqrshl(b8, b21, b13);
1617  __ sqrshl(d29, d7, d20);
1618  __ sqrshl(h28, h14, h10);
1619  __ sqrshl(s26, s18, s2);
1620  __ sqrshl(v18.V16B(), v31.V16B(), v26.V16B());
1621  __ sqrshl(v28.V2D(), v4.V2D(), v0.V2D());
1622  __ sqrshl(v3.V2S(), v6.V2S(), v0.V2S());
1623  __ sqrshl(v1.V4H(), v18.V4H(), v22.V4H());
1624  __ sqrshl(v16.V4S(), v25.V4S(), v7.V4S());
1625  __ sqrshl(v0.V8B(), v21.V8B(), v5.V8B());
1626  __ sqrshl(v30.V8H(), v19.V8H(), v8.V8H());
1627  __ sqrshrn(b6, h21, 4);
1628  __ sqrshrn(h14, s17, 11);
1629  __ sqrshrn(s25, d27, 10);
1630  __ sqrshrn(v6.V2S(), v13.V2D(), 18);
1631  __ sqrshrn(v5.V4H(), v9.V4S(), 15);
1632  __ sqrshrn(v19.V8B(), v12.V8H(), 1);
1633  __ sqrshrn2(v19.V16B(), v21.V8H(), 7);
1634  __ sqrshrn2(v29.V4S(), v24.V2D(), 13);
1635  __ sqrshrn2(v12.V8H(), v2.V4S(), 10);
1636  __ sqrshrun(b16, h9, 5);
1637  __ sqrshrun(h3, s24, 15);
1638  __ sqrshrun(s16, d18, 8);
1639  __ sqrshrun(v28.V2S(), v23.V2D(), 8);
1640  __ sqrshrun(v31.V4H(), v25.V4S(), 10);
1641  __ sqrshrun(v19.V8B(), v23.V8H(), 2);
1642  __ sqrshrun2(v24.V16B(), v0.V8H(), 8);
1643  __ sqrshrun2(v22.V4S(), v1.V2D(), 23);
1644  __ sqrshrun2(v28.V8H(), v21.V4S(), 13);
1645  __ sqshl(b6, b21, b8);
1646  __ sqshl(b11, b26, 2);
1647  __ sqshl(d29, d0, d4);
1648  __ sqshl(d21, d7, 35);
1649  __ sqshl(h20, h25, h17);
1650  __ sqshl(h20, h0, 8);
1651  __ sqshl(s29, s13, s4);
1652  __ sqshl(s10, s11, 20);
1653  __ sqshl(v8.V16B(), v18.V16B(), v28.V16B());
1654  __ sqshl(v29.V16B(), v29.V16B(), 2);
1655  __ sqshl(v8.V2D(), v31.V2D(), v16.V2D());
1656  __ sqshl(v7.V2D(), v14.V2D(), 37);
1657  __ sqshl(v0.V2S(), v26.V2S(), v7.V2S());
1658  __ sqshl(v5.V2S(), v11.V2S(), 19);
1659  __ sqshl(v11.V4H(), v30.V4H(), v0.V4H());
1660  __ sqshl(v1.V4H(), v18.V4H(), 7);
1661  __ sqshl(v22.V4S(), v3.V4S(), v30.V4S());
1662  __ sqshl(v16.V4S(), v15.V4S(), 28);
1663  __ sqshl(v6.V8B(), v28.V8B(), v25.V8B());
1664  __ sqshl(v0.V8B(), v15.V8B(), 0);
1665  __ sqshl(v6.V8H(), v16.V8H(), v30.V8H());
1666  __ sqshl(v3.V8H(), v20.V8H(), 14);
1667  __ sqshlu(b13, b14, 6);
1668  __ sqshlu(d0, d16, 44);
1669  __ sqshlu(h5, h29, 15);
1670  __ sqshlu(s29, s8, 13);
1671  __ sqshlu(v27.V16B(), v20.V16B(), 2);
1672  __ sqshlu(v24.V2D(), v12.V2D(), 11);
1673  __ sqshlu(v12.V2S(), v19.V2S(), 22);
1674  __ sqshlu(v8.V4H(), v12.V4H(), 11);
1675  __ sqshlu(v18.V4S(), v3.V4S(), 8);
1676  __ sqshlu(v3.V8B(), v10.V8B(), 1);
1677  __ sqshlu(v30.V8H(), v24.V8H(), 4);
1678  __ sqshrn(b1, h28, 1);
1679  __ sqshrn(h31, s7, 10);
1680  __ sqshrn(s4, d10, 24);
1681  __ sqshrn(v10.V2S(), v1.V2D(), 29);
1682  __ sqshrn(v3.V4H(), v13.V4S(), 14);
1683  __ sqshrn(v27.V8B(), v6.V8H(), 7);
1684  __ sqshrn2(v14.V16B(), v23.V8H(), 1);
1685  __ sqshrn2(v25.V4S(), v22.V2D(), 27);
1686  __ sqshrn2(v31.V8H(), v12.V4S(), 10);
1687  __ sqshrun(b9, h0, 1);
1688  __ sqshrun(h11, s6, 7);
1689  __ sqshrun(s13, d12, 13);
1690  __ sqshrun(v10.V2S(), v30.V2D(), 1);
1691  __ sqshrun(v31.V4H(), v3.V4S(), 11);
1692  __ sqshrun(v28.V8B(), v30.V8H(), 8);
1693  __ sqshrun2(v16.V16B(), v27.V8H(), 3);
1694  __ sqshrun2(v27.V4S(), v14.V2D(), 18);
1695  __ sqshrun2(v23.V8H(), v14.V4S(), 1);
1696  __ sqsub(b19, b29, b11);
1697  __ sqsub(d21, d31, d6);
1698  __ sqsub(h18, h10, h19);
1699  __ sqsub(s6, s5, s0);
1700  __ sqsub(v21.V16B(), v22.V16B(), v0.V16B());
1701  __ sqsub(v22.V2D(), v10.V2D(), v17.V2D());
1702  __ sqsub(v8.V2S(), v21.V2S(), v2.V2S());
1703  __ sqsub(v18.V4H(), v25.V4H(), v27.V4H());
1704  __ sqsub(v13.V4S(), v3.V4S(), v6.V4S());
1705  __ sqsub(v28.V8B(), v29.V8B(), v16.V8B());
1706  __ sqsub(v17.V8H(), v6.V8H(), v10.V8H());
1707  __ sqxtn(b27, h26);
1708  __ sqxtn(h17, s11);
1709  __ sqxtn(s22, d31);
1710  __ sqxtn(v26.V2S(), v5.V2D());
1711  __ sqxtn(v13.V4H(), v7.V4S());
1712  __ sqxtn(v19.V8B(), v19.V8H());
1713  __ sqxtn2(v19.V16B(), v3.V8H());
1714  __ sqxtn2(v23.V4S(), v1.V2D());
1715  __ sqxtn2(v13.V8H(), v3.V4S());
1716  __ sqxtun(b26, h9);
1717  __ sqxtun(h19, s12);
1718  __ sqxtun(s3, d6);
1719  __ sqxtun(v29.V2S(), v26.V2D());
1720  __ sqxtun(v26.V4H(), v10.V4S());
1721  __ sqxtun(v7.V8B(), v29.V8H());
1722  __ sqxtun2(v21.V16B(), v14.V8H());
1723  __ sqxtun2(v24.V4S(), v15.V2D());
1724  __ sqxtun2(v30.V8H(), v1.V4S());
1725  __ srhadd(v21.V16B(), v17.V16B(), v15.V16B());
1726  __ srhadd(v28.V2S(), v21.V2S(), v29.V2S());
1727  __ srhadd(v9.V4H(), v1.V4H(), v30.V4H());
1728  __ srhadd(v24.V4S(), v0.V4S(), v2.V4S());
1729  __ srhadd(v6.V8B(), v17.V8B(), v15.V8B());
1730  __ srhadd(v5.V8H(), v7.V8H(), v21.V8H());
1731  __ sri(d14, d14, 49);
1732  __ sri(v23.V16B(), v8.V16B(), 4);
1733  __ sri(v20.V2D(), v13.V2D(), 20);
1734  __ sri(v16.V2S(), v2.V2S(), 24);
1735  __ sri(v5.V4H(), v23.V4H(), 11);
1736  __ sri(v27.V4S(), v15.V4S(), 23);
1737  __ sri(v19.V8B(), v29.V8B(), 4);
1738  __ sri(v7.V8H(), v29.V8H(), 3);
1739  __ srshl(d2, d9, d26);
1740  __ srshl(v29.V16B(), v17.V16B(), v11.V16B());
1741  __ srshl(v8.V2D(), v15.V2D(), v4.V2D());
1742  __ srshl(v25.V2S(), v17.V2S(), v8.V2S());
1743  __ srshl(v19.V4H(), v7.V4H(), v7.V4H());
1744  __ srshl(v13.V4S(), v2.V4S(), v17.V4S());
1745  __ srshl(v22.V8B(), v6.V8B(), v21.V8B());
1746  __ srshl(v10.V8H(), v17.V8H(), v4.V8H());
1747  __ srshr(d21, d18, 45);
1748  __ srshr(v3.V16B(), v11.V16B(), 7);
1749  __ srshr(v21.V2D(), v26.V2D(), 53);
1750  __ srshr(v11.V2S(), v5.V2S(), 28);
1751  __ srshr(v7.V4H(), v18.V4H(), 12);
1752  __ srshr(v7.V4S(), v3.V4S(), 30);
1753  __ srshr(v14.V8B(), v2.V8B(), 6);
1754  __ srshr(v21.V8H(), v20.V8H(), 3);
1755  __ srsra(d21, d30, 63);
1756  __ srsra(v27.V16B(), v30.V16B(), 6);
1757  __ srsra(v20.V2D(), v12.V2D(), 27);
1758  __ srsra(v0.V2S(), v17.V2S(), 5);
1759  __ srsra(v14.V4H(), v16.V4H(), 15);
1760  __ srsra(v18.V4S(), v3.V4S(), 20);
1761  __ srsra(v21.V8B(), v1.V8B(), 1);
1762  __ srsra(v31.V8H(), v25.V8H(), 2);
1763  __ sshl(d1, d13, d9);
1764  __ sshl(v17.V16B(), v31.V16B(), v15.V16B());
1765  __ sshl(v13.V2D(), v16.V2D(), v0.V2D());
1766  __ sshl(v0.V2S(), v7.V2S(), v22.V2S());
1767  __ sshl(v23.V4H(), v19.V4H(), v4.V4H());
1768  __ sshl(v5.V4S(), v5.V4S(), v11.V4S());
1769  __ sshl(v23.V8B(), v27.V8B(), v7.V8B());
1770  __ sshl(v29.V8H(), v10.V8H(), v5.V8H());
1771  __ sshll(v0.V2D(), v2.V2S(), 23);
1772  __ sshll(v11.V4S(), v8.V4H(), 8);
1773  __ sshll(v4.V8H(), v29.V8B(), 1);
1774  __ sshll2(v10.V2D(), v4.V4S(), 14);
1775  __ sshll2(v26.V4S(), v31.V8H(), 6);
1776  __ sshll2(v3.V8H(), v26.V16B(), 4);
1777  __ sshr(d19, d21, 20);
1778  __ sshr(v15.V16B(), v23.V16B(), 5);
1779  __ sshr(v17.V2D(), v14.V2D(), 38);
1780  __ sshr(v3.V2S(), v29.V2S(), 23);
1781  __ sshr(v23.V4H(), v27.V4H(), 4);
1782  __ sshr(v28.V4S(), v3.V4S(), 4);
1783  __ sshr(v14.V8B(), v2.V8B(), 6);
1784  __ sshr(v3.V8H(), v8.V8H(), 6);
1785  __ ssra(d12, d28, 44);
1786  __ ssra(v29.V16B(), v31.V16B(), 4);
1787  __ ssra(v3.V2D(), v0.V2D(), 24);
1788  __ ssra(v14.V2S(), v28.V2S(), 6);
1789  __ ssra(v18.V4H(), v8.V4H(), 7);
1790  __ ssra(v31.V4S(), v14.V4S(), 24);
1791  __ ssra(v28.V8B(), v26.V8B(), 5);
1792  __ ssra(v9.V8H(), v9.V8H(), 14);
1793  __ ssubl(v13.V2D(), v14.V2S(), v3.V2S());
1794  __ ssubl(v5.V4S(), v16.V4H(), v8.V4H());
1795  __ ssubl(v0.V8H(), v28.V8B(), v6.V8B());
1796  __ ssubl2(v5.V2D(), v13.V4S(), v25.V4S());
1797  __ ssubl2(v3.V4S(), v15.V8H(), v17.V8H());
1798  __ ssubl2(v15.V8H(), v15.V16B(), v14.V16B());
1799  __ ssubw(v25.V2D(), v23.V2D(), v26.V2S());
1800  __ ssubw(v21.V4S(), v18.V4S(), v24.V4H());
1801  __ ssubw(v30.V8H(), v22.V8H(), v3.V8B());
1802  __ ssubw2(v16.V2D(), v24.V2D(), v28.V4S());
1803  __ ssubw2(v31.V4S(), v11.V4S(), v15.V8H());
1804  __ ssubw2(v4.V8H(), v8.V8H(), v16.V16B());
1805  __ st1(v18.V16B(), v19.V16B(), v20.V16B(), v21.V16B(), MemOperand(x0));
1806  __ st1(v10.V16B(),
1807         v11.V16B(),
1808         v12.V16B(),
1809         v13.V16B(),
1810         MemOperand(x1, x2, PostIndex));
1811  __ st1(v27.V16B(),
1812         v28.V16B(),
1813         v29.V16B(),
1814         v30.V16B(),
1815         MemOperand(x1, 64, PostIndex));
1816  __ st1(v16.V16B(), v17.V16B(), v18.V16B(), MemOperand(x0));
1817  __ st1(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
1818  __ st1(v9.V16B(), v10.V16B(), v11.V16B(), MemOperand(x1, 48, PostIndex));
1819  __ st1(v7.V16B(), v8.V16B(), MemOperand(x0));
1820  __ st1(v26.V16B(), v27.V16B(), MemOperand(x1, x2, PostIndex));
1821  __ st1(v22.V16B(), v23.V16B(), MemOperand(x1, 32, PostIndex));
1822  __ st1(v23.V16B(), MemOperand(x0));
1823  __ st1(v28.V16B(), MemOperand(x1, x2, PostIndex));
1824  __ st1(v2.V16B(), MemOperand(x1, 16, PostIndex));
1825  __ st1(v29.V1D(), v30.V1D(), v31.V1D(), v0.V1D(), MemOperand(x0));
1826  __ st1(v12.V1D(),
1827         v13.V1D(),
1828         v14.V1D(),
1829         v15.V1D(),
1830         MemOperand(x1, x2, PostIndex));
1831  __ st1(v30.V1D(),
1832         v31.V1D(),
1833         v0.V1D(),
1834         v1.V1D(),
1835         MemOperand(x1, 32, PostIndex));
1836  __ st1(v16.V1D(), v17.V1D(), v18.V1D(), MemOperand(x0));
1837  __ st1(v3.V1D(), v4.V1D(), v5.V1D(), MemOperand(x1, x2, PostIndex));
1838  __ st1(v14.V1D(), v15.V1D(), v16.V1D(), MemOperand(x1, 24, PostIndex));
1839  __ st1(v18.V1D(), v19.V1D(), MemOperand(x0));
1840  __ st1(v5.V1D(), v6.V1D(), MemOperand(x1, x2, PostIndex));
1841  __ st1(v2.V1D(), v3.V1D(), MemOperand(x1, 16, PostIndex));
1842  __ st1(v4.V1D(), MemOperand(x0));
1843  __ st1(v27.V1D(), MemOperand(x1, x2, PostIndex));
1844  __ st1(v23.V1D(), MemOperand(x1, 8, PostIndex));
1845  __ st1(v2.V2D(), v3.V2D(), v4.V2D(), v5.V2D(), MemOperand(x0));
1846  __ st1(v22.V2D(),
1847         v23.V2D(),
1848         v24.V2D(),
1849         v25.V2D(),
1850         MemOperand(x1, x2, PostIndex));
1851  __ st1(v28.V2D(),
1852         v29.V2D(),
1853         v30.V2D(),
1854         v31.V2D(),
1855         MemOperand(x1, 64, PostIndex));
1856  __ st1(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
1857  __ st1(v16.V2D(), v17.V2D(), v18.V2D(), MemOperand(x1, x2, PostIndex));
1858  __ st1(v22.V2D(), v23.V2D(), v24.V2D(), MemOperand(x1, 48, PostIndex));
1859  __ st1(v21.V2D(), v22.V2D(), MemOperand(x0));
1860  __ st1(v6.V2D(), v7.V2D(), MemOperand(x1, x2, PostIndex));
1861  __ st1(v27.V2D(), v28.V2D(), MemOperand(x1, 32, PostIndex));
1862  __ st1(v21.V2D(), MemOperand(x0));
1863  __ st1(v29.V2D(), MemOperand(x1, x2, PostIndex));
1864  __ st1(v20.V2D(), MemOperand(x1, 16, PostIndex));
1865  __ st1(v22.V2S(), v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x0));
1866  __ st1(v8.V2S(),
1867         v9.V2S(),
1868         v10.V2S(),
1869         v11.V2S(),
1870         MemOperand(x1, x2, PostIndex));
1871  __ st1(v15.V2S(),
1872         v16.V2S(),
1873         v17.V2S(),
1874         v18.V2S(),
1875         MemOperand(x1, 32, PostIndex));
1876  __ st1(v2.V2S(), v3.V2S(), v4.V2S(), MemOperand(x0));
1877  __ st1(v23.V2S(), v24.V2S(), v25.V2S(), MemOperand(x1, x2, PostIndex));
1878  __ st1(v7.V2S(), v8.V2S(), v9.V2S(), MemOperand(x1, 24, PostIndex));
1879  __ st1(v28.V2S(), v29.V2S(), MemOperand(x0));
1880  __ st1(v29.V2S(), v30.V2S(), MemOperand(x1, x2, PostIndex));
1881  __ st1(v23.V2S(), v24.V2S(), MemOperand(x1, 16, PostIndex));
1882  __ st1(v6.V2S(), MemOperand(x0));
1883  __ st1(v11.V2S(), MemOperand(x1, x2, PostIndex));
1884  __ st1(v17.V2S(), MemOperand(x1, 8, PostIndex));
1885  __ st1(v6.V4H(), v7.V4H(), v8.V4H(), v9.V4H(), MemOperand(x0));
1886  __ st1(v9.V4H(),
1887         v10.V4H(),
1888         v11.V4H(),
1889         v12.V4H(),
1890         MemOperand(x1, x2, PostIndex));
1891  __ st1(v25.V4H(),
1892         v26.V4H(),
1893         v27.V4H(),
1894         v28.V4H(),
1895         MemOperand(x1, 32, PostIndex));
1896  __ st1(v11.V4H(), v12.V4H(), v13.V4H(), MemOperand(x0));
1897  __ st1(v10.V4H(), v11.V4H(), v12.V4H(), MemOperand(x1, x2, PostIndex));
1898  __ st1(v12.V4H(), v13.V4H(), v14.V4H(), MemOperand(x1, 24, PostIndex));
1899  __ st1(v13.V4H(), v14.V4H(), MemOperand(x0));
1900  __ st1(v15.V4H(), v16.V4H(), MemOperand(x1, x2, PostIndex));
1901  __ st1(v21.V4H(), v22.V4H(), MemOperand(x1, 16, PostIndex));
1902  __ st1(v16.V4H(), MemOperand(x0));
1903  __ st1(v8.V4H(), MemOperand(x1, x2, PostIndex));
1904  __ st1(v30.V4H(), MemOperand(x1, 8, PostIndex));
1905  __ st1(v3.V4S(), v4.V4S(), v5.V4S(), v6.V4S(), MemOperand(x0));
1906  __ st1(v25.V4S(),
1907         v26.V4S(),
1908         v27.V4S(),
1909         v28.V4S(),
1910         MemOperand(x1, x2, PostIndex));
1911  __ st1(v5.V4S(), v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 64, PostIndex));
1912  __ st1(v31.V4S(), v0.V4S(), v1.V4S(), MemOperand(x0));
1913  __ st1(v30.V4S(), v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1914  __ st1(v6.V4S(), v7.V4S(), v8.V4S(), MemOperand(x1, 48, PostIndex));
1915  __ st1(v17.V4S(), v18.V4S(), MemOperand(x0));
1916  __ st1(v31.V4S(), v0.V4S(), MemOperand(x1, x2, PostIndex));
1917  __ st1(v1.V4S(), v2.V4S(), MemOperand(x1, 32, PostIndex));
1918  __ st1(v26.V4S(), MemOperand(x0));
1919  __ st1(v15.V4S(), MemOperand(x1, x2, PostIndex));
1920  __ st1(v13.V4S(), MemOperand(x1, 16, PostIndex));
1921  __ st1(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
1922  __ st1(v10.V8B(),
1923         v11.V8B(),
1924         v12.V8B(),
1925         v13.V8B(),
1926         MemOperand(x1, x2, PostIndex));
1927  __ st1(v15.V8B(),
1928         v16.V8B(),
1929         v17.V8B(),
1930         v18.V8B(),
1931         MemOperand(x1, 32, PostIndex));
1932  __ st1(v19.V8B(), v20.V8B(), v21.V8B(), MemOperand(x0));
1933  __ st1(v31.V8B(), v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1934  __ st1(v9.V8B(), v10.V8B(), v11.V8B(), MemOperand(x1, 24, PostIndex));
1935  __ st1(v12.V8B(), v13.V8B(), MemOperand(x0));
1936  __ st1(v2.V8B(), v3.V8B(), MemOperand(x1, x2, PostIndex));
1937  __ st1(v0.V8B(), v1.V8B(), MemOperand(x1, 16, PostIndex));
1938  __ st1(v16.V8B(), MemOperand(x0));
1939  __ st1(v25.V8B(), MemOperand(x1, x2, PostIndex));
1940  __ st1(v31.V8B(), MemOperand(x1, 8, PostIndex));
1941  __ st1(v4.V8H(), v5.V8H(), v6.V8H(), v7.V8H(), MemOperand(x0));
1942  __ st1(v3.V8H(), v4.V8H(), v5.V8H(), v6.V8H(), MemOperand(x1, x2, PostIndex));
1943  __ st1(v26.V8H(),
1944         v27.V8H(),
1945         v28.V8H(),
1946         v29.V8H(),
1947         MemOperand(x1, 64, PostIndex));
1948  __ st1(v10.V8H(), v11.V8H(), v12.V8H(), MemOperand(x0));
1949  __ st1(v21.V8H(), v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1950  __ st1(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
1951  __ st1(v26.V8H(), v27.V8H(), MemOperand(x0));
1952  __ st1(v24.V8H(), v25.V8H(), MemOperand(x1, x2, PostIndex));
1953  __ st1(v17.V8H(), v18.V8H(), MemOperand(x1, 32, PostIndex));
1954  __ st1(v29.V8H(), MemOperand(x0));
1955  __ st1(v19.V8H(), MemOperand(x1, x2, PostIndex));
1956  __ st1(v23.V8H(), MemOperand(x1, 16, PostIndex));
1957  __ st1(v19.B(), 15, MemOperand(x0));
1958  __ st1(v25.B(), 9, MemOperand(x1, x2, PostIndex));
1959  __ st1(v4.B(), 8, MemOperand(x1, 1, PostIndex));
1960  __ st1(v13.D(), 0, MemOperand(x0));
1961  __ st1(v30.D(), 0, MemOperand(x1, x2, PostIndex));
1962  __ st1(v3.D(), 0, MemOperand(x1, 8, PostIndex));
1963  __ st1(v22.H(), 0, MemOperand(x0));
1964  __ st1(v31.H(), 7, MemOperand(x1, x2, PostIndex));
1965  __ st1(v23.H(), 3, MemOperand(x1, 2, PostIndex));
1966  __ st1(v0.S(), 0, MemOperand(x0));
1967  __ st1(v11.S(), 3, MemOperand(x1, x2, PostIndex));
1968  __ st1(v24.S(), 3, MemOperand(x1, 4, PostIndex));
1969  __ st2(v7.V16B(), v8.V16B(), MemOperand(x0));
1970  __ st2(v5.V16B(), v6.V16B(), MemOperand(x1, x2, PostIndex));
1971  __ st2(v18.V16B(), v19.V16B(), MemOperand(x1, 32, PostIndex));
1972  __ st2(v14.V2D(), v15.V2D(), MemOperand(x0));
1973  __ st2(v7.V2D(), v8.V2D(), MemOperand(x1, x2, PostIndex));
1974  __ st2(v24.V2D(), v25.V2D(), MemOperand(x1, 32, PostIndex));
1975  __ st2(v22.V2S(), v23.V2S(), MemOperand(x0));
1976  __ st2(v4.V2S(), v5.V2S(), MemOperand(x1, x2, PostIndex));
1977  __ st2(v2.V2S(), v3.V2S(), MemOperand(x1, 16, PostIndex));
1978  __ st2(v23.V4H(), v24.V4H(), MemOperand(x0));
1979  __ st2(v8.V4H(), v9.V4H(), MemOperand(x1, x2, PostIndex));
1980  __ st2(v7.V4H(), v8.V4H(), MemOperand(x1, 16, PostIndex));
1981  __ st2(v17.V4S(), v18.V4S(), MemOperand(x0));
1982  __ st2(v6.V4S(), v7.V4S(), MemOperand(x1, x2, PostIndex));
1983  __ st2(v26.V4S(), v27.V4S(), MemOperand(x1, 32, PostIndex));
1984  __ st2(v31.V8B(), v0.V8B(), MemOperand(x0));
1985  __ st2(v0.V8B(), v1.V8B(), MemOperand(x1, x2, PostIndex));
1986  __ st2(v21.V8B(), v22.V8B(), MemOperand(x1, 16, PostIndex));
1987  __ st2(v7.V8H(), v8.V8H(), MemOperand(x0));
1988  __ st2(v22.V8H(), v23.V8H(), MemOperand(x1, x2, PostIndex));
1989  __ st2(v4.V8H(), v5.V8H(), MemOperand(x1, 32, PostIndex));
1990  __ st2(v8.B(), v9.B(), 15, MemOperand(x0));
1991  __ st2(v8.B(), v9.B(), 15, MemOperand(x1, x2, PostIndex));
1992  __ st2(v7.B(), v8.B(), 4, MemOperand(x1, 2, PostIndex));
1993  __ st2(v25.D(), v26.D(), 0, MemOperand(x0));
1994  __ st2(v17.D(), v18.D(), 1, MemOperand(x1, x2, PostIndex));
1995  __ st2(v3.D(), v4.D(), 1, MemOperand(x1, 16, PostIndex));
1996  __ st2(v4.H(), v5.H(), 3, MemOperand(x0));
1997  __ st2(v0.H(), v1.H(), 5, MemOperand(x1, x2, PostIndex));
1998  __ st2(v22.H(), v23.H(), 2, MemOperand(x1, 4, PostIndex));
1999  __ st2(v14.S(), v15.S(), 3, MemOperand(x0));
2000  __ st2(v23.S(), v24.S(), 3, MemOperand(x1, x2, PostIndex));
2001  __ st2(v0.S(), v1.S(), 2, MemOperand(x1, 8, PostIndex));
2002  __ st3(v26.V16B(), v27.V16B(), v28.V16B(), MemOperand(x0));
2003  __ st3(v21.V16B(), v22.V16B(), v23.V16B(), MemOperand(x1, x2, PostIndex));
2004  __ st3(v24.V16B(), v25.V16B(), v26.V16B(), MemOperand(x1, 48, PostIndex));
2005  __ st3(v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2006  __ st3(v23.V2D(), v24.V2D(), v25.V2D(), MemOperand(x1, x2, PostIndex));
2007  __ st3(v10.V2D(), v11.V2D(), v12.V2D(), MemOperand(x1, 48, PostIndex));
2008  __ st3(v9.V2S(), v10.V2S(), v11.V2S(), MemOperand(x0));
2009  __ st3(v13.V2S(), v14.V2S(), v15.V2S(), MemOperand(x1, x2, PostIndex));
2010  __ st3(v22.V2S(), v23.V2S(), v24.V2S(), MemOperand(x1, 24, PostIndex));
2011  __ st3(v31.V4H(), v0.V4H(), v1.V4H(), MemOperand(x0));
2012  __ st3(v8.V4H(), v9.V4H(), v10.V4H(), MemOperand(x1, x2, PostIndex));
2013  __ st3(v19.V4H(), v20.V4H(), v21.V4H(), MemOperand(x1, 24, PostIndex));
2014  __ st3(v18.V4S(), v19.V4S(), v20.V4S(), MemOperand(x0));
2015  __ st3(v25.V4S(), v26.V4S(), v27.V4S(), MemOperand(x1, x2, PostIndex));
2016  __ st3(v16.V4S(), v17.V4S(), v18.V4S(), MemOperand(x1, 48, PostIndex));
2017  __ st3(v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2018  __ st3(v29.V8B(), v30.V8B(), v31.V8B(), MemOperand(x1, x2, PostIndex));
2019  __ st3(v30.V8B(), v31.V8B(), v0.V8B(), MemOperand(x1, 24, PostIndex));
2020  __ st3(v8.V8H(), v9.V8H(), v10.V8H(), MemOperand(x0));
2021  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, x2, PostIndex));
2022  __ st3(v18.V8H(), v19.V8H(), v20.V8H(), MemOperand(x1, 48, PostIndex));
2023  __ st3(v31.B(), v0.B(), v1.B(), 10, MemOperand(x0));
2024  __ st3(v4.B(), v5.B(), v6.B(), 5, MemOperand(x1, x2, PostIndex));
2025  __ st3(v5.B(), v6.B(), v7.B(), 1, MemOperand(x1, 3, PostIndex));
2026  __ st3(v5.D(), v6.D(), v7.D(), 0, MemOperand(x0));
2027  __ st3(v6.D(), v7.D(), v8.D(), 0, MemOperand(x1, x2, PostIndex));
2028  __ st3(v0.D(), v1.D(), v2.D(), 0, MemOperand(x1, 24, PostIndex));
2029  __ st3(v31.H(), v0.H(), v1.H(), 2, MemOperand(x0));
2030  __ st3(v14.H(), v15.H(), v16.H(), 5, MemOperand(x1, x2, PostIndex));
2031  __ st3(v21.H(), v22.H(), v23.H(), 6, MemOperand(x1, 6, PostIndex));
2032  __ st3(v21.S(), v22.S(), v23.S(), 0, MemOperand(x0));
2033  __ st3(v11.S(), v12.S(), v13.S(), 1, MemOperand(x1, x2, PostIndex));
2034  __ st3(v15.S(), v16.S(), v17.S(), 0, MemOperand(x1, 12, PostIndex));
2035  __ st4(v22.V16B(), v23.V16B(), v24.V16B(), v25.V16B(), MemOperand(x0));
2036  __ st4(v24.V16B(),
2037         v25.V16B(),
2038         v26.V16B(),
2039         v27.V16B(),
2040         MemOperand(x1, x2, PostIndex));
2041  __ st4(v15.V16B(),
2042         v16.V16B(),
2043         v17.V16B(),
2044         v18.V16B(),
2045         MemOperand(x1, 64, PostIndex));
2046  __ st4(v16.V2D(), v17.V2D(), v18.V2D(), v19.V2D(), MemOperand(x0));
2047  __ st4(v17.V2D(),
2048         v18.V2D(),
2049         v19.V2D(),
2050         v20.V2D(),
2051         MemOperand(x1, x2, PostIndex));
2052  __ st4(v9.V2D(),
2053         v10.V2D(),
2054         v11.V2D(),
2055         v12.V2D(),
2056         MemOperand(x1, 64, PostIndex));
2057  __ st4(v23.V2S(), v24.V2S(), v25.V2S(), v26.V2S(), MemOperand(x0));
2058  __ st4(v15.V2S(),
2059         v16.V2S(),
2060         v17.V2S(),
2061         v18.V2S(),
2062         MemOperand(x1, x2, PostIndex));
2063  __ st4(v24.V2S(),
2064         v25.V2S(),
2065         v26.V2S(),
2066         v27.V2S(),
2067         MemOperand(x1, 32, PostIndex));
2068  __ st4(v14.V4H(), v15.V4H(), v16.V4H(), v17.V4H(), MemOperand(x0));
2069  __ st4(v18.V4H(),
2070         v19.V4H(),
2071         v20.V4H(),
2072         v21.V4H(),
2073         MemOperand(x1, x2, PostIndex));
2074  __ st4(v1.V4H(), v2.V4H(), v3.V4H(), v4.V4H(), MemOperand(x1, 32, PostIndex));
2075  __ st4(v13.V4S(), v14.V4S(), v15.V4S(), v16.V4S(), MemOperand(x0));
2076  __ st4(v6.V4S(), v7.V4S(), v8.V4S(), v9.V4S(), MemOperand(x1, x2, PostIndex));
2077  __ st4(v15.V4S(),
2078         v16.V4S(),
2079         v17.V4S(),
2080         v18.V4S(),
2081         MemOperand(x1, 64, PostIndex));
2082  __ st4(v26.V8B(), v27.V8B(), v28.V8B(), v29.V8B(), MemOperand(x0));
2083  __ st4(v25.V8B(),
2084         v26.V8B(),
2085         v27.V8B(),
2086         v28.V8B(),
2087         MemOperand(x1, x2, PostIndex));
2088  __ st4(v19.V8B(),
2089         v20.V8B(),
2090         v21.V8B(),
2091         v22.V8B(),
2092         MemOperand(x1, 32, PostIndex));
2093  __ st4(v19.V8H(), v20.V8H(), v21.V8H(), v22.V8H(), MemOperand(x0));
2094  __ st4(v15.V8H(),
2095         v16.V8H(),
2096         v17.V8H(),
2097         v18.V8H(),
2098         MemOperand(x1, x2, PostIndex));
2099  __ st4(v31.V8H(),
2100         v0.V8H(),
2101         v1.V8H(),
2102         v2.V8H(),
2103         MemOperand(x1, 64, PostIndex));
2104  __ st4(v0.B(), v1.B(), v2.B(), v3.B(), 13, MemOperand(x0));
2105  __ st4(v4.B(), v5.B(), v6.B(), v7.B(), 10, MemOperand(x1, x2, PostIndex));
2106  __ st4(v9.B(), v10.B(), v11.B(), v12.B(), 9, MemOperand(x1, 4, PostIndex));
2107  __ st4(v2.D(), v3.D(), v4.D(), v5.D(), 1, MemOperand(x0));
2108  __ st4(v7.D(), v8.D(), v9.D(), v10.D(), 0, MemOperand(x1, x2, PostIndex));
2109  __ st4(v31.D(), v0.D(), v1.D(), v2.D(), 1, MemOperand(x1, 32, PostIndex));
2110  __ st4(v2.H(), v3.H(), v4.H(), v5.H(), 1, MemOperand(x0));
2111  __ st4(v27.H(), v28.H(), v29.H(), v30.H(), 3, MemOperand(x1, x2, PostIndex));
2112  __ st4(v24.H(), v25.H(), v26.H(), v27.H(), 4, MemOperand(x1, 8, PostIndex));
2113  __ st4(v18.S(), v19.S(), v20.S(), v21.S(), 2, MemOperand(x0));
2114  __ st4(v6.S(), v7.S(), v8.S(), v9.S(), 2, MemOperand(x1, x2, PostIndex));
2115  __ st4(v25.S(), v26.S(), v27.S(), v28.S(), 1, MemOperand(x1, 16, PostIndex));
2116  __ sub(d12, d17, d2);
2117  __ sub(v20.V16B(), v24.V16B(), v8.V16B());
2118  __ sub(v8.V2D(), v29.V2D(), v5.V2D());
2119  __ sub(v2.V2S(), v28.V2S(), v24.V2S());
2120  __ sub(v24.V4H(), v10.V4H(), v4.V4H());
2121  __ sub(v28.V4S(), v4.V4S(), v17.V4S());
2122  __ sub(v16.V8B(), v27.V8B(), v2.V8B());
2123  __ sub(v20.V8H(), v10.V8H(), v13.V8H());
2124  __ subhn(v5.V2S(), v14.V2D(), v13.V2D());
2125  __ subhn(v10.V4H(), v5.V4S(), v8.V4S());
2126  __ subhn(v6.V8B(), v10.V8H(), v22.V8H());
2127  __ subhn2(v11.V16B(), v6.V8H(), v9.V8H());
2128  __ subhn2(v25.V4S(), v18.V2D(), v24.V2D());
2129  __ subhn2(v20.V8H(), v21.V4S(), v1.V4S());
2130  __ suqadd(b25, b11);
2131  __ suqadd(d13, d1);
2132  __ suqadd(h0, h9);
2133  __ suqadd(s22, s8);
2134  __ suqadd(v24.V16B(), v27.V16B());
2135  __ suqadd(v26.V2D(), v14.V2D());
2136  __ suqadd(v7.V2S(), v10.V2S());
2137  __ suqadd(v25.V4H(), v12.V4H());
2138  __ suqadd(v4.V4S(), v3.V4S());
2139  __ suqadd(v14.V8B(), v18.V8B());
2140  __ suqadd(v31.V8H(), v8.V8H());
2141  __ sxtl(v16.V2D(), v20.V2S());
2142  __ sxtl(v27.V4S(), v28.V4H());
2143  __ sxtl(v0.V8H(), v22.V8B());
2144  __ sxtl2(v6.V2D(), v7.V4S());
2145  __ sxtl2(v9.V4S(), v27.V8H());
2146  __ sxtl2(v16.V8H(), v16.V16B());
2147  __ tbl(v25.V16B(),
2148         v17.V16B(),
2149         v18.V16B(),
2150         v19.V16B(),
2151         v20.V16B(),
2152         v22.V16B());
2153  __ tbl(v28.V16B(), v13.V16B(), v14.V16B(), v15.V16B(), v4.V16B());
2154  __ tbl(v3.V16B(), v0.V16B(), v1.V16B(), v2.V16B());
2155  __ tbl(v20.V16B(), v15.V16B(), v4.V16B());
2156  __ tbl(v7.V8B(), v23.V16B(), v24.V16B(), v25.V16B(), v26.V16B(), v20.V8B());
2157  __ tbl(v8.V8B(), v1.V16B(), v2.V16B(), v3.V16B(), v31.V8B());
2158  __ tbl(v8.V8B(), v25.V16B(), v26.V16B(), v16.V8B());
2159  __ tbl(v11.V8B(), v19.V16B(), v30.V8B());
2160  __ tbx(v25.V16B(), v25.V16B(), v26.V16B(), v27.V16B(), v28.V16B(), v5.V16B());
2161  __ tbx(v21.V16B(), v29.V16B(), v30.V16B(), v31.V16B(), v24.V16B());
2162  __ tbx(v6.V16B(), v16.V16B(), v17.V16B(), v1.V16B());
2163  __ tbx(v13.V16B(), v3.V16B(), v20.V16B());
2164  __ tbx(v24.V8B(), v29.V16B(), v30.V16B(), v31.V16B(), v0.V16B(), v9.V8B());
2165  __ tbx(v17.V8B(), v9.V16B(), v10.V16B(), v11.V16B(), v26.V8B());
2166  __ tbx(v5.V8B(), v3.V16B(), v4.V16B(), v21.V8B());
2167  __ tbx(v16.V8B(), v11.V16B(), v29.V8B());
2168  __ trn1(v19.V16B(), v24.V16B(), v12.V16B());
2169  __ trn1(v2.V2D(), v7.V2D(), v10.V2D());
2170  __ trn1(v22.V2S(), v0.V2S(), v21.V2S());
2171  __ trn1(v12.V4H(), v15.V4H(), v20.V4H());
2172  __ trn1(v30.V4S(), v17.V4S(), v9.V4S());
2173  __ trn1(v12.V8B(), v19.V8B(), v29.V8B());
2174  __ trn1(v23.V8H(), v8.V8H(), v9.V8H());
2175  __ trn2(v28.V16B(), v30.V16B(), v25.V16B());
2176  __ trn2(v7.V2D(), v27.V2D(), v7.V2D());
2177  __ trn2(v30.V2S(), v16.V2S(), v19.V2S());
2178  __ trn2(v24.V4H(), v6.V4H(), v25.V4H());
2179  __ trn2(v2.V4S(), v19.V4S(), v11.V4S());
2180  __ trn2(v25.V8B(), v27.V8B(), v18.V8B());
2181  __ trn2(v12.V8H(), v4.V8H(), v15.V8H());
2182  __ uaba(v31.V16B(), v12.V16B(), v28.V16B());
2183  __ uaba(v18.V2S(), v5.V2S(), v14.V2S());
2184  __ uaba(v9.V4H(), v20.V4H(), v21.V4H());
2185  __ uaba(v6.V4S(), v20.V4S(), v2.V4S());
2186  __ uaba(v16.V8B(), v12.V8B(), v5.V8B());
2187  __ uaba(v15.V8H(), v26.V8H(), v30.V8H());
2188  __ uabal(v10.V2D(), v18.V2S(), v15.V2S());
2189  __ uabal(v30.V4S(), v19.V4H(), v7.V4H());
2190  __ uabal(v4.V8H(), v27.V8B(), v0.V8B());
2191  __ uabal2(v19.V2D(), v12.V4S(), v2.V4S());
2192  __ uabal2(v26.V4S(), v5.V8H(), v12.V8H());
2193  __ uabal2(v19.V8H(), v20.V16B(), v28.V16B());
2194  __ uabd(v18.V16B(), v4.V16B(), v21.V16B());
2195  __ uabd(v30.V2S(), v21.V2S(), v16.V2S());
2196  __ uabd(v8.V4H(), v28.V4H(), v25.V4H());
2197  __ uabd(v28.V4S(), v12.V4S(), v21.V4S());
2198  __ uabd(v19.V8B(), v16.V8B(), v28.V8B());
2199  __ uabd(v9.V8H(), v12.V8H(), v29.V8H());
2200  __ uabdl(v26.V2D(), v0.V2S(), v8.V2S());
2201  __ uabdl(v29.V4S(), v31.V4H(), v25.V4H());
2202  __ uabdl(v27.V8H(), v29.V8B(), v14.V8B());
2203  __ uabdl2(v20.V2D(), v20.V4S(), v8.V4S());
2204  __ uabdl2(v22.V4S(), v15.V8H(), v18.V8H());
2205  __ uabdl2(v9.V8H(), v18.V16B(), v23.V16B());
2206  __ uadalp(v9.V1D(), v15.V2S());
2207  __ uadalp(v14.V2D(), v12.V4S());
2208  __ uadalp(v28.V2S(), v12.V4H());
2209  __ uadalp(v0.V4H(), v17.V8B());
2210  __ uadalp(v1.V4S(), v29.V8H());
2211  __ uadalp(v15.V8H(), v22.V16B());
2212  __ uaddl(v1.V2D(), v20.V2S(), v27.V2S());
2213  __ uaddl(v31.V4S(), v25.V4H(), v5.V4H());
2214  __ uaddl(v12.V8H(), v3.V8B(), v3.V8B());
2215  __ uaddl2(v5.V2D(), v23.V4S(), v6.V4S());
2216  __ uaddl2(v1.V4S(), v5.V8H(), v25.V8H());
2217  __ uaddl2(v22.V8H(), v30.V16B(), v28.V16B());
2218  __ uaddlp(v7.V1D(), v9.V2S());
2219  __ uaddlp(v26.V2D(), v4.V4S());
2220  __ uaddlp(v28.V2S(), v1.V4H());
2221  __ uaddlp(v20.V4H(), v31.V8B());
2222  __ uaddlp(v16.V4S(), v17.V8H());
2223  __ uaddlp(v6.V8H(), v2.V16B());
2224  __ uaddlv(d28, v22.V4S());
2225  __ uaddlv(h0, v19.V16B());
2226  __ uaddlv(h30, v30.V8B());
2227  __ uaddlv(s24, v18.V4H());
2228  __ uaddlv(s10, v0.V8H());
2229  __ uaddw(v9.V2D(), v17.V2D(), v14.V2S());
2230  __ uaddw(v9.V4S(), v25.V4S(), v3.V4H());
2231  __ uaddw(v18.V8H(), v1.V8H(), v0.V8B());
2232  __ uaddw2(v18.V2D(), v5.V2D(), v6.V4S());
2233  __ uaddw2(v17.V4S(), v15.V4S(), v11.V8H());
2234  __ uaddw2(v29.V8H(), v11.V8H(), v7.V16B());
2235  __ uhadd(v13.V16B(), v9.V16B(), v3.V16B());
2236  __ uhadd(v17.V2S(), v25.V2S(), v24.V2S());
2237  __ uhadd(v25.V4H(), v23.V4H(), v13.V4H());
2238  __ uhadd(v0.V4S(), v20.V4S(), v16.V4S());
2239  __ uhadd(v5.V8B(), v5.V8B(), v25.V8B());
2240  __ uhadd(v3.V8H(), v29.V8H(), v18.V8H());
2241  __ uhsub(v1.V16B(), v22.V16B(), v13.V16B());
2242  __ uhsub(v14.V2S(), v30.V2S(), v30.V2S());
2243  __ uhsub(v29.V4H(), v14.V4H(), v17.V4H());
2244  __ uhsub(v26.V4S(), v5.V4S(), v18.V4S());
2245  __ uhsub(v3.V8B(), v7.V8B(), v12.V8B());
2246  __ uhsub(v25.V8H(), v21.V8H(), v5.V8H());
2247  __ umax(v28.V16B(), v12.V16B(), v6.V16B());
2248  __ umax(v20.V2S(), v19.V2S(), v26.V2S());
2249  __ umax(v0.V4H(), v31.V4H(), v18.V4H());
2250  __ umax(v6.V4S(), v21.V4S(), v28.V4S());
2251  __ umax(v0.V8B(), v2.V8B(), v20.V8B());
2252  __ umax(v4.V8H(), v11.V8H(), v22.V8H());
2253  __ umaxp(v1.V16B(), v6.V16B(), v29.V16B());
2254  __ umaxp(v19.V2S(), v17.V2S(), v27.V2S());
2255  __ umaxp(v21.V4H(), v16.V4H(), v7.V4H());
2256  __ umaxp(v9.V4S(), v20.V4S(), v29.V4S());
2257  __ umaxp(v13.V8B(), v1.V8B(), v16.V8B());
2258  __ umaxp(v19.V8H(), v23.V8H(), v26.V8H());
2259  __ umaxv(b17, v30.V16B());
2260  __ umaxv(b23, v12.V8B());
2261  __ umaxv(h31, v15.V4H());
2262  __ umaxv(h15, v25.V8H());
2263  __ umaxv(s18, v21.V4S());
2264  __ umin(v22.V16B(), v0.V16B(), v18.V16B());
2265  __ umin(v1.V2S(), v21.V2S(), v16.V2S());
2266  __ umin(v17.V4H(), v4.V4H(), v25.V4H());
2267  __ umin(v24.V4S(), v26.V4S(), v13.V4S());
2268  __ umin(v20.V8B(), v1.V8B(), v5.V8B());
2269  __ umin(v26.V8H(), v25.V8H(), v23.V8H());
2270  __ uminp(v5.V16B(), v1.V16B(), v23.V16B());
2271  __ uminp(v7.V2S(), v26.V2S(), v30.V2S());
2272  __ uminp(v9.V4H(), v5.V4H(), v25.V4H());
2273  __ uminp(v23.V4S(), v10.V4S(), v1.V4S());
2274  __ uminp(v4.V8B(), v29.V8B(), v14.V8B());
2275  __ uminp(v21.V8H(), v0.V8H(), v14.V8H());
2276  __ uminv(b0, v17.V16B());
2277  __ uminv(b0, v31.V8B());
2278  __ uminv(h24, v0.V4H());
2279  __ uminv(h29, v14.V8H());
2280  __ uminv(s30, v3.V4S());
2281  __ umlal(v11.V2D(), v11.V2S(), v24.V2S());
2282  __ umlal(v30.V2D(), v16.V2S(), v11.S(), 3);
2283  __ umlal(v0.V4S(), v9.V4H(), v26.V4H());
2284  __ umlal(v20.V4S(), v24.V4H(), v12.H(), 4);
2285  __ umlal(v16.V8H(), v21.V8B(), v6.V8B());
2286  __ umlal2(v17.V2D(), v19.V4S(), v23.V4S());
2287  __ umlal2(v5.V2D(), v30.V4S(), v8.S(), 0);
2288  __ umlal2(v16.V4S(), v8.V8H(), v15.V8H());
2289  __ umlal2(v15.V4S(), v26.V8H(), v1.H(), 5);
2290  __ umlal2(v30.V8H(), v1.V16B(), v17.V16B());
2291  __ umlsl(v18.V2D(), v19.V2S(), v28.V2S());
2292  __ umlsl(v7.V2D(), v7.V2S(), v8.S(), 0);
2293  __ umlsl(v24.V4S(), v8.V4H(), v4.V4H());
2294  __ umlsl(v18.V4S(), v22.V4H(), v12.H(), 4);
2295  __ umlsl(v28.V8H(), v14.V8B(), v20.V8B());
2296  __ umlsl2(v11.V2D(), v0.V4S(), v9.V4S());
2297  __ umlsl2(v26.V2D(), v16.V4S(), v9.S(), 2);
2298  __ umlsl2(v3.V4S(), v11.V8H(), v9.V8H());
2299  __ umlsl2(v10.V4S(), v25.V8H(), v9.H(), 4);
2300  __ umlsl2(v24.V8H(), v16.V16B(), v28.V16B());
2301  __ umov(x30, v25.D(), 1);
2302  __ umull(v12.V2D(), v10.V2S(), v29.V2S());
2303  __ umull(v22.V2D(), v30.V2S(), v5.S(), 3);
2304  __ umull(v7.V4S(), v0.V4H(), v25.V4H());
2305  __ umull(v11.V4S(), v13.V4H(), v3.H(), 2);
2306  __ umull(v25.V8H(), v16.V8B(), v10.V8B());
2307  __ umull2(v17.V2D(), v3.V4S(), v26.V4S());
2308  __ umull2(v26.V2D(), v11.V4S(), v2.S(), 3);
2309  __ umull2(v12.V4S(), v17.V8H(), v23.V8H());
2310  __ umull2(v4.V4S(), v31.V8H(), v1.H(), 2);
2311  __ umull2(v5.V8H(), v12.V16B(), v17.V16B());
2312  __ uqadd(b30, b4, b28);
2313  __ uqadd(d27, d20, d16);
2314  __ uqadd(h7, h14, h28);
2315  __ uqadd(s28, s17, s4);
2316  __ uqadd(v19.V16B(), v22.V16B(), v21.V16B());
2317  __ uqadd(v16.V2D(), v4.V2D(), v11.V2D());
2318  __ uqadd(v20.V2S(), v14.V2S(), v4.V2S());
2319  __ uqadd(v5.V4H(), v0.V4H(), v16.V4H());
2320  __ uqadd(v21.V4S(), v31.V4S(), v9.V4S());
2321  __ uqadd(v23.V8B(), v24.V8B(), v3.V8B());
2322  __ uqadd(v17.V8H(), v27.V8H(), v11.V8H());
2323  __ uqrshl(b10, b22, b10);
2324  __ uqrshl(d29, d5, d11);
2325  __ uqrshl(h27, h24, h30);
2326  __ uqrshl(s10, s13, s8);
2327  __ uqrshl(v9.V16B(), v18.V16B(), v14.V16B());
2328  __ uqrshl(v24.V2D(), v15.V2D(), v17.V2D());
2329  __ uqrshl(v4.V2S(), v14.V2S(), v27.V2S());
2330  __ uqrshl(v15.V4H(), v5.V4H(), v8.V4H());
2331  __ uqrshl(v21.V4S(), v29.V4S(), v0.V4S());
2332  __ uqrshl(v16.V8B(), v24.V8B(), v9.V8B());
2333  __ uqrshl(v2.V8H(), v0.V8H(), v15.V8H());
2334  __ uqrshrn(b11, h26, 4);
2335  __ uqrshrn(h7, s30, 5);
2336  __ uqrshrn(s10, d8, 21);
2337  __ uqrshrn(v15.V2S(), v6.V2D(), 11);
2338  __ uqrshrn(v5.V4H(), v26.V4S(), 12);
2339  __ uqrshrn(v28.V8B(), v25.V8H(), 5);
2340  __ uqrshrn2(v25.V16B(), v30.V8H(), 2);
2341  __ uqrshrn2(v21.V4S(), v14.V2D(), 32);
2342  __ uqrshrn2(v13.V8H(), v7.V4S(), 2);
2343  __ uqshl(b13, b0, b23);
2344  __ uqshl(b9, b17, 4);
2345  __ uqshl(d23, d6, d4);
2346  __ uqshl(d8, d11, 44);
2347  __ uqshl(h19, h13, h15);
2348  __ uqshl(h25, h26, 6);
2349  __ uqshl(s4, s24, s10);
2350  __ uqshl(s19, s14, 1);
2351  __ uqshl(v14.V16B(), v30.V16B(), v25.V16B());
2352  __ uqshl(v6.V16B(), v10.V16B(), 5);
2353  __ uqshl(v18.V2D(), v8.V2D(), v7.V2D());
2354  __ uqshl(v25.V2D(), v14.V2D(), 18);
2355  __ uqshl(v25.V2S(), v16.V2S(), v23.V2S());
2356  __ uqshl(v13.V2S(), v15.V2S(), 31);
2357  __ uqshl(v28.V4H(), v24.V4H(), v15.V4H());
2358  __ uqshl(v4.V4H(), v17.V4H(), 1);
2359  __ uqshl(v9.V4S(), v31.V4S(), v23.V4S());
2360  __ uqshl(v18.V4S(), v28.V4S(), 31);
2361  __ uqshl(v31.V8B(), v21.V8B(), v15.V8B());
2362  __ uqshl(v6.V8B(), v21.V8B(), 1);
2363  __ uqshl(v28.V8H(), v2.V8H(), v17.V8H());
2364  __ uqshl(v24.V8H(), v8.V8H(), 14);
2365  __ uqshrn(b21, h27, 7);
2366  __ uqshrn(h28, s26, 11);
2367  __ uqshrn(s13, d31, 17);
2368  __ uqshrn(v21.V2S(), v16.V2D(), 8);
2369  __ uqshrn(v24.V4H(), v24.V4S(), 2);
2370  __ uqshrn(v5.V8B(), v1.V8H(), 8);
2371  __ uqshrn2(v16.V16B(), v29.V8H(), 6);
2372  __ uqshrn2(v2.V4S(), v6.V2D(), 1);
2373  __ uqshrn2(v16.V8H(), v10.V4S(), 14);
2374  __ uqsub(b28, b20, b26);
2375  __ uqsub(d0, d7, d10);
2376  __ uqsub(h26, h24, h7);
2377  __ uqsub(s23, s23, s16);
2378  __ uqsub(v14.V16B(), v16.V16B(), v24.V16B());
2379  __ uqsub(v11.V2D(), v17.V2D(), v6.V2D());
2380  __ uqsub(v10.V2S(), v10.V2S(), v8.V2S());
2381  __ uqsub(v9.V4H(), v15.V4H(), v12.V4H());
2382  __ uqsub(v23.V4S(), v18.V4S(), v7.V4S());
2383  __ uqsub(v9.V8B(), v19.V8B(), v17.V8B());
2384  __ uqsub(v20.V8H(), v2.V8H(), v6.V8H());
2385  __ uqxtn(b29, h19);
2386  __ uqxtn(h0, s13);
2387  __ uqxtn(s26, d22);
2388  __ uqxtn(v5.V2S(), v31.V2D());
2389  __ uqxtn(v30.V4H(), v19.V4S());
2390  __ uqxtn(v15.V8B(), v2.V8H());
2391  __ uqxtn2(v29.V16B(), v3.V8H());
2392  __ uqxtn2(v13.V4S(), v17.V2D());
2393  __ uqxtn2(v28.V8H(), v11.V4S());
2394  __ urecpe(v23.V2S(), v15.V2S());
2395  __ urecpe(v27.V4S(), v7.V4S());
2396  __ urhadd(v2.V16B(), v15.V16B(), v27.V16B());
2397  __ urhadd(v15.V2S(), v1.V2S(), v18.V2S());
2398  __ urhadd(v17.V4H(), v4.V4H(), v26.V4H());
2399  __ urhadd(v2.V4S(), v27.V4S(), v14.V4S());
2400  __ urhadd(v5.V8B(), v17.V8B(), v14.V8B());
2401  __ urhadd(v30.V8H(), v2.V8H(), v25.V8H());
2402  __ urshl(d4, d28, d30);
2403  __ urshl(v13.V16B(), v31.V16B(), v19.V16B());
2404  __ urshl(v14.V2D(), v23.V2D(), v21.V2D());
2405  __ urshl(v10.V2S(), v7.V2S(), v8.V2S());
2406  __ urshl(v15.V4H(), v21.V4H(), v28.V4H());
2407  __ urshl(v30.V4S(), v8.V4S(), v23.V4S());
2408  __ urshl(v31.V8B(), v20.V8B(), v5.V8B());
2409  __ urshl(v30.V8H(), v27.V8H(), v30.V8H());
2410  __ urshr(d4, d13, 49);
2411  __ urshr(v2.V16B(), v20.V16B(), 1);
2412  __ urshr(v13.V2D(), v11.V2D(), 51);
2413  __ urshr(v21.V2S(), v31.V2S(), 10);
2414  __ urshr(v21.V4H(), v17.V4H(), 11);
2415  __ urshr(v4.V4S(), v22.V4S(), 1);
2416  __ urshr(v0.V8B(), v1.V8B(), 7);
2417  __ urshr(v13.V8H(), v20.V8H(), 1);
2418  __ ursqrte(v20.V2S(), v16.V2S());
2419  __ ursqrte(v28.V4S(), v8.V4S());
2420  __ ursra(d27, d16, 45);
2421  __ ursra(v18.V16B(), v17.V16B(), 3);
2422  __ ursra(v26.V2D(), v28.V2D(), 58);
2423  __ ursra(v8.V2S(), v22.V2S(), 31);
2424  __ ursra(v31.V4H(), v4.V4H(), 7);
2425  __ ursra(v31.V4S(), v15.V4S(), 2);
2426  __ ursra(v3.V8B(), v1.V8B(), 5);
2427  __ ursra(v18.V8H(), v14.V8H(), 13);
2428  __ ushl(d31, d0, d16);
2429  __ ushl(v0.V16B(), v6.V16B(), v2.V16B());
2430  __ ushl(v18.V2D(), v1.V2D(), v18.V2D());
2431  __ ushl(v27.V2S(), v7.V2S(), v29.V2S());
2432  __ ushl(v14.V4H(), v14.V4H(), v13.V4H());
2433  __ ushl(v22.V4S(), v4.V4S(), v9.V4S());
2434  __ ushl(v23.V8B(), v22.V8B(), v27.V8B());
2435  __ ushl(v21.V8H(), v25.V8H(), v8.V8H());
2436  __ ushll(v11.V2D(), v0.V2S(), 21);
2437  __ ushll(v2.V4S(), v17.V4H(), 8);
2438  __ ushll(v11.V8H(), v14.V8B(), 1);
2439  __ ushll2(v8.V2D(), v29.V4S(), 7);
2440  __ ushll2(v29.V4S(), v9.V8H(), 2);
2441  __ ushll2(v5.V8H(), v24.V16B(), 6);
2442  __ ushr(d28, d27, 53);
2443  __ ushr(v1.V16B(), v9.V16B(), 7);
2444  __ ushr(v2.V2D(), v24.V2D(), 43);
2445  __ ushr(v30.V2S(), v25.V2S(), 11);
2446  __ ushr(v10.V4H(), v26.V4H(), 12);
2447  __ ushr(v4.V4S(), v5.V4S(), 30);
2448  __ ushr(v30.V8B(), v2.V8B(), 1);
2449  __ ushr(v6.V8H(), v12.V8H(), 2);
2450  __ usqadd(b19, b5);
2451  __ usqadd(d9, d2);
2452  __ usqadd(h2, h16);
2453  __ usqadd(s16, s3);
2454  __ usqadd(v31.V16B(), v29.V16B());
2455  __ usqadd(v8.V2D(), v10.V2D());
2456  __ usqadd(v18.V2S(), v9.V2S());
2457  __ usqadd(v24.V4H(), v14.V4H());
2458  __ usqadd(v10.V4S(), v30.V4S());
2459  __ usqadd(v16.V8B(), v20.V8B());
2460  __ usqadd(v12.V8H(), v16.V8H());
2461  __ usra(d28, d27, 37);
2462  __ usra(v5.V16B(), v22.V16B(), 5);
2463  __ usra(v2.V2D(), v19.V2D(), 33);
2464  __ usra(v0.V2S(), v0.V2S(), 21);
2465  __ usra(v7.V4H(), v6.V4H(), 12);
2466  __ usra(v4.V4S(), v17.V4S(), 9);
2467  __ usra(v9.V8B(), v12.V8B(), 7);
2468  __ usra(v3.V8H(), v27.V8H(), 14);
2469  __ usubl(v29.V2D(), v12.V2S(), v30.V2S());
2470  __ usubl(v29.V4S(), v28.V4H(), v6.V4H());
2471  __ usubl(v12.V8H(), v4.V8B(), v14.V8B());
2472  __ usubl2(v1.V2D(), v24.V4S(), v17.V4S());
2473  __ usubl2(v4.V4S(), v1.V8H(), v3.V8H());
2474  __ usubl2(v23.V8H(), v4.V16B(), v7.V16B());
2475  __ usubw(v9.V2D(), v20.V2D(), v30.V2S());
2476  __ usubw(v20.V4S(), v16.V4S(), v23.V4H());
2477  __ usubw(v25.V8H(), v8.V8H(), v29.V8B());
2478  __ usubw2(v18.V2D(), v29.V2D(), v6.V4S());
2479  __ usubw2(v6.V4S(), v6.V4S(), v20.V8H());
2480  __ usubw2(v18.V8H(), v4.V8H(), v16.V16B());
2481  __ uxtl(v27.V2D(), v21.V2S());
2482  __ uxtl(v0.V4S(), v31.V4H());
2483  __ uxtl(v27.V8H(), v10.V8B());
2484  __ uxtl2(v6.V2D(), v16.V4S());
2485  __ uxtl2(v22.V4S(), v20.V8H());
2486  __ uxtl2(v20.V8H(), v21.V16B());
2487  __ uzp1(v30.V16B(), v9.V16B(), v17.V16B());
2488  __ uzp1(v7.V2D(), v26.V2D(), v28.V2D());
2489  __ uzp1(v26.V2S(), v16.V2S(), v22.V2S());
2490  __ uzp1(v14.V4H(), v19.V4H(), v6.V4H());
2491  __ uzp1(v17.V4S(), v23.V4S(), v30.V4S());
2492  __ uzp1(v28.V8B(), v27.V8B(), v13.V8B());
2493  __ uzp1(v17.V8H(), v1.V8H(), v12.V8H());
2494  __ uzp2(v8.V16B(), v18.V16B(), v26.V16B());
2495  __ uzp2(v21.V2D(), v22.V2D(), v24.V2D());
2496  __ uzp2(v20.V2S(), v21.V2S(), v2.V2S());
2497  __ uzp2(v16.V4H(), v31.V4H(), v6.V4H());
2498  __ uzp2(v25.V4S(), v11.V4S(), v8.V4S());
2499  __ uzp2(v31.V8B(), v31.V8B(), v13.V8B());
2500  __ uzp2(v8.V8H(), v17.V8H(), v1.V8H());
2501  __ xtn(v17.V2S(), v26.V2D());
2502  __ xtn(v3.V4H(), v0.V4S());
2503  __ xtn(v18.V8B(), v8.V8H());
2504  __ xtn2(v0.V16B(), v0.V8H());
2505  __ xtn2(v15.V4S(), v4.V2D());
2506  __ xtn2(v31.V8H(), v18.V4S());
2507  __ zip1(v22.V16B(), v9.V16B(), v6.V16B());
2508  __ zip1(v23.V2D(), v11.V2D(), v2.V2D());
2509  __ zip1(v26.V2S(), v16.V2S(), v9.V2S());
2510  __ zip1(v1.V4H(), v9.V4H(), v7.V4H());
2511  __ zip1(v0.V4S(), v30.V4S(), v20.V4S());
2512  __ zip1(v30.V8B(), v17.V8B(), v15.V8B());
2513  __ zip1(v17.V8H(), v8.V8H(), v2.V8H());
2514  __ zip2(v23.V16B(), v10.V16B(), v11.V16B());
2515  __ zip2(v30.V2D(), v6.V2D(), v14.V2D());
2516  __ zip2(v9.V2S(), v10.V2S(), v21.V2S());
2517  __ zip2(v8.V4H(), v24.V4H(), v29.V4H());
2518  __ zip2(v0.V4S(), v21.V4S(), v23.V4S());
2519  __ zip2(v25.V8B(), v23.V8B(), v30.V8B());
2520  __ zip2(v7.V8H(), v10.V8H(), v30.V8H());
2521}  // NOLINT(readability/fn_size)
2522
2523
// Emits a fixed sequence of NEON floating-point instructions into `masm`.
//
// The instructions are written as raw assembler mnemonics through the file's
// `__` macro and are listed in alphabetical order of mnemonic (fabd ... fsub,
// then scvtf and ucvtf), covering several vector arrangements (V2D, V2S, V4S,
// and scalar or by-element forms where used) per mnemonic.
//
// TraceTestHelper calls this after GenerateTestSequenceNEON and before
// GenerateTestSequenceSVE.
//
// NOTE(review): the order and operands presumably have to match the checked-in
// reference traces, so do not reorder or "tidy" this list without regenerating
// them -- confirm before editing.
static void GenerateTestSequenceNEONFP(MacroAssembler* masm) {
  // Open an exact-assembly scope sized to whatever is left in the code buffer
  // (passed as a maximum size); everything below is emitted inside it.
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);

  // NEON floating point instructions.
  __ fabd(v3.V2D(), v25.V2D(), v8.V2D());
  __ fabd(v14.V2S(), v27.V2S(), v11.V2S());
  __ fabd(v9.V4S(), v22.V4S(), v18.V4S());
  __ fabs(v1.V2D(), v29.V2D());
  __ fabs(v6.V2S(), v21.V2S());
  __ fabs(v12.V4S(), v25.V4S());
  __ facge(v18.V2D(), v5.V2D(), v0.V2D());
  __ facge(v15.V2S(), v11.V2S(), v6.V2S());
  __ facge(v30.V4S(), v10.V4S(), v25.V4S());
  __ facgt(v28.V2D(), v16.V2D(), v31.V2D());
  __ facgt(v15.V2S(), v1.V2S(), v4.V2S());
  __ facgt(v22.V4S(), v3.V4S(), v10.V4S());
  __ fadd(v7.V2D(), v10.V2D(), v24.V2D());
  __ fadd(v10.V2S(), v23.V2S(), v7.V2S());
  __ fadd(v16.V4S(), v22.V4S(), v11.V4S());
  __ faddp(d27, v28.V2D());
  __ faddp(s20, v23.V2S());
  __ faddp(v21.V2D(), v4.V2D(), v11.V2D());
  __ faddp(v31.V2S(), v26.V2S(), v1.V2S());
  __ faddp(v13.V4S(), v27.V4S(), v28.V4S());
  // Comparisons: fcmeq/fcmge/fcmgt appear in both register-register and
  // compare-with-0.0 forms; fcmle/fcmlt only appear in the 0.0 form.
  __ fcmeq(v17.V2D(), v13.V2D(), v20.V2D());
  __ fcmeq(v24.V2D(), v16.V2D(), 0.0);
  __ fcmeq(v26.V2S(), v17.V2S(), v10.V2S());
  __ fcmeq(v24.V2S(), v4.V2S(), 0.0);
  __ fcmeq(v8.V4S(), v4.V4S(), v14.V4S());
  __ fcmeq(v26.V4S(), v25.V4S(), 0.0);
  __ fcmge(v27.V2D(), v0.V2D(), v0.V2D());
  __ fcmge(v22.V2D(), v30.V2D(), 0.0);
  __ fcmge(v7.V2S(), v21.V2S(), v25.V2S());
  __ fcmge(v15.V2S(), v15.V2S(), 0.0);
  __ fcmge(v29.V4S(), v4.V4S(), v27.V4S());
  __ fcmge(v22.V4S(), v21.V4S(), 0.0);
  __ fcmgt(v1.V2D(), v26.V2D(), v15.V2D());
  __ fcmgt(v15.V2D(), v23.V2D(), 0.0);
  __ fcmgt(v21.V2S(), v16.V2S(), v6.V2S());
  __ fcmgt(v1.V2S(), v13.V2S(), 0.0);
  __ fcmgt(v14.V4S(), v0.V4S(), v25.V4S());
  __ fcmgt(v13.V4S(), v8.V4S(), 0.0);
  __ fcmle(v4.V2D(), v6.V2D(), 0.0);
  __ fcmle(v24.V2S(), v31.V2S(), 0.0);
  __ fcmle(v8.V4S(), v23.V4S(), 0.0);
  __ fcmlt(v7.V2D(), v3.V2D(), 0.0);
  __ fcmlt(v15.V2S(), v21.V2S(), 0.0);
  __ fcmlt(v1.V4S(), v2.V4S(), 0.0);
  // The fcvt* family. fcvtl/fcvtl2 take a narrower source arrangement than
  // their destination, and fcvtn/fcvtn2/fcvtxn/fcvtxn2 a wider one.
  __ fcvtas(v6.V2D(), v8.V2D());
  __ fcvtas(v1.V2S(), v9.V2S());
  __ fcvtas(v8.V4S(), v19.V4S());
  __ fcvtau(v5.V2D(), v31.V2D());
  __ fcvtau(v28.V2S(), v29.V2S());
  __ fcvtau(v11.V4S(), v26.V4S());
  __ fcvtl(v8.V2D(), v25.V2S());
  __ fcvtl(v27.V4S(), v14.V4H());
  __ fcvtl2(v1.V2D(), v6.V4S());
  __ fcvtl2(v24.V4S(), v9.V8H());
  __ fcvtms(v9.V2D(), v24.V2D());
  __ fcvtms(v7.V2S(), v11.V2S());
  __ fcvtms(v23.V4S(), v21.V4S());
  __ fcvtmu(v13.V2D(), v1.V2D());
  __ fcvtmu(v26.V2S(), v12.V2S());
  __ fcvtmu(v21.V4S(), v21.V4S());
  __ fcvtn(v11.V2S(), v1.V2D());
  __ fcvtn(v8.V4H(), v2.V4S());
  __ fcvtn2(v24.V4S(), v29.V2D());
  __ fcvtn2(v4.V8H(), v10.V4S());
  __ fcvtns(v25.V2D(), v10.V2D());
  __ fcvtns(v4.V2S(), v8.V2S());
  __ fcvtns(v29.V4S(), v27.V4S());
  __ fcvtnu(v18.V2D(), v27.V2D());
  __ fcvtnu(v11.V2S(), v14.V2S());
  __ fcvtnu(v27.V4S(), v21.V4S());
  __ fcvtps(v23.V2D(), v5.V2D());
  __ fcvtps(v24.V2S(), v15.V2S());
  __ fcvtps(v5.V4S(), v19.V4S());
  __ fcvtpu(v3.V2D(), v21.V2D());
  __ fcvtpu(v3.V2S(), v21.V2S());
  __ fcvtpu(v0.V4S(), v7.V4S());
  __ fcvtxn(v29.V2S(), v11.V2D());
  __ fcvtxn2(v31.V4S(), v25.V2D());
  // fcvtzs/fcvtzu are emitted both without and with a trailing immediate
  // operand (presumably the fixed-point fractional bit count -- confirm
  // against the assembler declaration).
  __ fcvtzs(v19.V2D(), v17.V2D());
  __ fcvtzs(v12.V2D(), v24.V2D(), 64);
  __ fcvtzs(v9.V2S(), v2.V2S());
  __ fcvtzs(v5.V2S(), v20.V2S(), 29);
  __ fcvtzs(v21.V4S(), v25.V4S());
  __ fcvtzs(v26.V4S(), v1.V4S(), 6);
  __ fcvtzu(v13.V2D(), v25.V2D());
  __ fcvtzu(v28.V2D(), v13.V2D(), 32);
  __ fcvtzu(v26.V2S(), v6.V2S());
  __ fcvtzu(v9.V2S(), v10.V2S(), 15);
  __ fcvtzu(v30.V4S(), v6.V4S());
  __ fcvtzu(v19.V4S(), v22.V4S(), 18);
  __ fdiv(v15.V2D(), v8.V2D(), v15.V2D());
  __ fdiv(v12.V2S(), v9.V2S(), v26.V2S());
  __ fdiv(v19.V4S(), v22.V4S(), v19.V4S());
  // The fmax*/fmin* families: element-wise, pairwise (scalar and vector
  // destinations) and across-vector (...v) forms.
  __ fmax(v19.V2D(), v7.V2D(), v8.V2D());
  __ fmax(v25.V2S(), v12.V2S(), v29.V2S());
  __ fmax(v6.V4S(), v15.V4S(), v5.V4S());
  __ fmaxnm(v16.V2D(), v8.V2D(), v20.V2D());
  __ fmaxnm(v15.V2S(), v26.V2S(), v25.V2S());
  __ fmaxnm(v23.V4S(), v14.V4S(), v16.V4S());
  __ fmaxnmp(d6, v19.V2D());
  __ fmaxnmp(s27, v26.V2S());
  __ fmaxnmp(v8.V2D(), v12.V2D(), v23.V2D());
  __ fmaxnmp(v13.V2S(), v25.V2S(), v22.V2S());
  __ fmaxnmp(v15.V4S(), v11.V4S(), v17.V4S());
  __ fmaxnmv(s27, v19.V4S());
  __ fmaxp(d20, v14.V2D());
  __ fmaxp(s18, v2.V2S());
  __ fmaxp(v9.V2D(), v23.V2D(), v31.V2D());
  __ fmaxp(v7.V2S(), v22.V2S(), v31.V2S());
  __ fmaxp(v18.V4S(), v7.V4S(), v29.V4S());
  __ fmaxv(s31, v29.V4S());
  __ fmin(v2.V2D(), v5.V2D(), v2.V2D());
  __ fmin(v31.V2S(), v17.V2S(), v10.V2S());
  __ fmin(v10.V4S(), v4.V4S(), v16.V4S());
  __ fminnm(v21.V2D(), v6.V2D(), v5.V2D());
  __ fminnm(v22.V2S(), v18.V2S(), v14.V2S());
  __ fminnm(v25.V4S(), v31.V4S(), v3.V4S());
  __ fminnmp(d9, v1.V2D());
  __ fminnmp(s21, v20.V2S());
  __ fminnmp(v16.V2D(), v21.V2D(), v19.V2D());
  __ fminnmp(v16.V2S(), v31.V2S(), v25.V2S());
  __ fminnmp(v26.V4S(), v16.V4S(), v15.V4S());
  __ fminnmv(s3, v4.V4S());
  __ fminp(d24, v26.V2D());
  __ fminp(s7, v17.V2S());
  __ fminp(v23.V2D(), v19.V2D(), v3.V2D());
  __ fminp(v29.V2S(), v21.V2S(), v9.V2S());
  __ fminp(v0.V4S(), v24.V4S(), v21.V4S());
  __ fminv(s25, v8.V4S());
  // fmla/fmls: scalar-by-element, vector-by-vector and vector-by-element
  // forms. The trailing integer of the by-element forms is the lane index.
  __ fmla(d23, d0, v9.D(), 1);
  __ fmla(s23, s15, v7.S(), 0);
  __ fmla(v17.V2D(), v11.V2D(), v6.V2D());
  __ fmla(v30.V2D(), v30.V2D(), v11.D(), 0);
  __ fmla(v19.V2S(), v12.V2S(), v6.V2S());
  __ fmla(v24.V2S(), v17.V2S(), v9.S(), 0);
  __ fmla(v16.V4S(), v11.V4S(), v11.V4S());
  __ fmla(v27.V4S(), v23.V4S(), v9.S(), 2);
  __ fmls(d27, d30, v6.D(), 0);
  __ fmls(s21, s16, v2.S(), 0);
  __ fmls(v5.V2D(), v19.V2D(), v21.V2D());
  __ fmls(v18.V2D(), v30.V2D(), v12.D(), 0);
  __ fmls(v5.V2S(), v16.V2S(), v7.V2S());
  __ fmls(v3.V2S(), v18.V2S(), v11.S(), 1);
  __ fmls(v27.V4S(), v5.V4S(), v30.V4S());
  __ fmls(v26.V4S(), v20.V4S(), v4.S(), 3);
  // fmov: vector immediates, then moves between an X register and D lane 1 of
  // a vector register (one in each direction).
  __ fmov(v14.V2D(), -0.34375);
  __ fmov(v26.V2S(), 0.90625f);
  __ fmov(v31.V4S(), -5.0000f);
  __ fmov(v28.D(), 1, x25);
  __ fmov(x18, v2.D(), 1);
  // fmul/fmulx: the same scalar-by-element, vector and vector-by-element
  // forms as fmla/fmls above.
  __ fmul(d12, d4, v1.D(), 1);
  __ fmul(s30, s1, v15.S(), 3);
  __ fmul(v25.V2D(), v0.V2D(), v21.V2D());
  __ fmul(v10.V2D(), v24.V2D(), v10.D(), 1);
  __ fmul(v7.V2S(), v24.V2S(), v16.V2S());
  __ fmul(v1.V2S(), v16.V2S(), v4.S(), 2);
  __ fmul(v5.V4S(), v28.V4S(), v25.V4S());
  __ fmul(v11.V4S(), v3.V4S(), v8.S(), 0);
  __ fmulx(d28, d9, v3.D(), 1);
  __ fmulx(s25, s21, v15.S(), 1);
  __ fmulx(v31.V2D(), v28.V2D(), v8.V2D());
  __ fmulx(v3.V2D(), v21.V2D(), v6.D(), 0);
  __ fmulx(v9.V2S(), v1.V2S(), v0.V2S());
  __ fmulx(v16.V2S(), v27.V2S(), v6.S(), 0);
  __ fmulx(v2.V4S(), v4.V4S(), v5.V4S());
  __ fmulx(v18.V4S(), v7.V4S(), v4.S(), 0);
  __ fneg(v1.V2D(), v25.V2D());
  __ fneg(v14.V2S(), v31.V2S());
  __ fneg(v5.V4S(), v4.V4S());
  __ frecpe(v18.V2D(), v12.V2D());
  __ frecpe(v10.V2S(), v22.V2S());
  __ frecpe(v5.V4S(), v6.V4S());
  __ frecps(v22.V2D(), v7.V2D(), v26.V2D());
  __ frecps(v31.V2S(), v27.V2S(), v2.V2S());
  __ frecps(v18.V4S(), v6.V4S(), v27.V4S());
  // The frint* family (a, i, m, n, p, x, z), each in V2D, V2S and V4S.
  __ frinta(v26.V2D(), v13.V2D());
  __ frinta(v15.V2S(), v26.V2S());
  __ frinta(v13.V4S(), v16.V4S());
  __ frinti(v9.V2D(), v12.V2D());
  __ frinti(v5.V2S(), v19.V2S());
  __ frinti(v15.V4S(), v11.V4S());
  __ frintm(v17.V2D(), v29.V2D());
  __ frintm(v30.V2S(), v11.V2S());
  __ frintm(v1.V4S(), v20.V4S());
  __ frintn(v24.V2D(), v6.V2D());
  __ frintn(v12.V2S(), v17.V2S());
  __ frintn(v29.V4S(), v11.V4S());
  __ frintp(v10.V2D(), v7.V2D());
  __ frintp(v12.V2S(), v18.V2S());
  __ frintp(v26.V4S(), v31.V4S());
  __ frintx(v24.V2D(), v13.V2D());
  __ frintx(v7.V2S(), v9.V2S());
  __ frintx(v18.V4S(), v21.V4S());
  __ frintz(v19.V2D(), v25.V2D());
  __ frintz(v15.V2S(), v8.V2S());
  __ frintz(v20.V4S(), v3.V4S());
  __ frsqrte(v23.V2D(), v5.V2D());
  __ frsqrte(v9.V2S(), v7.V2S());
  __ frsqrte(v3.V4S(), v9.V4S());
  __ frsqrts(v25.V2D(), v28.V2D(), v15.V2D());
  __ frsqrts(v9.V2S(), v26.V2S(), v10.V2S());
  __ frsqrts(v5.V4S(), v1.V4S(), v10.V4S());
  __ fsqrt(v6.V2D(), v18.V2D());
  __ fsqrt(v6.V2S(), v18.V2S());
  __ fsqrt(v0.V4S(), v31.V4S());
  __ fsub(v31.V2D(), v30.V2D(), v31.V2D());
  __ fsub(v11.V2S(), v8.V2S(), v6.V2S());
  __ fsub(v16.V4S(), v0.V4S(), v31.V4S());
  // scvtf/ucvtf, without and with a trailing immediate operand (as for
  // fcvtzs/fcvtzu above).
  __ scvtf(v25.V2D(), v31.V2D());
  __ scvtf(v10.V2D(), v13.V2D(), 45);
  __ scvtf(v10.V2S(), v15.V2S());
  __ scvtf(v18.V2S(), v4.V2S(), 27);
  __ scvtf(v17.V4S(), v5.V4S());
  __ scvtf(v11.V4S(), v25.V4S(), 24);
  __ ucvtf(v9.V2D(), v3.V2D());
  __ ucvtf(v26.V2D(), v30.V2D(), 46);
  __ ucvtf(v11.V2S(), v4.V2S());
  __ ucvtf(v29.V2S(), v3.V2S(), 25);
  __ ucvtf(v22.V4S(), v23.V4S());
  __ ucvtf(v18.V4S(), v9.V4S(), 25);
}
2751
2752
// Emits a fixed sequence of SVE load and store instructions into `masm`.
//
// Every access is based on x0, which TraceTestHelper points at its scratch
// buffer before running the generated code. Two addressing forms are used:
//   - SVEMemOperand(x0, imm, SVE_MUL_VL): base plus an immediate with the
//     SVE_MUL_VL modifier.
//   - SVEMemOperand(x0, xN[, LSL, shift]): base plus a scalar register,
//     optionally shifted left.
//
// TraceTestHelper calls this after GenerateTestSequenceNEONFP and before
// GenerateTestSequenceAtomics.
//
// NOTE(review): the order and operands presumably have to match the checked-in
// reference traces, so do not reorder this list without regenerating them --
// confirm before editing.
static void GenerateTestSequenceSVE(MacroAssembler* masm) {
  // Open an exact-assembly scope sized to whatever is left in the code buffer
  // (passed as a maximum size); everything below is emitted inside it.
  ExactAssemblyScope guard(masm,
                           masm->GetBuffer()->GetRemainingBytes(),
                           ExactAssemblyScope::kMaximumSize);
  // Scope object requesting CPUFeatures::kSVE on `masm` while the SVE
  // instructions below are emitted.
  CPUFeaturesScope feature_guard(masm, CPUFeatures::kSVE);

  // Simple, unpredicated loads and stores.
  // Predicate registers first (stores of p12-p15, then loads into p8-p11), all
  // at the same address.
  __ str(p12.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p13.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p14.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(p15.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p8.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p9.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p10.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(p11.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  // Then the same pattern for Z registers (stores of z0-z3, loads into
  // z20-z23).
  __ str(z0.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z1.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z2.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ str(z3.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z20.VnD(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z21.VnS(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z22.VnH(), SVEMemOperand(x0, 11, SVE_MUL_VL));
  __ ldr(z23.VnB(), SVEMemOperand(x0, 11, SVE_MUL_VL));

  // Structured accesses.
  // Here the register lane size matches the access size of the mnemonic
  // (st1b with VnB, st1h with VnH, and so on).
  __ st1b(z0.VnB(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  // NOTE(review): this is the only access in this function that uses x3 as the
  // scalar offset; every other scalar-offset form below uses x2. Confirm that
  // x3 is intended and holds a valid offset into the scratch buffer here.
  __ st1w(z2.VnS(), p1, SVEMemOperand(x0, x3, LSL, 2));
  __ st1d(z3.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnB(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z21.VnH(), p2.Zeroing(), SVEMemOperand(x0, x2, LSL, 1));
  __ ld1w(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1d(z23.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, packed accesses.
  // Here the register lane size is wider than the access size of the mnemonic
  // (for example st1b with VnH/VnS/VnD lanes). The ld1s* loads are the
  // signed variants.
  __ st1b(z2.VnH(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z3.VnS(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1b(z4.VnD(), p2, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z0.VnS(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st1h(z1.VnD(), p1, SVEMemOperand(x0, x2, LSL, 1));
  __ st1w(z2.VnD(), p1, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z20.VnH(), p1.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1b(z21.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1b(z22.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z23.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1h(z24.VnD(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1w(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z21.VnH(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z22.VnS(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sb(z23.VnD(), p2.Zeroing(), SVEMemOperand(x0, x2));
  __ ld1sh(z24.VnS(), p2.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sh(z20.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld1sw(z21.VnD(), p1.Zeroing(), SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Structured, interleaved accesses.
  // Two-register forms (st2*/ld2*).
  __ st2b(z0.VnB(), z1.VnB(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2h(z1.VnH(), z2.VnH(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st2w(z2.VnS(), z3.VnS(), p3, SVEMemOperand(x0, x2, LSL, 2));
  __ st2d(z3.VnD(), z4.VnD(), p4, SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2b(z20.VnB(), z21.VnB(), p5.Zeroing(), SVEMemOperand(x0, x2));
  __ ld2h(z21.VnH(), z22.VnH(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2w(z22.VnS(), z23.VnS(), p6.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld2d(z23.VnD(), z24.VnD(), p5.Zeroing(), SVEMemOperand(x0, 4, SVE_MUL_VL));

  // Three-register forms (st3*/ld3*).
  __ st3b(z4.VnB(), z5.VnB(), z6.VnB(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3h(z5.VnH(), z6.VnH(), z7.VnH(), p4, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3w(z6.VnS(), z7.VnS(), z8.VnS(), p3, SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ st3d(z7.VnD(), z8.VnD(), z9.VnD(), p4, SVEMemOperand(x0, x2, LSL, 3));
  __ ld3b(z24.VnB(),
          z25.VnB(),
          z26.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3h(z25.VnH(),
          z26.VnH(),
          z27.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 1));
  __ ld3w(z26.VnS(),
          z27.VnS(),
          z28.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));
  __ ld3d(z27.VnD(),
          z28.VnD(),
          z29.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 3, SVE_MUL_VL));

  // Four-register forms (st4*/ld4*). The st4b register list runs from z31 on
  // to z0, z1 and z2.
  __ st4b(z31.VnB(),
          z0.VnB(),
          z1.VnB(),
          z2.VnB(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4h(z0.VnH(),
          z1.VnH(),
          z2.VnH(),
          z3.VnH(),
          p4,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4w(z1.VnS(),
          z2.VnS(),
          z3.VnS(),
          z4.VnS(),
          p3,
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ st4d(z2.VnD(),
          z3.VnD(),
          z4.VnD(),
          z5.VnD(),
          p4,
          SVEMemOperand(x0, x2, LSL, 3));
  __ ld4b(z25.VnB(),
          z26.VnB(),
          z27.VnB(),
          z28.VnB(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4h(z26.VnH(),
          z27.VnH(),
          z28.VnH(),
          z29.VnH(),
          p6.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
  __ ld4w(z27.VnS(),
          z28.VnS(),
          z29.VnS(),
          z30.VnS(),
          p6.Zeroing(),
          SVEMemOperand(x0, x2, LSL, 2));
  __ ld4d(z28.VnD(),
          z29.VnD(),
          z30.VnD(),
          z31.VnD(),
          p5.Zeroing(),
          SVEMemOperand(x0, 4, SVE_MUL_VL));
}
2892
2893static void GenerateTestSequenceAtomics(MacroAssembler* masm) {
2894  ExactAssemblyScope guard(masm,
2895                           masm->GetBuffer()->GetRemainingBytes(),
2896                           ExactAssemblyScope::kMaximumSize);
2897  CPUFeaturesScope feature_guard(masm, CPUFeatures::kAtomics);
2898  __ sub(sp, sp, 16);  // Claim some working space on the stack.
2899  __ mov(x0, 0x5555555555555555);
2900  __ str(x0, MemOperand(sp));  // Initialise working space.
2901
2902#define INST_LIST(OP)                     \
2903  __ ld##OP##b(w0, w0, MemOperand(sp));   \
2904  __ ld##OP##ab(w0, w1, MemOperand(sp));  \
2905  __ ld##OP##lb(w0, w2, MemOperand(sp));  \
2906  __ ld##OP##alb(w0, w3, MemOperand(sp)); \
2907  __ ld##OP##h(w0, w0, MemOperand(sp));   \
2908  __ ld##OP##ah(w0, w1, MemOperand(sp));  \
2909  __ ld##OP##lh(w0, w2, MemOperand(sp));  \
2910  __ ld##OP##alh(w0, w3, MemOperand(sp)); \
2911  __ ld##OP(w0, w0, MemOperand(sp));      \
2912  __ ld##OP##a(w0, w1, MemOperand(sp));   \
2913  __ ld##OP##l(w0, w2, MemOperand(sp));   \
2914  __ ld##OP##al(w0, w3, MemOperand(sp));  \
2915  __ ld##OP(x0, x0, MemOperand(sp));      \
2916  __ ld##OP##a(x0, x1, MemOperand(sp));   \
2917  __ ld##OP##l(x0, x2, MemOperand(sp));   \
2918  __ ld##OP##al(x0, x3, MemOperand(sp));  \
2919  __ st##OP##b(w0, MemOperand(sp));       \
2920  __ st##OP##lb(w0, MemOperand(sp));      \
2921  __ st##OP##h(w0, MemOperand(sp));       \
2922  __ st##OP##lh(w0, MemOperand(sp));      \
2923  __ st##OP(w0, MemOperand(sp));          \
2924  __ st##OP##l(w0, MemOperand(sp));       \
2925  __ st##OP(x0, MemOperand(sp));          \
2926  __ st##OP##l(x0, MemOperand(sp));
2927
2928  INST_LIST(add);
2929  INST_LIST(set);
2930  INST_LIST(eor);
2931  INST_LIST(smin);
2932  INST_LIST(smax);
2933  INST_LIST(umin);
2934  INST_LIST(umax);
2935  INST_LIST(clr);
2936
2937#undef INST_LIST
2938
2939  __ add(sp, sp, 16);  // Restore stack pointer.
2940}
2941
// Rewrites the trace file `trace` in place, masking values that change from
// run to run (addresses) so that the result can be compared with a reference.
//
// Each line is passed through every pattern below; the variable hexadecimal
// digits of each match are replaced with sixteen '~' characters while the
// captured prefix is kept. The rewritten file always ends each line,
// including the last one, with "\n".
//
// If `trace` cannot be opened for reading, a diagnostic is printed to stderr
// and the file is left untouched.
static void MaskAddresses(const char* trace) {
#define VIXL_COLOUR "(\x1b\\[[01];([0-9][0-9])?m)?"
  // All patterns are replaced with "$1~~~~~~~~~~~~~~~~".
  // The patterns never change, so they are compiled once and reused by later
  // calls.
  static const std::regex patterns[] =
      {// Mask registers that hold addresses that change from run to run.
       std::regex("((x0|x1|x2|sp): " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask accessed memory addresses.
       std::regex("((<-|->) " VIXL_COLOUR "0x)[0-9a-f]{16}"),
       // Mask instruction addresses.
       std::regex("^(0x)[0-9a-f]{16}"),
       // Mask branch targets.
       std::regex("(Branch" VIXL_COLOUR " to 0x)[0-9a-f]{16}"),
       // Mask explicit address annotations.
       std::regex("(addr 0x)[0-9a-f]+")};
#undef VIXL_COLOUR

  std::ifstream in(trace);
  if (!in.is_open()) {
    // Without this check a stream that failed to open would never report EOF,
    // and a loop keyed on eof() would spin forever.
    fprintf(stderr, "MaskAddresses: cannot open '%s' for reading.\n", trace);
    return;
  }

  // The whole file is buffered because it is truncated and rewritten below.
  std::vector<std::string> lines;
  std::string line;
  // Using `getline` itself as the loop condition stops on EOF and on any read
  // error, and does not produce a spurious empty line after a terminal "\n".
  while (std::getline(in, line)) {
    for (const std::regex& pattern : patterns) {
      line = std::regex_replace(line, pattern, "$1~~~~~~~~~~~~~~~~");
    }
    lines.push_back(line);
  }
  in.close();

  std::ofstream out(trace, std::ofstream::trunc);
  for (const std::string& masked : lines) {
    out << masked << "\n";
  }
}
2978
// Copies the contents of the file `name` to stdout.
//
// If the file cannot be opened, a diagnostic is printed to stderr and nothing
// is copied.
static void PrintFile(const char* name) {
  FILE* file = fopen(name, "r");
  if (file == NULL) {
    // `fgets` and `fclose` must not be given a null stream.
    fprintf(stderr, "PrintFile: cannot open '%s' for reading.\n", name);
    return;
  }
  // The buffer size is arbitrary; lines longer than the buffer are simply
  // copied in several pieces.
  char buffer[1024];
  while (fgets(buffer, sizeof(buffer), file) != NULL) fputs(buffer, stdout);
  fclose(file);
}
2985
2986static bool CheckOrGenerateTrace(const char* filename, const char* ref_file) {
2987  bool trace_matched_reference;
2988  if (Test::generate_test_trace()) {
2989    // Copy trace_stream to stdout.
2990    FILE* trace_stream = fopen(filename, "r");
2991    VIXL_ASSERT(trace_stream != NULL);
2992    fseek(trace_stream, 0, SEEK_SET);
2993    int c;
2994    while (1) {
2995      c = getc(trace_stream);
2996      if (c == EOF) break;
2997      putc(c, stdout);
2998    }
2999    fclose(trace_stream);
3000    trace_matched_reference = true;
3001  } else {
3002    // Check trace_stream against ref_file.
3003    char command[1024];
3004    size_t length =
3005        snprintf(command, sizeof(command), "diff -u %s %s", ref_file, filename);
3006    VIXL_CHECK(length < sizeof(command));
3007    trace_matched_reference = (system(command) == 0);
3008  }
3009  return trace_matched_reference;
3010}
3011
3012
3013// Trace tests can only work with the simulator.
3014#ifdef VIXL_INCLUDE_SIMULATOR_AARCH64
3015
3016static void TraceTestHelper(bool coloured_trace,
3017                            TraceParameters trace_parameters,
3018                            const char* ref_file) {
3019  MacroAssembler masm(12 * KBytes);
3020
3021  char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3022  FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3023
3024  Decoder decoder;
3025  Simulator simulator(&decoder, trace_stream);
3026  simulator.SetColouredTrace(coloured_trace);
3027  simulator.SetTraceParameters(trace_parameters);
3028  simulator.SilenceExclusiveAccessWarning();
3029
3030  const int vl_in_bytes = 5 * kZRegMinSizeInBytes;
3031  const int vl_in_bits = vl_in_bytes * kBitsPerByte;
3032  const int pl_in_bits = vl_in_bits / kZRegBitsPerPRegBit;
3033  simulator.SetVectorLengthInBits(vl_in_bits);
3034
3035  // Set up a scratch buffer so we can test loads and stores.
3036  const int kScratchSize = vl_in_bytes * 1024;
3037  const int kScratchGuardSize = vl_in_bytes;
3038  char scratch_buffer[kScratchSize + kScratchGuardSize];
3039  for (size_t i = 0; i < (sizeof(scratch_buffer) / sizeof(scratch_buffer[0]));
3040       i++) {
3041    scratch_buffer[i] = i & 0xff;
3042  }
3043  // Used for offset addressing.
3044  simulator.WriteXRegister(0, reinterpret_cast<uintptr_t>(scratch_buffer));
3045  // Used for pre-/post-index addressing.
3046  simulator.WriteXRegister(1, reinterpret_cast<uintptr_t>(scratch_buffer));
3047
3048  const int kPostIndexRegisterStep = 13;  // Arbitrary interesting value.
3049  // Used for post-index offsets.
3050  simulator.WriteXRegister(2, kPostIndexRegisterStep);
3051
3052  // Initialize the other registers with unique values.
3053  uint64_t initial_base_u64 = 0x0100001000100101;
3054  for (unsigned i = 3; i < kNumberOfRegisters; i++) {
3055    if (i == kLinkRegCode) continue;
3056    if (i == kZeroRegCode) continue;
3057    // NoRegLog suppresses the log now, but the registers will still be logged
3058    // before the first instruction is executed since they have been written but
3059    // not printed.
3060    simulator.WriteRegister(i, initial_base_u64 * i, Simulator::NoRegLog);
3061  }
3062  for (unsigned r = 0; r < kNumberOfVRegisters; r++) {
3063    LogicVRegister reg(simulator.ReadVRegister(r));
3064    // Try to initialise Z registers with reasonable FP values. We prioritise
3065    // setting double values, then floats and half-precision values. The lanes
3066    // overlap, so this is a compromise, but d0, s0 and h0 views all see similar
3067    // arithmetic values.
3068    //
3069    // The exponent of each value is set to the (biased) register number. We set
3070    // the double, float and half-precision exponents where we can.
3071    uint64_t base = 0x3ff000003f803c00 + (0x0010000000800400 * (0x7f + r));
3072    for (unsigned lane = 0; lane < (vl_in_bytes / kDRegSizeInBytes); lane++) {
3073      uint64_t mantissas = 0x0000000100010001 * (lane & 0x7f);
3074      reg.SetUint(kFormatVnD, lane, base | mantissas);
3075    }
3076  }
3077  for (unsigned r = 0; r < kNumberOfPRegisters; r++) {
3078    LogicPRegister reg(simulator.ReadPRegister(r));
3079    // Set `r` active lanes between each inactive lane.
3080    for (unsigned bit = 0; bit < pl_in_bits; bit++) {
3081      reg.SetActive(kFormatVnB, bit, ((bit + 1) % (r + 2)) != 0);
3082    }
3083    // Completely clear some Q-sized blocks. The trace will completely omit
3084    // these for stores.
3085    for (unsigned chunk = 0; chunk < (vl_in_bits / kQRegSize); chunk++) {
3086      if (((chunk + 1) % (r + 2)) == 0) {
3087        reg.SetActiveMask(chunk, static_cast<uint16_t>(0));
3088      }
3089    }
3090  }
3091
3092  GenerateTestSequenceBase(&masm);
3093  GenerateTestSequenceFP(&masm);
3094  GenerateTestSequenceNEON(&masm);
3095  GenerateTestSequenceNEONFP(&masm);
3096  GenerateTestSequenceSVE(&masm);
3097  GenerateTestSequenceAtomics(&masm);
3098  masm.Ret();
3099  masm.FinalizeCode();
3100
3101  if (Test::disassemble()) {
3102    PrintDisassembler disasm(stdout);
3103    Instruction* start = masm.GetBuffer()->GetStartAddress<Instruction*>();
3104    Instruction* end = masm.GetBuffer()->GetEndAddress<Instruction*>();
3105    disasm.DisassembleBuffer(start, end);
3106  }
3107
3108  simulator.RunFrom(masm.GetBuffer()->GetStartAddress<Instruction*>());
3109
3110  fclose(trace_stream);
3111
3112  // We already traced into the temporary file, so just print the file.
3113  // Note that these tests need to control the trace flags, so we ignore all
3114  // --trace-* options here except for --trace-sim.
3115  if (Test::trace_sim()) PrintFile(trace_stream_filename);
3116
3117  MaskAddresses(trace_stream_filename);
3118
3119  bool trace_matched_reference =
3120      CheckOrGenerateTrace(trace_stream_filename, ref_file);
3121  remove(trace_stream_filename);  // Clean up before checking the result.
3122  VIXL_CHECK(trace_matched_reference);
3123
3124  uint64_t offset_base = simulator.ReadRegister<uint64_t>(0);
3125  uint64_t index_base = simulator.ReadRegister<uint64_t>(1);
3126
3127  VIXL_CHECK(index_base >= offset_base);
3128  VIXL_CHECK((index_base - offset_base) <= kScratchSize);
3129}
3130
3131
3132// Test individual options.
3133TEST(disasm) { TraceTestHelper(false, LOG_DISASM, REF("log-disasm")); }
3134TEST(regs) { TraceTestHelper(false, LOG_REGS, REF("log-regs")); }
3135TEST(vregs) { TraceTestHelper(false, LOG_VREGS, REF("log-vregs")); }
3136TEST(sysregs) { TraceTestHelper(false, LOG_SYSREGS, REF("log-sysregs")); }
3137TEST(write) { TraceTestHelper(false, LOG_WRITE, REF("log-write")); }
3138TEST(branch) { TraceTestHelper(false, LOG_WRITE, REF("log-branch")); }
3139
3140// Test standard combinations.
3141TEST(none) { TraceTestHelper(false, LOG_NONE, REF("log-none")); }
3142TEST(state) { TraceTestHelper(false, LOG_STATE, REF("log-state")); }
3143TEST(all) { TraceTestHelper(false, LOG_ALL, REF("log-all")); }
3144
3145
3146// Test individual options (with colour).
3147TEST(disasm_colour) {
3148  TraceTestHelper(true, LOG_DISASM, REF("log-disasm-colour"));
3149}
3150TEST(regs_colour) { TraceTestHelper(true, LOG_REGS, REF("log-regs-colour")); }
3151TEST(vregs_colour) {
3152  TraceTestHelper(true, LOG_VREGS, REF("log-vregs-colour"));
3153}
3154TEST(sysregs_colour) {
3155  TraceTestHelper(true, LOG_SYSREGS, REF("log-sysregs-colour"));
3156}
3157TEST(write_colour) {
3158  TraceTestHelper(true, LOG_WRITE, REF("log-write-colour"));
3159}
3160TEST(branch_colour) {
3161  TraceTestHelper(true, LOG_WRITE, REF("log-branch-colour"));
3162}
3163
3164// Test standard combinations (with colour).
3165TEST(none_colour) { TraceTestHelper(true, LOG_NONE, REF("log-none-colour")); }
3166TEST(state_colour) {
3167  TraceTestHelper(true, LOG_STATE, REF("log-state-colour"));
3168}
3169TEST(all_colour) { TraceTestHelper(true, LOG_ALL, REF("log-all-colour")); }
3170
3171#endif  // VIXL_INCLUDE_SIMULATOR_AARCH64
3172
3173static void PrintDisassemblerTestHelper(const char* prefix,
3174                                        const char* suffix,
3175                                        const char* ref_file) {
3176  MacroAssembler masm(12 * KBytes);
3177
3178  char trace_stream_filename[] = "/tmp/vixl-test-trace-XXXXXX";
3179  FILE* trace_stream = fdopen(mkstemp(trace_stream_filename), "w");
3180
3181  // We don't need to execute this code so there's no need for the execution
3182  // environment setup from TraceTestHelper.
3183
3184  GenerateTestSequenceBase(&masm);
3185  GenerateTestSequenceFP(&masm);
3186  GenerateTestSequenceNEON(&masm);
3187  GenerateTestSequenceNEONFP(&masm);
3188  GenerateTestSequenceSVE(&masm);
3189  GenerateTestSequenceAtomics(&masm);
3190  masm.FinalizeCode();
3191
3192  Decoder decoder;
3193  CPUFeaturesAuditor auditor(&decoder);
3194  PrintDisassembler disasm(trace_stream);
3195  if (prefix != NULL) disasm.SetCPUFeaturesPrefix(prefix);
3196  if (suffix != NULL) disasm.SetCPUFeaturesSuffix(suffix);
3197  disasm.RegisterCPUFeaturesAuditor(&auditor);
3198  decoder.AppendVisitor(&disasm);
3199
3200  Instruction* instruction = masm.GetBuffer()->GetStartAddress<Instruction*>();
3201  Instruction* end = masm.GetCursorAddress<Instruction*>();
3202  while (instruction != end) {
3203    decoder.Decode(instruction);
3204    instruction += kInstructionSize;
3205  }
3206
3207  fclose(trace_stream);
3208
3209  // We already disassembled into the temporary file, so just print the file.
3210  if (Test::disassemble()) PrintFile(trace_stream_filename);
3211
3212  MaskAddresses(trace_stream_filename);
3213
3214  bool trace_matched_reference =
3215      CheckOrGenerateTrace(trace_stream_filename, ref_file);
3216  remove(trace_stream_filename);  // Clean up before checking the result.
3217  VIXL_CHECK(trace_matched_reference);
3218}
3219
3220
3221// Test CPUFeatures disassembly annotations.
3222TEST(cpufeatures) {
3223  PrintDisassemblerTestHelper(NULL, NULL, REF("log-cpufeatures"));
3224}
3225TEST(cpufeatures_custom) {
3226  PrintDisassemblerTestHelper("### {", "} ###", REF("log-cpufeatures-custom"));
3227}
3228TEST(cpufeatures_colour) {
3229  // The colour chosen is arbitrary.
3230  PrintDisassemblerTestHelper("\033[1;35m",  // Prefix: Bold magenta.
3231                              "\033[0;m",    // Suffix: Reset colour.
3232                              REF("log-cpufeatures-colour"));
3233}
3234}  // namespace aarch64
3235}  // namespace vixl
3236