Lines Matching refs:v3
32 ld1 {v3.16b, v4.16b}, [x0], #32
46 mul v23.8h, v3.8h, v0.h[0] // 6/2 * src[16]
50 shl v3.8h, v3.8h, #3 // 16/2 * src[16]
61 mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
67 add v22.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
69 sub v17.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
70 add v23.8h, v4.8h, v3.8h // t5/2 = t1/2 + t3/2
73 sub v2.8h, v4.8h, v3.8h // t8/2 = t1/2 - t3/2
74 neg v3.8h, v7.8h // -t1
85 ssra v23.8h, v3.8h, #1 // (t5 - t1) >> 1
86 srshr v3.8h, v22.8h, #2 // (t5 + t1 + 4) >> 3
94 trn2 v17.8h, v3.8h, v4.8h
102 trn1 v3.8h, v3.8h, v4.8h
112 trn1 v18.4s, v3.4s, v5.4s
117 trn2 v3.4s, v3.4s, v5.4s
125 trn1 v19.2d, v3.2d, v1.2d
132 trn2 v1.2d, v3.2d, v1.2d
136 mul v3.8h, v19.8h, v0.h[0] // 6/2 * src[16]
142 sub v3.8h, v3.8h, v19.8h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
145 add v7.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
148 sub v19.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
153 sub v22.8h, v1.8h, v3.8h // t7/2 = t2/2 - t4/2
157 add v1.8h, v1.8h, v3.8h // t6/2 = t2/2 + t4/2
159 neg v3.8h, v2.8h // -t1
166 srsra v6.8h, v3.8h, #1 // (t5 - t1 + 1) >> 1
168 srshr v3.8h, v7.8h, #6 // (t6 + t2 + 64) >> 7
173 st1 {v2.16b, v3.16b}, [x1], #32
190 ld1 {v1.8b, v2.8b, v3.8b, v4.8b}, [x2], #32
195 trn2 v6.4h, v1.4h, v3.4h
197 trn1 v1.4h, v1.4h, v3.4h
199 trn2 v3.4h, v16.4h, v18.4h
204 trn1 v19.2s, v6.2s, v3.2s
205 trn2 v3.2s, v6.2s, v3.2s
212 shl v16.4h, v3.4h, #4 // 16 * src[3]
227 mla v21.4h, v3.4h, v0.h[2] // 16 * src[1] + 15 * src[3] + 4 * src[7]
233 mla v23.4h, v3.4h, v0.h[1] // - 4 * src[1] + 9 * src[3] + 16 * src[7]
234 shl v3.4h, v2.4h, #3 // 16/2 * src[6]
238 sub v2.4h, v24.4h, v3.4h // t4/2 = 6/2 * src[2] - 16/2 * src[6]
240 add v3.4h, v7.4h, v20.4h // t1/2 = 12/2 * src[0] + 12/2 * src[4]
244 add v7.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
245 sub v19.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
248 add v26.4h, v3.4h, v1.4h // t5/2 = t1/2 + t3/2
251 sub v1.4h, v3.4h, v1.4h // t8/2 = t1/2 - t3/2
252 neg v3.4h, v17.4h // +t2
257 ssra v20.4h, v3.4h, #1 // (t6 + t2) >> 1
265 trn1 v3.2d, v24.2d, v27.2d
269 srshr v3.8h, v3.8h, #2 // (t7 + t3 + 4) >> 3, (t6 - t2 + 4) >> 3
273 trn2 v2.8h, v3.8h, v4.8h
274 trn1 v3.8h, v3.8h, v4.8h
276 trn1 v7.4s, v1.4s, v3.4s
277 trn2 v1.4s, v1.4s, v3.4s
278 mul v3.8h, v4.8h, v0.h[5] // 22/2 * src[24]
283 mls v3.8h, v2.8h, v0.h[4] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
287 neg v2.8h, v3.8h // -t4/2
291 ssra v3.8h, v1.8h, #1 // (t2 + t4) >> 1
295 srshr v2.8h, v3.8h, #6 // (t2 + t4 + 64) >> 7
296 srshr v3.8h, v6.8h, #6 // (t1 - t3 + 64) >> 7
300 uaddw v3.8h, v3.8h, v25.8b
304 sqxtun v3.8b, v3.8h
308 st1 {v3.8b}, [x3]
325 ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
329 ld1 {v3.d}[1], [x2], x3 // 60 61 62 63
336 trn2 v2.8h, v3.8h, v4.8h // 21 31 23 33 61 71 63 73
337 trn1 v3.8h, v3.8h, v4.8h // 20 30 22 32 60 70 62 72
340 trn1 v18.4s, v1.4s, v3.4s // 00 10 20 30 40 50 60 70
346 trn2 v1.4s, v1.4s, v3.4s // 02 12 22 32 42 52 62 72
347 mul v3.8h, v18.8h, v0.h[6] // 17 * src[0]
353 add v2.8h, v3.8h, v1.8h // t1 = 17 * src[0] + 17 * src[2]
354 sub v1.8h, v3.8h, v1.8h // t2 = 17 * src[0] - 17 * src[2]
355 neg v3.8h, v16.8h // -t3/2
359 ssra v3.8h, v2.8h, #1 // (t1 - t3) >> 1
363 srshr v3.8h, v3.8h, #2 // (t1 - t3 + 4) >> 3
365 trn2 v17.8h, v2.8h, v3.8h // 12 13 32 33 52 53 72 73
368 trn1 v2.8h, v2.8h, v3.8h // 02 03 22 23 42 43 62 63
369 trn1 v3.4s, v18.4s, v17.4s // 10 11 12 13 50 51 52 53
372 mov d18, v3.d[1] // 50 51 52 53
373 shl v19.4h, v3.4h, #4 // 16 * src[8]
377 shl v23.4h, v3.4h, #2 // 4 * src[8]
391 mls v24.4h, v3.4h, v0.h[2] // - 15 * src[8] + 4 * src[24] + 16 * src[40]
393 mls v21.4h, v3.4h, v0.h[1] // - 9 * src[8] + 16 * src[24] - 4 * src[40]
396 shl v3.4h, v23.4h, #3 // 16/2 * src[48]
401 sub v3.4h, v25.4h, v3.4h // t4/2 = 6/2 * src[16] - 16/2 * src[48]
408 add v18.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
409 sub v22.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
411 sub v25.4h, v17.4h, v3.4h // t7/2 = t2/2 - t4/2
412 add v3.4h, v17.4h, v3.4h // t6/2 = t2/2 + t4/2
423 srsra v3.4h, v24.4h, #1 // (t6 - t2 + 1) >> 1
428 trn1 v3.2d, v25.2d, v3.2d
432 srshr v3.8h, v3.8h, #6 // (t7 + t3 + 64) >> 7, (t6 - t2 + 65) >> 7
436 uaddw v3.8h, v3.8h, v7.8b
440 sqxtun v3.8b, v3.8h
444 st1 {v3.s}[0], [x4], x1
448 st1 {v3.s}[1], [x4], x1
466 ld1 {v3.d}[0], [x2], x3 // 20 21 22 23
474 trn2 v2.4h, v3.4h, v4.4h // 21 31 23 33
475 trn1 v3.4h, v3.4h, v4.4h // 20 30 22 32
477 trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
479 trn2 v1.2s, v1.2s, v3.2s // 02 12 22 32
480 mul v3.4h, v4.4h, v0.h[0] // 10/2 * src[3]
484 mla v3.4h, v2.4h, v0.h[1] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
488 neg v7.4h, v3.4h // -t3/2
490 ssra v3.4h, v2.4h, #1 // (t1 + t3) >> 1
494 srshr v1.4h, v3.4h, #2 // (t1 + t3 + 4) >> 3
496 srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
498 trn2 v7.4h, v1.4h, v3.4h // 10 11 30 31
499 trn1 v1.4h, v1.4h, v3.4h // 00 01 20 21
500 trn2 v3.4h, v2.4h, v4.4h // 12 13 32 33
502 trn2 v4.2s, v7.2s, v3.2s // 30 31 32 33
504 trn1 v3.2s, v7.2s, v3.2s // 10 11 12 13
510 mls v2.4h, v3.4h, v0.h[0] // t4/2 = - 10/2 * src[8] + 22/2 * src[24]
511 mla v4.4h, v3.4h, v0.h[1] // t3/2 = 22/2 * src[8] + 10/2 * src[24]
514 neg v3.4h, v2.4h // -t4/2
517 ssra v3.4h, v1.4h, #1 // (t2 - t4) >> 1
520 trn1 v0.2d, v4.2d, v3.2d
549 ld1 {v3.8b}, [x0], x1
563 uaddw v3.8h, v16.8h, v3.8b
571 sqxtun v3.8b, v3.8h
579 st1 {v3.8b}, [x3], x1
601 ld1 {v3.8b}, [x0]
611 uaddw v3.8h, v4.8h, v3.8b
615 sqxtun v3.8b, v3.8h
619 st1 {v3.8b}, [x3]
637 ld1 {v3.s}[0], [x0], x1
647 ld1 {v3.s}[1], [x0]
651 uaddw v3.8h, v4.8h, v3.8b
655 sqxtun v3.8b, v3.8h
659 st1 {v3.s}[0], [x3], x1
663 st1 {v3.s}[1], [x3]
717 ld1 {v3.s}[0], [x3], x1 // P2
726 uxtl v3.8h, v3.8b // P2
729 mls v2.4h, v3.4h, v0.h[1] // 2*P1-5*P2
730 uxtl v3.8h, v6.8b // P7
734 mla v17.4h, v3.4h, v0.h[1] // 2*P5-5*P6+5*P7
735 uxtl v3.8h, v16.8b // P8
739 mls v17.4h, v3.4h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
740 sub v3.4h, v6.4h, v1.4h // P4-P5
744 abs v4.4h, v3.4h
750 sshr v3.4h, v3.4h, #8 // clip_sign
757 sub v3.4h, v3.4h, v5.4h // clip_sign - a0_sign
770 mls v6.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
771 mla v1.4h, v0.4h, v3.4h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
790 ld1 {v3.8b}, [x3], x1
795 trn1 v2.8b, v3.8b, v4.8b
796 trn2 v3.8b, v3.8b, v4.8b
798 trn1 v7.4h, v1.4h, v3.4h // P2, P6
800 trn2 v1.4h, v1.4h, v3.4h // P4, P8
801 ushll v3.8h, v4.8b, #1 // 2*P1, 2*P5
805 mls v3.8h, v6.8h, v0.h[1] // 2*P1-5*P2, 2*P5-5*P6
808 mla v3.8h, v7.8h, v0.h[1] // 2*P1-5*P2+5*P3, 2*P5-5*P6+5*P7
810 mls v3.8h, v1.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4, 2*P5-5*P6+5*P7-2*P8
816 srshr v3.8h, v3.8h, #3
820 abs v3.8h, v3.8h // a1, a2
822 mov d16, v3.d[1] // a2
826 cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2
829 bsl v19.8b, v16.8b, v3.8b // a3
830 orr v3.8b, v18.8b, v5.8b // test clip == 0 || a0 >= pq
834 orr v5.8b, v3.8b, v7.8b // test clip == 0 || a0 >= pq || a3 >= a0
840 bic v0.8b, v5.8b, v3.8b // set each d to zero if it should not be filtered because clip == 0 || a0 >= pq (a3 > a0 case already zeroed by saturating sub)
843 sqxtun v3.8b, v4.8h
845 st2 {v2.b, v3.b}[0], [x0], x1
846 st2 {v2.b, v3.b}[1], [x0], x1
847 st2 {v2.b, v3.b}[2], [x0], x1
848 st2 {v2.b, v3.b}[3], [x0]
862 ld1 {v3.8b}, [x3], x1 // P1
868 ushll v3.8h, v3.8b, #1 // 2*P1
875 mls v3.8h, v4.8h, v0.h[1] // 2*P1-5*P2
883 mla v3.8h, v20.8h, v0.h[1] // 2*P1-5*P2+5*P3
889 mls v3.8h, v7.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
895 srshr v3.8h, v3.8h, #3
897 abs v3.8h, v3.8h // a1
899 cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2
902 bsl v18.16b, v16.16b, v3.16b // a3
903 cmhs v3.8h, v20.8h, v19.8h // test a0 >= pq
907 orr v3.16b, v5.16b, v3.16b // test clip == 0 || a0 >= pq
909 orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
914 orr v2.16b, v3.16b, v2.16b
915 cmhs v3.8h, v0.8h, v17.8h
917 bsl v3.16b, v17.16b, v0.16b // FFMIN(d, clip)
919 bic v0.16b, v3.16b, v2.16b // set each d to zero if it should not be filtered
941 ld1 {v3.8b}, [x3], x1
949 trn1 v2.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
950 trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
955 trn1 v19.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
959 trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
960 trn1 v3.4h, v18.4h, v20.4h // P1[4], P1[5], P1[6], P1[7], P5[4]...
964 trn1 v7.2s, v6.2s, v3.2s // P1
966 trn2 v3.2s, v6.2s, v3.2s // P5
972 ushll v19.8h, v3.8b, #1 // 2*P5
984 uxtl v2.8h, v3.8b // P5
986 sub v3.8h, v16.8h, v2.8h // P4-P5
989 abs v1.8h, v3.8h
990 sshr v3.8h, v3.8h, #8 // clip_sign
1006 sub v3.8h, v3.8h, v5.8h // clip_sign - a0_sign
1020 mla v2.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P5
1021 mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1047 ld1 {v3.16b}, [x3], x1 // P1
1053 ushll v17.8h, v3.8b, #1 // 2*P1[0..7]
1059 ushll2 v3.8h, v3.16b, #1 // 2*P1[8..15]
1067 mls v3.8h, v4.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]
1081 mla v3.8h, v26.8h, v0.h[1] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]
1098 mls v3.8h, v18.8h, v0.h[0] // 2*P1[8..15]-5*P2[8..15]+5*P3[8..15]-2*P4[8..15]
1104 srshr v3.8h, v3.8h, #3
1110 abs v3.8h, v3.8h // a1[8..15]
1115 cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15]
1118 bsl v16.16b, v19.16b, v3.16b // a3[8..15]
1119 uqsub v3.8h, v17.8h, v5.8h // a0[0..7] >= a3[0..7] ? a0[0..7]-a3[0..7] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1125 mul v3.8h, v3.8h, v0.h[1] // a0[0..7] >= a3[0..7] ? 5*(a0[0..7]-a3[0..7]) : 0
1133 ushr v3.8h, v3.8h, #3 // a0[0..7] >= a3[0..7] ? (5*(a0[0..7]-a3[0..7]))>>3 : 0
1137 cmhs v19.8h, v3.8h, v27.8h
1142 bsl v19.16b, v27.16b, v3.16b // FFMIN(d[0..7], clip[0..7])
1144 cmhs v3.8h, v0.8h, v23.8h
1150 bsl v3.16b, v23.16b, v0.16b // FFMIN(d[8..15], clip[8..15])
1153 bic v0.16b, v3.16b, v2.16b // set each d[8..15] to zero if it should not be filtered because clip[8..15] == 0 || a0[8..15] >= pq (a3 > a0 case already zeroed by saturating sub)
1180 ld1 {v3.8b}, [x3], x1
1191 trn1 v18.8b, v3.8b, v4.8b // P1[2], P1[3], P3[2]...
1193 trn2 v3.8b, v3.8b, v4.8b // P2[2], P2[3], P4[2]...
1209 trn1 v21.4h, v1.4h, v3.4h // P2[0], P2[1], P2[2], P2[3], P6[0]...
1232 trn2 v1.4h, v1.4h, v3.4h // P4[0], P4[1], P4[2], P4[3], P8[0]...
1233 trn2 v3.4h, v5.4h, v7.4h // P4[4], P4[5], P4[6], P4[7], P8[4]...
1246 trn1 v27.2s, v1.2s, v3.2s // P4[0..7]
1260 trn2 v1.2s, v1.2s, v3.2s // P8[0..7]
1261 ushll v3.8h, v21.8b, #1 // 2*P3[8..15]
1277 mls v3.8h, v4.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]
1293 mla v3.8h, v18.8h, v0.h[1] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]
1296 mls v3.8h, v17.8h, v0.h[0] // 2*P3[8..15]-5*P4[8..15]+5*P5[8..15]-2*P6[8..15]
1302 srshr v3.8h, v3.8h, #3
1306 abs v2.8h, v3.8h // a0[8..15]
1307 sshr v3.8h, v3.8h, #8 // a0_sign[8..15]
1312 sub v1.8h, v1.8h, v3.8h // clip_sign[8..15] - a0_sign[8..15]
1313 uqsub v3.8h, v2.8h, v24.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1319 mul v3.8h, v3.8h, v0.h[1] // a0[8..15] >= a3[8..15] ? 5*(a0[8..15]-a3[8..15]) : 0
1325 ushr v3.8h, v3.8h, #3 // a0[8..15] >= a3[8..15] ? (5*(a0[8..15]-a3[8..15]))>>3 : 0
1330 cmhs v2.8h, v3.8h, v26.8h
1333 bsl v2.16b, v26.16b, v3.16b // FFMIN(d[8..15], clip[8..15])
1348 sqxtun v3.8b, v18.8h
1360 st2 {v2.b, v3.b}[0], [x4], x1
1361 st2 {v2.b, v3.b}[1], [x4], x1
1362 st2 {v2.b, v3.b}[2], [x4], x1
1363 st2 {v2.b, v3.b}[3], [x4]
1365 st2 {v2.b, v3.b}[4], [x6], x1
1366 st2 {v2.b, v3.b}[5], [x6], x1
1367 st2 {v2.b, v3.b}[6], [x6], x1
1368 st2 {v2.b, v3.b}[7], [x6]
1424 1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
1425 ext v25.16b, v3.16b, v4.16b, #1
1426 ext v26.16b, v3.16b, v4.16b, #2
1427 ext v27.16b, v3.16b, v4.16b, #3
1431 bic v24.16b, v3.16b, v20.16b
1494 st1 {v3.16b, v4.16b}, [x2], #32
1495 3: mov v3.16b, v2.16b
1505 ext v25.16b, v3.16b, v4.16b, #1
1507 ext v26.16b, v3.16b, v4.16b, #2
1508 ext v27.16b, v3.16b, v4.16b, #3
1513 bic v24.16b, v3.16b, v20.16b