Lines Matching refs:v16
36 ld1 {v16.16b, v17.16b}, [x0]
55 shl v21.8h, v16.8h, #3 // 16/2 * src[48]
61 mla v3.8h, v16.8h, v0.h[0] // t3/2 = 16/2 * src[16] + 6/2 * src[48]
64 sub v16.8h, v1.8h, v2.8h // t7/2 = t2/2 - t4/2
81 ssra v16.8h, v6.8h, #1 // (t7 + t3) >> 1
88 srshr v5.8h, v16.8h, #2 // (t7 + t3 + 4) >> 3
93 srshr v16.8h, v23.8h, #2 // (t5 - t1 + 4) >> 3
97 trn2 v20.8h, v7.8h, v16.8h
107 trn1 v2.8h, v7.8h, v16.8h
110 shl v16.8h, v20.8h, #4 // 16 * src[24]
120 sub v5.8h, v16.8h, v23.8h // 16 * src[24] - 4 * src[40]
121 ssra v17.8h, v16.8h, #2 // 4 * src[24] + 16 * src[40]
122 sub v16.8h, v21.8h, v22.8h // - 4 * src[8] + 16 * src[56]
133 mla v16.8h, v20.8h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
147 mls v16.8h, v4.8h, v0.h[2] // -t4 = - 4 * src[8] + 9 * src[24] - 15 * src[40] + 16 * src[56]
155 neg v24.8h, v16.8h // +t4
163 srsra v23.8h, v16.8h, #1 // (t8 - t4 + 1) >> 1
171 srshr v16.8h, v23.8h, #6 // (t8 - t4 + 65) >> 7
177 st1 {v16.16b, v17.16b}, [x1], #32
192 ld1 {v16.8b, v17.8b, v18.8b, v19.8b}, [x2]
199 trn2 v3.4h, v16.4h, v18.4h
201 trn1 v16.4h, v16.4h, v18.4h
208 trn1 v7.2s, v1.2s, v16.2s
211 trn2 v1.2s, v1.2s, v16.2s
212 shl v16.4h, v3.4h, #4 // 16 * src[3]
222 ssra v17.4h, v16.4h, #2 // 4 * src[3] + 16 * src[5]
224 sub v16.4h, v16.4h, v27.4h // 16 * src[3] - 4 * src[5]
231 mls v16.4h, v19.4h, v0.h[1] // - 9 * src[1] + 16 * src[3] - 4 * src[5]
239 mls v16.4h, v4.4h, v0.h[2] // -t3 = - 9 * src[1] + 16 * src[3] - 4 * src[5] - 15 * src[7]
253 neg v4.4h, v16.4h // +t3
260 ssra v2.4h, v16.4h, #1 // (t7 - t3) >> 1
334 trn2 v16.8h, v1.8h, v2.8h // 01 11 03 13 41 51 43 53
339 trn2 v17.4s, v16.4s, v2.4s // 03 13 23 33 43 53 63 73
341 trn1 v2.4s, v16.4s, v2.4s // 01 11 21 31 41 51 61 71
342 mul v16.8h, v17.8h, v0.h[4] // 10/2 * src[3]
351 mla v16.8h, v2.8h, v0.h[5] // t3/2 = 22/2 * src[1] + 10/2 * src[3]
355 neg v3.8h, v16.8h // -t3/2
356 ssra v16.8h, v2.8h, #1 // (t1 + t3) >> 1
361 srshr v1.8h, v16.8h, #2 // (t1 + t3 + 4) >> 3
364 srshr v16.8h, v18.8h, #2 // (t2 - t4 + 4) >> 3
366 trn2 v18.8h, v1.8h, v16.8h // 10 11 30 31 50 51 70 71
367 trn1 v1.8h, v1.8h, v16.8h // 00 01 20 21 40 41 60 61
370 trn2 v16.4s, v18.4s, v17.4s // 30 31 32 33 70 71 72 73
374 mov d20, v16.d[1] // 70 71 72 73
375 shl v21.4h, v16.4h, #4 // 16 * src[24]
395 mla v2.4h, v16.4h, v0.h[1] // - 4 * src[8] + 9 * src[24] + 16 * src[56]
397 mla v19.4h, v16.4h, v0.h[2] // 16 * src[8] + 15 * src[24] + 4 * src[56]
400 add v16.4h, v17.4h, v22.4h // t1/2 = 12/2 * src[0] + 12/2 * src[32]
405 add v20.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
407 sub v0.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
414 sub v26.4h, v16.4h, v1.4h // t8/2 = t1/2 - t3/2
415 add v1.4h, v16.4h, v1.4h // t5/2 = t1/2 + t3/2
416 neg v16.4h, v19.4h // -t1
425 srsra v1.4h, v16.4h, #1 // (t5 - t1 + 1) >> 1
477 trn1 v16.2s, v1.2s, v3.2s // 00 10 20 30
482 mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
489 neg v16.4h, v4.4h // -t4/2
492 ssra v16.4h, v1.4h, #1 // (t2 - t4) >> 1
496 srshr v3.4h, v16.4h, #2 // (t2 - t4 + 4) >> 3
503 trn1 v16.2s, v1.2s, v2.2s // 00 01 02 03
508 mul v7.4h, v16.4h, v0.h[2] // 17 * src[0]
559 dup v16.8h, w0
560 uaddw v0.8h, v16.8h, v0.8b
561 uaddw v1.8h, v16.8h, v1.8b
562 uaddw v2.8h, v16.8h, v2.8b
563 uaddw v3.8h, v16.8h, v3.8b
564 uaddw v4.8h, v16.8h, v4.8b
565 uaddw v5.8h, v16.8h, v5.8b
567 uaddw v6.8h, v16.8h, v6.8b
569 uaddw v7.8h, v16.8h, v7.8b
722 ld1 {v16.s}[0], [x0] // P8
735 uxtl v3.8h, v16.8b // P8
752 cmeq v16.4h, v4.4h, #0 // test clip == 0
759 orr v2.8b, v16.8b, v18.8b // test clip == 0 || a0 >= pq
826 cmhs v19.4h, v3.4h, v16.4h // test a1 >= a2
829 bsl v19.8b, v16.8b, v3.8b // a3
867 ushll v16.8h, v1.8b, #1 // 2*P5
878 mls v16.8h, v5.8h, v0.h[1] // 2*P5-5*P6
881 mla v16.8h, v4.8h, v0.h[1] // 2*P5-5*P6+5*P7
886 mls v16.8h, v17.8h, v0.h[0] // 2*P5-5*P6+5*P7-2*P8
892 srshr v16.8h, v16.8h, #3
896 abs v16.8h, v16.8h // a2
899 cmhs v18.8h, v3.8h, v16.8h // test a1 >= a2
902 bsl v18.16b, v16.16b, v3.16b // a3
906 cmhs v16.8h, v18.8h, v20.8h // test a3 >= a0
909 orr v5.16b, v3.16b, v16.16b // test clip == 0 || a0 >= pq || a3 >= a0
946 trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
954 trn1 v6.4h, v16.4h, v2.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
958 trn2 v2.4h, v16.4h, v2.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
961 trn1 v16.4h, v5.4h, v7.4h // P2[4], P2[5], P2[6], P2[7], P6[4]...
965 trn1 v18.2s, v19.2s, v16.2s // P2
967 trn2 v6.2s, v19.2s, v16.2s // P6
968 trn1 v16.2s, v2.2s, v17.2s // P3
976 uxtl v18.8h, v16.8b // P3
979 ushll v5.8h, v16.8b, #1 // 2*P3
981 uxtl v16.8h, v17.8b // P4
986 sub v3.8h, v16.8h, v2.8h // P4-P5
987 mls v5.8h, v16.8h, v0.h[1] // 2*P3-5*P4
991 mls v7.8h, v16.8h, v0.h[0] // 2*P1-5*P2+5*P3-2*P4
1021 mls v16.8h, v0.8h, v3.8h // invert d depending on clip_sign & a0_sign, or zero it if they match, and accumulate into P4
1023 sqxtun v0.8b, v16.8h
1052 ushll v16.8h, v1.8b, #1 // 2*P5[0..7]
1063 mls v16.8h, v21.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]
1075 mla v16.8h, v24.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1092 mls v16.8h, v23.8h, v0.h[0] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]-2*P8[0..7]
1100 srshr v16.8h, v16.8h, #3
1106 abs v16.8h, v16.8h // a2[0..7]
1109 cmhs v5.8h, v17.8h, v16.8h // test a1[0..7] >= a2[0..7]
1113 bsl v5.16b, v16.16b, v17.16b // a3[0..7]
1115 cmhs v16.8h, v3.8h, v19.8h // test a1[8..15] >= a2[8..15]
1118 bsl v16.16b, v19.16b, v3.16b // a3[8..15]
1126 uqsub v17.8h, v19.8h, v16.8h // a0[8..15] >= a3[8..15] ? a0[8..15]-a3[8..15] : 0 (a0 > a3 in all cases where filtering is enabled, so makes more sense to subtract this way round than the opposite and then taking the abs)
1129 cmhs v16.8h, v16.8h, v19.8h // test a3[8..15] >= a0[8..15]
1140 orr v5.16b, v7.16b, v16.16b // test clip[8..15] == 0 || a0[8..15] >= pq || a3[8..15] >= a0[8..15]
1141 orr v16.16b, v20.16b, v17.16b
1148 bic v5.16b, v19.16b, v16.16b // set each d[0..7] to zero if it should not be filtered because clip[0..7] == 0 || a0[0..7] >= pq (a3 > a0 case already zeroed by saturating sub)
1187 trn1 v16.8b, v1.8b, v2.8b // P1[0], P1[1], P3[0]...
1206 trn1 v19.4h, v16.4h, v18.4h // P1[0], P1[1], P1[2], P1[3], P5[0]...
1223 trn2 v16.4h, v16.4h, v18.4h // P3[0], P3[1], P3[2], P3[3], P7[0]...
1238 trn1 v7.2s, v16.2s, v20.2s // P3[0..7]
1240 trn2 v16.2s, v16.2s, v20.2s // P7[0..7]
1254 uxtl v16.8h, v16.8b // P7[0..7]
1266 mla v6.8h, v16.8h, v0.h[1] // 2*P5[0..7]-5*P6[0..7]+5*P7[0..7]
1268 uxtl v16.8h, v19.8b // P5[0..7]
1273 sub v21.8h, v27.8h, v16.8h // P4[0..7]-P5[0..7]
1291 mla v7.8h, v16.8h, v0.h[1] // 2*P3[0..7]-5*P4[0..7]+5*P5[0..7]
1343 mla v16.8h, v2.8h, v6.8h // invert d[0..7] depending on clip_sign[0..7] & a0_sign[0..7], or zero it if they match, and accumulate into P5
1347 sqxtun v1.8b, v16.8h