/*
 * Copyright (c) 2017 Google Inc.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/aarch64/asm.S"
#include "neon.S"


// The input to and output from this macro are in the registers v16-v31,
// and v0-v7 are used as scratch registers.
// p7 = v16 .. p3 = v20, p0 = v23, q0 = v24, q3 = v27, q7 = v31
// Depending on the width of the loop filter, we either use v16-v19
// and v28-v31 as temp registers, or v8-v15.
.macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
        dup             v0.8h, w2  // E
        dup             v2.8h, w3  // I
        dup             v3.8h, w4  // H

        uabd            v4.8h, v20.8h, v21.8h  // abs(p3 - p2)
        uabd            v5.8h, v21.8h, v22.8h  // abs(p2 - p1)
        uabd            v6.8h, v22.8h, v23.8h  // abs(p1 - p0)
        uabd            v7.8h, v24.8h, v25.8h  // abs(q0 - q1)
        uabd            \tmp1\().8h, v25.8h, v26.8h  // abs(q1 - q2)
        uabd            \tmp2\().8h, v26.8h, v27.8h  // abs(q2 - q3)
        umax            v4.8h, v4.8h, v5.8h
        umax            v5.8h, v6.8h, v7.8h
        umax            \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
        uabd            v6.8h, v23.8h, v24.8h  // abs(p0 - q0)
        umax            v4.8h, v4.8h, v5.8h
        add             v6.8h, v6.8h, v6.8h  // abs(p0 - q0) * 2
        uabd            v5.8h, v22.8h, v25.8h  // abs(p1 - q1)
        umax            v4.8h, v4.8h, \tmp1\().8h  // max(abs(p3 - p2), ..., abs(q2 - q3))
        ushr            v5.8h, v5.8h, #1
        cmhs            v4.8h, v2.8h, v4.8h  // max(abs()) <= I
        add             v6.8h, v6.8h, v5.8h  // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
        cmhs            v6.8h, v0.8h, v6.8h
        and             v4.16b, v4.16b, v6.16b  // fm

        // If no pixels need filtering, just exit as soon as possible
        mov             x11, v4.d[0]
        mov             x12, v4.d[1]
        adds            x11, x11, x12
        b.ne            1f
        ret             x10
1:

.if \wd >= 8
        dup             v0.8h, w5

        uabd            v6.8h, v20.8h, v23.8h  // abs(p3 - p0)
        uabd            v2.8h, v21.8h, v23.8h  // abs(p2 - p0)
        uabd            v1.8h, v22.8h, v23.8h  // abs(p1 - p0)
        uabd            \tmp1\().8h, v25.8h, v24.8h  // abs(q1 - q0)
        uabd            \tmp2\().8h, v26.8h, v24.8h  // abs(q2 - q0)
        uabd            \tmp3\().8h, v27.8h, v24.8h  // abs(q3 - q0)
        umax            v6.8h, v6.8h, v2.8h
        umax            v1.8h, v1.8h, \tmp1\().8h
        umax            \tmp2\().8h, \tmp2\().8h, \tmp3\().8h
.if \wd == 16
        uabd            v7.8h, v16.8h, v23.8h  // abs(p7 - p0)
        umax            v6.8h, v6.8h, v1.8h
        uabd            v2.8h, v17.8h, v23.8h  // abs(p6 - p0)
        umax            v6.8h, v6.8h, \tmp2\().8h
        uabd            v1.8h, v18.8h, v23.8h  // abs(p5 - p0)
        cmhs            v6.8h, v0.8h, v6.8h  // flat8in
        uabd            v8.8h, v19.8h, v23.8h  // abs(p4 - p0)
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
        uabd            v9.8h, v28.8h, v24.8h  // abs(q4 - q0)
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
        uabd            v10.8h, v29.8h, v24.8h  // abs(q5 - q0)
        uabd            v11.8h, v30.8h, v24.8h  // abs(q6 - q0)
        uabd            v12.8h, v31.8h, v24.8h  // abs(q7 - q0)

        umax            v7.8h, v7.8h, v2.8h
        umax            v1.8h, v1.8h, v8.8h
        umax            v9.8h, v9.8h, v10.8h
        umax            v11.8h, v11.8h, v12.8h
        // The rest of the calculation of flat8out is interleaved below
.else
        // The rest of the calculation of flat8in is interleaved below
.endif
.endif

        // Calculate the normal inner loop filter for 2 or 4 pixels
        uabd            v5.8h, v22.8h, v23.8h  // abs(p1 - p0)
.if \wd == 16
        umax            v7.8h, v7.8h, v1.8h
        umax            v9.8h, v9.8h, v11.8h
.elseif \wd == 8
        umax            v6.8h, v6.8h, v1.8h
.endif
        uabd            v1.8h, v25.8h, v24.8h  // abs(q1 - q0)
.if \wd == 16
        umax            v7.8h, v7.8h, v9.8h
.elseif \wd == 8
        umax            v6.8h, v6.8h, \tmp2\().8h
.endif
        dup             \tmp2\().8h, w6  // left shift for saturation
        sub             \tmp1\().8h, v22.8h, v25.8h  // p1 - q1
        neg             \tmp6\().8h, \tmp2\().8h  // negative left shift after saturation
        umax            v5.8h, v5.8h, v1.8h  // max(abs(p1 - p0), abs(q1 - q0))
        sub             \tmp3\().8h, v24.8h, v23.8h  // q0 - p0
        movi            \tmp5\().8h, #3
.if \wd == 8
        cmhs            v6.8h, v0.8h, v6.8h  // flat8in
.endif
        cmhs            v5.8h, v3.8h, v5.8h  // !hev
.if \wd == 8
        and             v6.16b, v6.16b, v4.16b  // flat8in && fm
.endif
        sqshl           \tmp1\().8h, \tmp1\().8h, \tmp2\().8h
.if \wd == 16
        cmhs            v7.8h, v0.8h, v7.8h  // flat8out
.elseif \wd == 8
        bic             v4.16b, v4.16b, v6.16b  // fm && !flat8in
.endif
        and             v5.16b, v5.16b, v4.16b  // !hev && fm && !flat8in
.if \wd == 16
        and             v7.16b, v7.16b, v6.16b  // flat8out && flat8in && fm
.endif
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h  // av_clip_int2p(p1 - q1, BIT_DEPTH - 1)

        mul             \tmp3\().8h, \tmp3\().8h, \tmp5\().8h  // 3 * (q0 - p0)
        bic             \tmp1\().16b, \tmp1\().16b, v5.16b  // if (!hev) av_clip_int8 = 0
        movi            v2.8h, #4
        add             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h  // 3 * (q0 - p0) [+ av_clip_int8(p1 - q1)]
        movi            v3.8h, #3
        sqshl           \tmp1\().8h, \tmp3\().8h, \tmp2\().8h
        movi            \tmp5\().8h, #0
        sshl            \tmp1\().8h, \tmp1\().8h, \tmp6\().8h  // av_clip_int2p(3 * (q0 - p0) [+ av_clip_int2p(p1 - q1)], BIT_DEPTH - 1) = f
        dup             \tmp6\().8h, w7  // max pixel value
.if \wd == 16
        bic             v6.16b, v6.16b, v7.16b  // fm && flat8in && !flat8out
.endif

        ushr            \tmp2\().8h, \tmp6\().8h, #1  // (1 << (BIT_DEPTH - 1)) - 1

        add             \tmp3\().8h, \tmp1\().8h, v2.8h  // f + 4
        add             \tmp4\().8h, \tmp1\().8h, v3.8h  // f + 3
        smin            \tmp3\().8h, \tmp3\().8h, \tmp2\().8h  // FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1)
        smin            \tmp4\().8h, \tmp4\().8h, \tmp2\().8h  // FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1)
        sshr            \tmp3\().8h, \tmp3\().8h, #3  // f1
        sshr            \tmp4\().8h, \tmp4\().8h, #3  // f2

        add             v0.8h, v23.8h, \tmp4\().8h  // p0 + f2
        sub             v2.8h, v24.8h, \tmp3\().8h  // q0 - f1
        smin            v0.8h, v0.8h, \tmp6\().8h
        smin            v2.8h, v2.8h, \tmp6\().8h
        srshr           \tmp3\().8h, \tmp3\().8h, #1  // f = (f1 + 1) >> 1
        smax            v0.8h, v0.8h, \tmp5\().8h  // out p0
        smax            v2.8h, v2.8h, \tmp5\().8h  // out q0
        bit             v23.16b, v0.16b, v4.16b  // if (fm && !flat8in)
        bit             v24.16b, v2.16b, v4.16b

        add             v0.8h, v22.8h, \tmp3\().8h  // p1 + f
        sub             v2.8h, v25.8h, \tmp3\().8h  // q1 - f
.if \wd >= 8
        mov             x11, v6.d[0]
.endif
        smin            v0.8h, v0.8h, \tmp6\().8h
        smin            v2.8h, v2.8h, \tmp6\().8h
.if \wd >= 8
        mov             x12, v6.d[1]
.endif
        smax            v0.8h, v0.8h, \tmp5\().8h  // out p1
        smax            v2.8h, v2.8h, \tmp5\().8h  // out q1
.if \wd >= 8
        adds            x11, x11, x12
.endif
        bit             v22.16b, v0.16b, v5.16b  // if (!hev && fm && !flat8in)
        bit             v25.16b, v2.16b, v5.16b

        // If no pixels need flat8in, jump to flat8out
        // (or to a writeout of the inner 4 pixels, for wd=8)
.if \wd >= 8
.if \wd == 16
        b.eq            6f
.else
        b.ne            1f
        ret             x13
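        // Reference for the flat8in (7-tap) filter computed below, applied
        // where flat8in && fm; each output is a rounded 7-pixel sum kept as a
        // sliding window, e.g.
        //   out p2 = (3*p3 + 2*p2 + p1 + p0 + q0      + 4) >> 3
        //   out p1 = (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
        // and symmetrically down to out q2.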
1:
.endif

        // flat8in
        add             \tmp1\().8h, v20.8h, v21.8h
        add             \tmp3\().8h, v22.8h, v25.8h
        add             \tmp5\().8h, v20.8h, v22.8h
        add             \tmp7\().8h, v23.8h, v26.8h
        add             v0.8h, \tmp1\().8h, \tmp1\().8h
        add             v0.8h, v0.8h, v23.8h
        add             v0.8h, v0.8h, v24.8h
        add             v0.8h, v0.8h, \tmp5\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        urshr           v2.8h, v0.8h, #3  // out p2

        add             v0.8h, v0.8h, \tmp3\().8h
        add             \tmp1\().8h, v20.8h, v23.8h
        add             \tmp3\().8h, v24.8h, v27.8h
        urshr           v3.8h, v0.8h, #3  // out p1

        add             v0.8h, v0.8h, \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        add             \tmp5\().8h, v21.8h, v24.8h
        add             \tmp7\().8h, v25.8h, v27.8h
        urshr           v4.8h, v0.8h, #3  // out p0

        add             v0.8h, v0.8h, \tmp3\().8h
        sub             \tmp7\().8h, \tmp7\().8h, \tmp5\().8h
        add             \tmp1\().8h, v22.8h, v25.8h
        add             \tmp3\().8h, v26.8h, v27.8h
        urshr           v5.8h, v0.8h, #3  // out q0

        add             v0.8h, v0.8h, \tmp7\().8h
        sub             \tmp3\().8h, \tmp3\().8h, \tmp1\().8h
        urshr           \tmp5\().8h, v0.8h, #3  // out q1

        add             v0.8h, v0.8h, \tmp3\().8h
        // The output here is written back into the input registers. This doesn't
        // matter for the flat8out part below, since we only update those pixels
        // which won't be touched below.
        bit             v21.16b, v2.16b, v6.16b
        bit             v22.16b, v3.16b, v6.16b
        bit             v23.16b, v4.16b, v6.16b
        urshr           \tmp6\().8h, v0.8h, #3  // out q2
        bit             v24.16b, v5.16b, v6.16b
        bit             v25.16b, \tmp5\().16b, v6.16b
        bit             v26.16b, \tmp6\().16b, v6.16b
.endif
.if \wd == 16
6:
        orr             v2.16b, v6.16b, v7.16b
        mov             x11, v2.d[0]
        mov             x12, v2.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels needed flat8in or flat8out, jump to a
        // writeout of the inner 4 pixels
        ret             x14
1:

        mov             x11, v7.d[0]
        mov             x12, v7.d[1]
        adds            x11, x11, x12
        b.ne            1f
        // If no pixels need flat8out, jump to a writeout of the inner 6 pixels
        ret             x15

1:
        // flat8out
        // This writes all outputs into v2-v17 (skipping v7 and v16).
        // If this part is skipped, the output is read from v21-v26 (which is the
        // input to this section).
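        // Reference for the flat8out (15-tap) filter: the first output is
        //   out p6 = (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4
        // and each following output updates the running sum in v0 by
        // subtracting one old pair and adding the next pair to the right.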
        shl             v0.8h, v16.8h, #3  // 8 * v16
        sub             v0.8h, v0.8h, v16.8h  // 7 * v16
        add             v0.8h, v0.8h, v17.8h
        add             v8.8h, v17.8h, v18.8h
        add             v10.8h, v19.8h, v20.8h
        add             v0.8h, v0.8h, v8.8h
        add             v8.8h, v16.8h, v17.8h
        add             v12.8h, v21.8h, v22.8h
        add             v0.8h, v0.8h, v10.8h
        add             v10.8h, v18.8h, v25.8h
        add             v14.8h, v23.8h, v24.8h
        sub             v10.8h, v10.8h, v8.8h
        add             v0.8h, v0.8h, v12.8h
        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v18.8h
        add             v14.8h, v19.8h, v26.8h
        urshr           v2.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v8.8h, v16.8h, v19.8h
        add             v10.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v2.16b, v17.16b, v7.16b
        urshr           v3.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v20.8h
        add             v14.8h, v21.8h, v28.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v3.16b, v18.16b, v7.16b
        urshr           v4.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v8.8h, v16.8h, v21.8h
        add             v10.8h, v22.8h, v29.8h
        sub             v14.8h, v14.8h, v12.8h
        bif             v4.16b, v19.16b, v7.16b
        urshr           v5.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v12.8h, v16.8h, v22.8h
        add             v14.8h, v23.8h, v30.8h
        sub             v10.8h, v10.8h, v8.8h
        bif             v5.16b, v20.16b, v7.16b
        urshr           v6.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        add             v10.8h, v16.8h, v23.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v24.8h, v31.8h
        bif             v6.16b, v21.16b, v7.16b
        urshr           v8.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        sub             v10.8h, v12.8h, v10.8h
        add             v12.8h, v17.8h, v24.8h
        add             v14.8h, v25.8h, v31.8h
        bif             v8.16b, v22.16b, v7.16b
        urshr           v9.8h, v0.8h, #4

        add             v0.8h, v0.8h, v10.8h
        sub             v14.8h, v14.8h, v12.8h
        add             v12.8h, v26.8h, v31.8h
        bif             v9.16b, v23.16b, v7.16b
        urshr           v10.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v14.8h, v18.8h, v25.8h
        add             v18.8h, v19.8h, v26.8h
        sub             v12.8h, v12.8h, v14.8h
        add             v14.8h, v27.8h, v31.8h
        bif             v10.16b, v24.16b, v7.16b
        urshr           v11.8h, v0.8h, #4

        add             v0.8h, v0.8h, v12.8h
        add             v12.8h, v20.8h, v27.8h
        sub             v14.8h, v14.8h, v18.8h
        add             v18.8h, v28.8h, v31.8h
        bif             v11.16b, v25.16b, v7.16b
        sub             v18.8h, v18.8h, v12.8h
        urshr           v12.8h, v0.8h, #4

        add             v0.8h, v0.8h, v14.8h
        add             v14.8h, v21.8h, v28.8h
        add             v20.8h, v29.8h, v31.8h
        bif             v12.16b, v26.16b, v7.16b
        urshr           v13.8h, v0.8h, #4

        add             v0.8h, v0.8h, v18.8h
        sub             v20.8h, v20.8h, v14.8h
        add             v18.8h, v22.8h, v29.8h
        add             v22.8h, v30.8h, v31.8h
        bif             v13.16b, v27.16b, v7.16b
        urshr           v14.8h, v0.8h, #4

        add             v0.8h, v0.8h, v20.8h
        sub             v22.8h, v22.8h, v18.8h
        bif             v14.16b, v28.16b, v7.16b
        urshr           v15.8h, v0.8h, #4

        add             v0.8h, v0.8h, v22.8h
        bif             v15.16b, v29.16b, v7.16b
        urshr           v17.8h, v0.8h, #4
        bif             v17.16b, v30.16b, v7.16b
.endif
.endm

// For wd <= 8, we use v16-v19 and v28-v31 for temp registers,
// while we need those for inputs/outputs in wd=16 and use v8-v15
// for temp registers there instead.
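
// Rough C equivalent of the normal (non-flat) filter in the loop_filter macro
// above, where clip() clips to [-(1 << (BIT_DEPTH - 1)), (1 << (BIT_DEPTH - 1)) - 1]:
//   f  = clip(3 * (q0 - p0) + (hev ? clip(p1 - q1) : 0));
//   f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
//   f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
//   p0 += f2;  q0 -= f1;
//   if (!hev) { p1 += (f1 + 1) >> 1;  q1 -= (f1 + 1) >> 1; }
// applied where fm && !flat8in (p1/q1 additionally only where !hev), with all
// outputs clamped to [0, (1 << BIT_DEPTH) - 1].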
function vp9_loop_filter_4
        loop_filter     4, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_8
        loop_filter     8, v16, v17, v18, v19, v28, v29, v30, v31
        ret
endfunc

function vp9_loop_filter_16
        loop_filter     16, v8, v9, v10, v11, v12, v13, v14, v15
        ret
endfunc

.macro loop_filter_4
        bl              vp9_loop_filter_4
.endm

.macro loop_filter_8
        // calculate alternative 'return' targets
        adr             x13, 6f
        bl              vp9_loop_filter_8
.endm

.macro loop_filter_16
        // calculate alternative 'return' targets
        adr             x14, 7f
        adr             x15, 8f
        bl              vp9_loop_filter_16
.endm


// The public functions in this file have the following signature:
// void loop_filter(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr);

.macro bpp_frontend func, bpp, push
function ff_\func\()_\bpp\()_neon, export=1
.if \push
        mov             x16, x30
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
.endif
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
.if \push
        bl              \func\()_16_neon
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
        ret             x16
.else
        b               \func\()_16_neon
.endif
endfunc
.endm

.macro bpp_frontends func, push=0
        bpp_frontend    \func, 10, \push
        bpp_frontend    \func, 12, \push
.endm

.macro bpp_frontend_rep func, suffix, int_suffix, dir, bpp, push
function ff_\func\()_\suffix\()_\bpp\()_neon, export=1
        mov             x16, x30
.if \push
        stp             d14, d15, [sp, #-0x10]!
        stp             d12, d13, [sp, #-0x10]!
        stp             d10, d11, [sp, #-0x10]!
        stp             d8, d9, [sp, #-0x10]!
.endif
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
        bl              \func\()_\int_suffix\()_16_neon
.ifc \dir,h
        add             x0, x0, x1, lsl #3
.else
        add             x0, x0, #16
.endif
        bl              \func\()_\int_suffix\()_16_neon
.if \push
        ldp             d8, d9, [sp], 0x10
        ldp             d10, d11, [sp], 0x10
        ldp             d12, d13, [sp], 0x10
        ldp             d14, d15, [sp], 0x10
.endif
        ret             x16
endfunc
.endm

.macro bpp_frontends_rep func, suffix, int_suffix, dir, push=0
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 10, \push
        bpp_frontend_rep \func, \suffix, \int_suffix, \dir, 12, \push
.endm

.macro bpp_frontend_mix2 wd1, wd2, dir, bpp
function ff_vp9_loop_filter_\dir\()_\wd1\()\wd2\()_16_\bpp\()_neon, export=1
        mov             x16, x30
        lsr             w8, w2, #8
        lsr             w14, w3, #8
        lsr             w15, w4, #8
        and             w2, w2, #0xff
        and             w3, w3, #0xff
        and             w4, w4, #0xff
        lsl             w2, w2, #\bpp - 8
        lsl             w3, w3, #\bpp - 8
        lsl             w4, w4, #\bpp - 8
        mov             x5, #1 << (\bpp - 8)
        mov             x6, #16 - \bpp
        mov             x7, #((1 << \bpp) - 1)
        bl              vp9_loop_filter_\dir\()_\wd1\()_8_16_neon
.ifc \dir,h
        add             x0, x0, x1, lsl #3
.else
        add             x0, x0, #16
.endif
        lsl             w2, w8, #\bpp - 8
        lsl             w3, w14, #\bpp - 8
        lsl             w4, w15, #\bpp - 8
        bl              vp9_loop_filter_\dir\()_\wd2\()_8_16_neon
        ret             x16
endfunc
.endm

.macro bpp_frontends_mix2 wd1, wd2
        bpp_frontend_mix2 \wd1, \wd2, v, 10
        bpp_frontend_mix2 \wd1, \wd2, v, 12
        bpp_frontend_mix2 \wd1, \wd2, h, 10
        bpp_frontend_mix2 \wd1, \wd2, h, 12
.endm

function vp9_loop_filter_v_4_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #2
        ld1             {v20.8h}, [x9], x1  // p3
        ld1             {v24.8h}, [x0], x1  // q0
        ld1             {v21.8h}, [x9], x1  // p2
        ld1             {v25.8h}, [x0], x1  // q1
        ld1             {v22.8h}, [x9], x1  // p1
        ld1             {v26.8h}, [x0], x1  // q2
        ld1             {v23.8h}, [x9], x1  // p0
        ld1             {v27.8h}, [x0], x1  // q3
        sub             x0, x0, x1, lsl #2
        sub             x9, x9, x1, lsl #1

        loop_filter_4

        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_4_8

function vp9_loop_filter_h_4_8_16_neon
        mov             x10, x30
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_4

        // Move x9 forward by 2 pixels; we don't need to rewrite the
        // outermost 2 pixels since they aren't changed.
        add             x9, x9, #4
        add             x0, x9, x1, lsl #2

        // We will only write back the mid 4 pixels; after the loop filter,
        // these are in v22, v23, v24, v25, ordered as rows (8x4 pixels).
        // We need to transpose them to columns, done with a 4x8 transpose
        // (which in practice is two 4x4 transposes of the two 4x4 halves
        // of the 8x4 pixels; into 4x8 pixels).
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4

        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_4_8

function vp9_loop_filter_v_8_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #2
        ld1             {v20.8h}, [x9], x1  // p3
        ld1             {v24.8h}, [x0], x1  // q0
        ld1             {v21.8h}, [x9], x1  // p2
        ld1             {v25.8h}, [x0], x1  // q1
        ld1             {v22.8h}, [x9], x1  // p1
        ld1             {v26.8h}, [x0], x1  // q2
        ld1             {v23.8h}, [x9], x1  // p0
        ld1             {v27.8h}, [x0], x1  // q3
        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #2
        add             x9, x9, x1

        loop_filter_8

        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1

        ret             x10
6:
        sub             x9, x0, x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_8_8

function vp9_loop_filter_h_8_8_16_neon
        mov             x10, x30
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        ld1             {v20.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1

        sub             x9, x9, x1, lsl #2
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        loop_filter_8

        add             x0, x9, x1, lsl #2

        // Even though only 6 pixels per row have been changed, we write the
        // full 8 pixel registers.
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8

        ret             x10
6:
        // If we didn't need to do the flat8in part, we use the same writeback
        // as in loop_filter_h_4_8.
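        // (Only p1/p0/q0/q1, i.e. the middle 4 columns, were changed by the
        // filter, so step in 2 pixels (4 bytes) and store 4 pixels per row.)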
        add             x9, x9, #4
        add             x0, x9, x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_8_8

bpp_frontends_mix2 4, 4
bpp_frontends_mix2 4, 8
bpp_frontends_mix2 8, 4
bpp_frontends_mix2 8, 8

function vp9_loop_filter_v_16_8_16_neon
        mov             x10, x30
        sub             x9, x0, x1, lsl #3
        ld1             {v16.8h}, [x9], x1  // p7
        ld1             {v24.8h}, [x0], x1  // q0
        ld1             {v17.8h}, [x9], x1  // p6
        ld1             {v25.8h}, [x0], x1  // q1
        ld1             {v18.8h}, [x9], x1  // p5
        ld1             {v26.8h}, [x0], x1  // q2
        ld1             {v19.8h}, [x9], x1  // p4
        ld1             {v27.8h}, [x0], x1  // q3
        ld1             {v20.8h}, [x9], x1  // p3
        ld1             {v28.8h}, [x0], x1  // q4
        ld1             {v21.8h}, [x9], x1  // p2
        ld1             {v29.8h}, [x0], x1  // q5
        ld1             {v22.8h}, [x9], x1  // p1
        ld1             {v30.8h}, [x0], x1  // q6
        ld1             {v23.8h}, [x9], x1  // p0
        ld1             {v31.8h}, [x0], x1  // q7
        sub             x9, x9, x1, lsl #3
        sub             x0, x0, x1, lsl #3
        add             x9, x9, x1

        loop_filter_16

        // If we did the flat8out part, we get the output in
        // v2-v17 (skipping v7 and v16). x9 points to x0 - 7 * stride,
        // store v2-v9 there, and v10-v17 into x0.
        st1             {v2.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v3.8h}, [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v4.8h}, [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v5.8h}, [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v6.8h}, [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v8.8h}, [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v9.8h}, [x9], x1
        st1             {v17.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, x1

        ret             x10
8:
        add             x9, x9, x1, lsl #2
        // If we didn't do the flat8out part, the output is left in the
        // input registers.
        st1             {v21.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        sub             x0, x0, x1
        ret             x10
7:
        sub             x9, x0, x1, lsl #1
        st1             {v22.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        sub             x0, x0, x1, lsl #1
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_v_16_8, push=1
bpp_frontends_rep vp9_loop_filter_v_16, 16, 8, v, push=1

function vp9_loop_filter_h_16_8_16_neon
        mov             x10, x30
        sub             x9, x0, #16
        ld1             {v16.8h}, [x9], x1
        ld1             {v24.8h}, [x0], x1
        ld1             {v17.8h}, [x9], x1
        ld1             {v25.8h}, [x0], x1
        ld1             {v18.8h}, [x9], x1
        ld1             {v26.8h}, [x0], x1
        ld1             {v19.8h}, [x9], x1
        ld1             {v27.8h}, [x0], x1
        ld1             {v20.8h}, [x9], x1
        ld1             {v28.8h}, [x0], x1
        ld1             {v21.8h}, [x9], x1
        ld1             {v29.8h}, [x0], x1
        ld1             {v22.8h}, [x9], x1
        ld1             {v30.8h}, [x0], x1
        ld1             {v23.8h}, [x9], x1
        ld1             {v31.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        sub             x9, x9, x1, lsl #3

        // The 16x8 pixels read above are in two 8x8 blocks; the left
        // half in v16-v23, and the right half in v24-v31. Do two 8x8 transposes
        // of this, to get one column per register.
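        // The two transposes below leave the columns p7-p0 in v16-v23 and
        // q0-q7 in v24-v31, which is the layout the loop_filter macro expects.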
        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v0, v1
        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v0, v1

        loop_filter_16

        transpose_8x8H  v16, v2, v3, v4, v5, v6, v8, v9, v0, v1
        transpose_8x8H  v10, v11, v12, v13, v14, v15, v17, v31, v0, v1

        st1             {v16.8h}, [x9], x1
        st1             {v10.8h}, [x0], x1
        st1             {v2.8h}, [x9], x1
        st1             {v11.8h}, [x0], x1
        st1             {v3.8h}, [x9], x1
        st1             {v12.8h}, [x0], x1
        st1             {v4.8h}, [x9], x1
        st1             {v13.8h}, [x0], x1
        st1             {v5.8h}, [x9], x1
        st1             {v14.8h}, [x0], x1
        st1             {v6.8h}, [x9], x1
        st1             {v15.8h}, [x0], x1
        st1             {v8.8h}, [x9], x1
        st1             {v17.8h}, [x0], x1
        st1             {v9.8h}, [x9], x1
        st1             {v31.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3

        ret             x10
8:
        // The same writeback as in loop_filter_h_8_8
        sub             x9, x0, #8
        add             x0, x9, x1, lsl #2
        transpose_8x8H  v20, v21, v22, v23, v24, v25, v26, v27, v28, v29

        st1             {v20.8h}, [x9], x1
        st1             {v24.8h}, [x0], x1
        st1             {v21.8h}, [x9], x1
        st1             {v25.8h}, [x0], x1
        st1             {v22.8h}, [x9], x1
        st1             {v26.8h}, [x0], x1
        st1             {v23.8h}, [x9], x1
        st1             {v27.8h}, [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #8
        ret             x10
7:
        // The same writeback as in loop_filter_h_4_8
        sub             x9, x0, #4
        add             x0, x9, x1, lsl #2
        transpose_4x8H  v22, v23, v24, v25, v26, v27, v28, v29
        st1             {v22.d}[0], [x9], x1
        st1             {v22.d}[1], [x0], x1
        st1             {v23.d}[0], [x9], x1
        st1             {v23.d}[1], [x0], x1
        st1             {v24.d}[0], [x9], x1
        st1             {v24.d}[1], [x0], x1
        st1             {v25.d}[0], [x9], x1
        st1             {v25.d}[1], [x0], x1
        sub             x0, x0, x1, lsl #3
        add             x0, x0, #4
        ret             x10
endfunc

bpp_frontends vp9_loop_filter_h_16_8, push=1
bpp_frontends_rep vp9_loop_filter_h_16, 16, 8, h, push=1