/*
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config_components.h"

#include "libavutil/aarch64/asm.S"

/* chroma_mc8(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc8 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc8_neon, export=1
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        /* bias = rv40bias[(y >> 1) * 4 + (x >> 1)] */
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        /* bilinear weights: A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy */
        mul             w7,  w4,  w5        // w7  = D = x * y
        lsl             w14, w5,  #3        // w14 = 8 * y
        lsl             w13, w4,  #3        // w13 = 8 * x
        cmp             w7,  #0
        sub             w6,  w14, w7        // w6  = C = (8 - x) * y
        sub             w12, w13, w7        // w12 = B = x * (8 - y)
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64       // w4  = A = (8 - x) * (8 - y)
        b.eq            2f                  // D == 0: x or y is zero

        dup             v0.8B,  w4
        dup             v1.8B,  w12
        ld1             {v4.8B, v5.8B}, [x1], x2
        dup             v2.8B,  w6
        dup             v3.8B,  w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
1:      ld1             {v6.8B, v7.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        ld1             {v4.8B, v5.8B}, [x1], x2
        umlal           v16.8H, v6.8B,  v2.8B
        prfm            pldl1strm, [x1]
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        umlal           v16.8H, v7.8B,  v3.8B
        umull           v17.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        umlal           v17.8H, v7.8B,  v1.8B
        umlal           v17.8H, v4.8B,  v2.8B
        umlal           v17.8H, v5.8B,  v3.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            1b
        ret

2:      adds            w12, w12, w6        // w12 = B + C; at most one is nonzero
        dup             v0.8B,  w4
        b.eq            5f                  // B + C == 0: x == 0 && y == 0
        tst             w6,  w6
        dup             v1.8B,  w12
        b.eq            4f                  // C == 0: y == 0

        /* x == 0: vertical-only filter, weights A and C */
        ld1             {v4.8B}, [x1], x2
3:      ld1             {v6.8B}, [x1], x2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v6.8B,  v1.8B
        ld1             {v4.8B}, [x1], x2
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v4.8B,  v1.8B
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        subs            w3,  w3,  #2
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            3b
        ret

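        /* y == 0: horizontal-only filter, weights A and B */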
4:      ld1             {v4.8B, v5.8B}, [x1], x2
        ld1             {v6.8B, v7.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
        umull           v17.8H, v6.8B,  v0.8B
        umlal           v17.8H, v7.8B,  v1.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            4b
        ret

        /* x == 0 && y == 0: plain copy, A == 64 */
5:      ld1             {v4.8B}, [x1], x2
        ld1             {v5.8B}, [x1], x2
        prfm            pldl1strm, [x1]
        subs            w3,  w3,  #2
        umull           v16.8H, v4.8B,  v0.8B
        umull           v17.8H, v5.8B,  v0.8B
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,h264
        rshrn           v16.8B, v16.8H, #6
        rshrn           v17.8B, v17.8H, #6
  .else
        add             v16.8H, v16.8H, v22.8H
        add             v17.8H, v17.8H, v22.8H
        shrn            v16.8B, v16.8H, #6
        shrn            v17.8B, v17.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.8B}, [x8], x2
        ld1             {v21.8B}, [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
        urhadd          v17.8B, v17.8B, v21.8B
  .endif
        st1             {v16.8B}, [x0], x2
        st1             {v17.8B}, [x0], x2
        b.gt            5b
        ret
endfunc
.endm

/* chroma_mc4(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h, int x, int y) */
.macro  h264_chroma_mc4 type, codec=h264
function ff_\type\()_\codec\()_chroma_mc4_neon, export=1
  .ifc \type,avg
        mov             x8,  x0
  .endif
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
  .ifc \codec,rv40
        movrel          x6,  rv40bias
        lsr             w9,  w5,  #1
        lsr             w10, w4,  #1
        lsl             w9,  w9,  #3
        lsl             w10, w10, #1
        add             w9,  w9,  w10
        add             x6,  x6,  w9, UXTW
        ld1r            {v22.8H}, [x6]
  .endif
  .ifc \codec,vc1
        movi            v22.8H,   #28
  .endif
        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        cmp             w7,  #0
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        b.eq            2f

        /* v0 = {A,A,A,A,B,B,B,B}, v2 = {C,C,C,C,D,D,D,D}:
         * each 64-bit half filters one 4-pixel row */
        dup             v24.8B, w4
        dup             v25.8B, w12
        ld1             {v4.8B}, [x1], x2
        dup             v26.8B, w6
        dup             v27.8B, w7
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v0.2S,  v24.2S, v25.2S
        trn1            v2.2S,  v26.2S, v27.2S
        trn1            v4.2S,  v4.2S,  v5.2S
1:      ld1             {v6.8B}, [x1], x2
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umlal           v18.8H, v6.8B,  v2.8B
        ld1             {v4.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        prfm            pldl1strm, [x1]
        umull           v19.8H, v6.8B,  v0.8B
        umlal           v19.8H, v4.8B,  v2.8B
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H  // fold halves: two output rows
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            1b
        ret

2:      adds            w12, w12, w6
        dup             v30.8B, w4
        b.eq            5f
        tst             w6,  w6
        dup             v31.8B, w12
        trn1            v0.2S,  v30.2S, v31.2S
        trn2            v1.2S,  v30.2S, v31.2S
        b.eq            4f

        /* x == 0: vertical-only filter */
        ext             v1.8B,  v0.8B,  v1.8B,  #4
        ld1             {v4.S}[0], [x1], x2
3:      ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v0.8B
        ld1             {v4.S}[0], [x1], x2
        umull           v19.8H, v4.8B,  v1.8B
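        /* v18 = {row0*A + row1*C, row1*A + row2*C}: two output rows */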
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1, x2]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            3b
        ret

        /* y == 0: horizontal-only filter */
4:      ld1             {v4.8B}, [x1], x2
        ld1             {v6.8B}, [x1], x2
        ext             v5.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v6.8B,  v7.8B,  #1
        trn1            v4.2S,  v4.2S,  v5.2S
        trn1            v6.2S,  v6.2S,  v7.2S
        umull           v18.8H, v4.8B,  v0.8B
        umull           v19.8H, v6.8B,  v0.8B
        subs            w3,  w3,  #2
        trn1            v30.2D, v18.2D, v19.2D
        trn2            v31.2D, v18.2D, v19.2D
        add             v18.8H, v30.8H, v31.8H
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            4b
        ret

        /* x == 0 && y == 0: plain copy */
5:      ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        umull           v18.8H, v4.8B,  v30.8B
        subs            w3,  w3,  #2
        prfm            pldl1strm, [x1]
  .ifc \codec,h264
        rshrn           v16.8B, v18.8H, #6
  .else
        add             v18.8H, v18.8H, v22.8H
        shrn            v16.8B, v18.8H, #6
  .endif
  .ifc \type,avg
        ld1             {v20.S}[0], [x8], x2
        ld1             {v20.S}[1], [x8], x2
        urhadd          v16.8B, v16.8B, v20.8B
  .endif
        prfm            pldl1strm, [x1]
        st1             {v16.S}[0], [x0], x2
        st1             {v16.S}[1], [x0], x2
        b.gt            5b
        ret
endfunc
.endm

.macro  h264_chroma_mc2 type
function ff_\type\()_h264_chroma_mc2_neon, export=1
        prfm            pldl1strm, [x1]
        prfm            pldl1strm, [x1, x2]
        orr             w7,  w4,  w5
        cbz             w7,  2f

        mul             w7,  w4,  w5
        lsl             w14, w5,  #3
        lsl             w13, w4,  #3
        sub             w6,  w14, w7
        sub             w12, w13, w7
        sub             w4,  w7,  w13
        sub             w4,  w4,  w14
        add             w4,  w4,  #64
        dup             v0.8B,  w4
        dup             v2.8B,  w12
        dup             v1.8B,  w6
        dup             v3.8B,  w7
        trn1            v0.4H,  v0.4H,  v2.4H   // v0 = {A,A,B,B,A,A,B,B}
        trn1            v1.4H,  v1.4H,  v3.4H   // v1 = {C,C,D,D,C,C,D,D}
1:
        ld1             {v4.S}[0], [x1], x2
        ld1             {v4.S}[1], [x1], x2
        rev64           v5.2S,  v4.2S
        ld1             {v5.S}[1], [x1]
        ext             v6.8B,  v4.8B,  v5.8B,  #1
        ext             v7.8B,  v5.8B,  v4.8B,  #1
        trn1            v4.4H,  v4.4H,  v6.4H
        trn1            v5.4H,  v5.4H,  v7.4H
        umull           v16.8H, v4.8B,  v0.8B
        umlal           v16.8H, v5.8B,  v1.8B
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[2], [x0]
        sub             x0,  x0,  x2
  .endif
        rev64           v17.4S, v16.4S
        add             v16.8H, v16.8H, v17.8H
        rshrn           v16.8B, v16.8H, #6
  .ifc \type,avg
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[2], [x0], x2
        subs            w3,  w3,  #2
        b.gt            1b
        ret

        /* x == 0 && y == 0: plain copy */
2:
        ld1             {v16.H}[0], [x1], x2
        ld1             {v16.H}[1], [x1], x2
  .ifc \type,avg
        ld1             {v18.H}[0], [x0], x2
        ld1             {v18.H}[1], [x0]
        sub             x0,  x0,  x2
        urhadd          v16.8B, v16.8B, v18.8B
  .endif
        st1             {v16.H}[0], [x0], x2
        st1             {v16.H}[1], [x0], x2
        subs            w3,  w3,  #2
        b.gt            2b
        ret
endfunc
.endm
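
/* Reference model (sketch) of the interpolation implemented by the macros
 * above, for the put case with fractional position x, y in [0, 7] and block
 * width W in {8, 4, 2}:
 *
 *     A = (8 - x) * (8 - y);  B = x * (8 - y);
 *     C = (8 - x) * y;        D = x * y;
 *     for (int i = 0; i < h; i++) {
 *         for (int j = 0; j < W; j++)
 *             dst[j] = (A * src[j]          + B * src[j + 1] +
 *                       C * src[j + stride] + D * src[j + stride + 1] + 32) >> 6;
 *         dst += stride;
 *         src += stride;
 *     }
 *
 * The avg variants additionally average the result with the bytes already in
 * dst (urhadd).  The rv40/vc1 variants replace the +32 rounding term with the
 * bias in v22 (looked up in rv40bias, or the constant 28 for vc1) and use a
 * truncating instead of a rounding narrowing shift.
 */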
h264_chroma_mc8 put
h264_chroma_mc8 avg
h264_chroma_mc4 put
h264_chroma_mc4 avg
h264_chroma_mc2 put
h264_chroma_mc2 avg

#if CONFIG_RV40_DECODER
const   rv40bias
        .short           0, 16, 32, 16
        .short          32, 28, 32, 28
        .short           0, 32, 16, 32
        .short          32, 28, 32, 28
endconst

h264_chroma_mc8 put, rv40
h264_chroma_mc8 avg, rv40
h264_chroma_mc4 put, rv40
h264_chroma_mc4 avg, rv40
#endif

#if CONFIG_VC1DSP
h264_chroma_mc8 put, vc1
h264_chroma_mc8 avg, vc1
h264_chroma_mc4 put, vc1
h264_chroma_mc4 avg, vc1
#endif