;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;* Copyright (c) 2017 Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_15
; 16-byte loads of pb_zzzzzzzz77777777 intentionally run into pb_7 right below,
; giving the 8 x -1, 8 x 7 shuffle mask used by the unaligned left-pred loop.
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_ef: times 8 db 14,15
pb_67: times 8 db 6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text

;------------------------------------------------------------------------------
; void ff_add_median_pred(uint8_t *dst, const uint8_t *top,
;                         const uint8_t *diff, int w,
;                         int *left, int *left_top)
;------------------------------------------------------------------------------
INIT_XMM sse2
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]
    mova    m2, m0
    movd    m4, [left_topq]
    LSHIFT  m2, 1
    mova    m1, m0
    por     m4, m2
    movd    m3, [leftq]
    psubb   m0, m4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1
    por     m4, m1
    mova    m1, m0 ; t
    psubb   m0, m4 ; t-tl
.skip:
    movu    m2, [diffq+wq]
%assign i 0
%rep mmsize
    mova    m4, m0
    paddb   m4, m3 ; t-tl+l
    mova    m5, m3
    pmaxub  m3, m1
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5 ; median
    paddb   m3, m2 ; +residual
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1
%else
    mova    m6, m3
    RSHIFT  m7, 1
    LSHIFT  m6, mmsize-1
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7
    add     wq, mmsize
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET


%macro ADD_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    pshufb  xm0, xm5
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    psllw   m2, m1, 8
    paddb   m1, m2
    pshufb  m2, m1, m3
    paddb   m1, m2
    pshufb  m2, m1, m4
    paddb   m1, m2
%if mmsize >= 16
    pshufb  m2, m1, m6
    paddb   m1, m2
%endif
    paddb   xm0, xm1
%if %1
    mova    [dstq+wq], xm0
%else
    movq    [dstq+wq], xm0
    movhps  [dstq+wq+8], xm0
%endif

%if mmsize == 32
    vextracti128 xm2, m1, 1 ; get second lane of the ymm
    pshufb  xm0, xm5        ; set all vals to the last val of the first lane
    paddb   xm0, xm2
; store val
%if %1
    mova    [dstq+wq+16], xm0
%else
    movq    [dstq+wq+16], xm0
    movhps  [dstq+wq+16+8], xm0
%endif
%endif
    add     wq, mmsize
    jl %%.loop
%if mmsize == 32
    movzx   eax, byte [dstq - 1]
%else
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
%endif
    RET
%endmacro

;------------------------------------------------------------------------------
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
;------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_LEFT_LOOP 1, 1

%macro ADD_LEFT_PRED_UNALIGNED 0
cglobal add_left_pred_unaligned, 3,3,7, dst, src, w, left
    mova    xm5, [pb_15]
    VBROADCASTI128 m6, [pb_zzzzzzzz77777777]
    VBROADCASTI128 m4, [pb_zzzz3333zzzzbbbb]
    VBROADCASTI128 m3, [pb_zz11zz55zz99zzdd]
    movd    xm0, leftm
    pslldq  xm0, 15
    test    srcq, mmsize - 1
    jnz .src_unaligned
    test    dstq, mmsize - 1
    jnz .dst_unaligned
    ADD_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_LEFT_LOOP 0, 0
%endmacro

INIT_XMM ssse3
ADD_LEFT_PRED_UNALIGNED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_LEFT_PRED_UNALIGNED
%endif

;------------------------------------------------------------------------------
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w);
;------------------------------------------------------------------------------
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize
    jz .2
    add     dstq, sizeq
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl .1
.2:
    and     wq, 2*mmsize-1
    jz .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]
    add     [dstq + wq], sizeb
    inc     wq
    jl .3
.end:
    REP_RET
%endmacro

INIT_XMM sse2
ADD_BYTES

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_BYTES
%endif

%macro ADD_HFYU_LEFT_LOOP_INT16 2 ; %1 = dst alignment (a/u), %2 = src alignment (a/u)
    add     wd, wd
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    mova    m2, m1
    pslld   m1, 16
    paddw   m1, m2
    mova    m2, m1

    pshufb  m1, m3
    paddw   m1, m2
    pshufb  m0, m5
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4
    paddw   m1, m2
%endif
    paddw   m0, m1
    pand    m0, m7
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

;---------------------------------------------------------------------------------------------
; int add_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
;---------------------------------------------------------------------------------------------
INIT_MMX ssse3
cglobal add_left_pred_int16, 4,4,8, dst, src, mask, w, left
.skip_prologue:
    mova    m5, [pb_67]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    psllq   m0, 48
    movd    m7, maskm
    SPLATW  m7, m7
    ADD_HFYU_LEFT_LOOP_INT16 a, a

INIT_XMM ssse3
cglobal add_left_pred_int16_unaligned, 4,4,8, dst, src, mask, w, left
    mova    m5, [pb_ef]
    mova    m4, [pb_zzzzzzzz67676767]
    mova    m3, [pb_zzzz2323zzzzabab]
    movd    m0, leftm
    pslldq  m0, 14
    movd    m7, maskm
    SPLATW  m7, m7
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP_INT16 a, a
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, a
.src_unaligned:
    ADD_HFYU_LEFT_LOOP_INT16 u, u


;---------------------------------------------------------------------------------------------
; void add_gradient_pred(uint8_t *src, const ptrdiff_t stride, const ptrdiff_t width)
; (a scalar sketch of this recurrence follows at the end of the file)
;---------------------------------------------------------------------------------------------
%macro ADD_GRADIENT_PRED 0
cglobal add_gradient_pred, 3,4,5, src, stride, width, tmp
    mova    xm0, [pb_15]

; load src[-1] and broadcast it to every byte of xm1
    movd    xm1, [srcq-1]
%if cpuflag(avx2)
    vpbroadcastb xm1, xm1
%else
    pxor    xm2, xm2
    pshufb  xm1, xm2
%endif

    add     srcq, widthq
    neg     widthq
    neg     strideq

.loop:
    lea     tmpq, [srcq + strideq]
    mova    m2, [tmpq + widthq]     ; A = src[x - stride]
    movu    m3, [tmpq + widthq - 1] ; B = src[x - (stride + 1)]
    mova    m4, [srcq + widthq]     ; current val (src[x])

    psubb   m2, m3 ; A - B

; prefix sum of A - B
    pslldq  m3, m2, 1
    paddb   m2, m3
    pslldq  m3, m2, 2
    paddb   m2, m3
    pslldq  m3, m2, 4
    paddb   m2, m3
    pslldq  m3, m2, 8
    paddb   m2, m3

; prefix sum of the current vals
    pslldq  m3, m4, 1
    paddb   m4, m3
    pslldq  m3, m4, 2
    paddb   m4, m3
    pslldq  m3, m4, 4
    paddb   m4, m3
    pslldq  m3, m4, 8
    paddb   m4, m3

; last sum
    paddb   m2, m4 ; current + (A - B)

    paddb   xm1, xm2 ; += C
    mova    [srcq + widthq], xm1 ; store

    pshufb  xm1, xm0 ; broadcast the last val to all bytes of xm1

%if mmsize == 32
    vextracti128 xm2, m2, 1 ; get second lane of the ymm
    paddb   xm1, xm2 ; += C

    mova    [srcq + widthq + 16], xm1 ; store
    pshufb  xm1, xm0 ; broadcast the last val to all bytes of xm1
%endif

    add     widthq, mmsize
    jl .loop
    RET

%endmacro

INIT_XMM ssse3
ADD_GRADIENT_PRED

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
ADD_GRADIENT_PRED
%endif
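
;---------------------------------------------------------------------------------------------
; Reference only: a scalar C sketch of the gradient recurrence that ADD_GRADIENT_PRED
; vectorizes above, reconstructed from the A/B/C comments in its loop (an illustration,
; not the C fallback copied verbatim; byte arithmetic wraps mod 256 exactly as paddb/psubb do):
;
;     for (ptrdiff_t x = 0; x < width; x++) {
;         int A = src[x - stride];          /* top                            */
;         int B = src[x - (stride + 1)];    /* top-left                       */
;         int C = src[x - 1];               /* left, already updated in place */
;         src[x] = (A - B + C + src[x]) & 0xff;
;     }
;---------------------------------------------------------------------------------------------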