;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; Rounding constants added before the right shift.
; TR_4x4 and DEFINE_BIAS build the label name as pd_<1 << (shift - 1)>,
; so one constant has to exist for every shift that is used:
;   shift  7      -> pd_64
;   shift 12 (20 -  8) -> pd_2048
;   shift 10 (20 - 10) -> pd_512
pd_64:   times 4 dd 64
pd_2048: times 4 dd 2048
pd_512:  times 4 dd 512

; 4x4 transform coeffs
cextern pw_64
pw_64_m64: times 4 dw 64, -64
pw_83_36:  times 4 dw 83, 36
pw_36_m83: times 4 dw 36, -83

; 8x8 transform coeffs
pw_89_75: times 4 dw 89, 75
pw_50_18: times 4 dw 50, 18

pw_75_m18:  times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50

pw_50_m89: times 4 dw 50, -89
pw_18_75:  times 4 dw 18, 75

pw_18_m50: times 4 dw 18, -50
pw_75_m89: times 4 dw 75, -89

; 16x16 transformation coeffs
; Every "times 4 dw a, b" row is 16 bytes; rows come in groups of four
; (64 bytes). TR_16x4 selects group k with trans_coeffs16 + k * 64 and
; E16_O16 reads the four rows of that group at +0, +16, +32 and +48.
trans_coeffs16: times 4 dw 90, 87
                times 4 dw 80, 70
                times 4 dw 57, 43
                times 4 dw 25, 9

                times 4 dw 87, 57
                times 4 dw 9, -43
                times 4 dw -80, -90
                times 4 dw -70, -25

                times 4 dw 80, 9
                times 4 dw -70, -87
                times 4 dw -25, 57
                times 4 dw 90, 43

                times 4 dw 70, -43
                times 4 dw -87, 9
                times 4 dw 90, 25
                times 4 dw -80, -57

                times 4 dw 57, -80
                times 4 dw -25, 90
                times 4 dw -9, -87
                times 4 dw 43, 70

                times 4 dw 43, -90
                times 4 dw 57, 25
                times 4 dw -87, 70
                times 4 dw 9, -80

                times 4 dw 25, -70
                times 4 dw 90, -80
                times 4 dw 43, 9
                times 4 dw -57, 87

                times 4 dw 9, -25
                times 4 dw 43, -57
                times 4 dw 70, -80
                times 4 dw 87, -90

; 32x32 transform coeffs
; Groups of eight 16-byte rows (128 bytes per group). TR_32x4 starts at
; trans_coeff32 + 15 * 128 and steps down by 128; E32_O32 reads the eight
; rows of the current group at +0 .. +7 * 16.
trans_coeff32: times 8 dw 90
               times 4 dw 88, 85
               times 4 dw 82, 78
               times 4 dw 73, 67
               times 4 dw 61, 54
               times 4 dw 46, 38
               times 4 dw 31, 22
               times 4 dw 13, 4

               times 4 dw 90, 82
               times 4 dw 67, 46
               times 4 dw 22, -4
               times 4 dw -31, -54
               times 4 dw -73, -85
               times 4 dw -90, -88
               times 4 dw -78, -61
               times 4 dw -38, -13

               times 4 dw 88, 67
               times 4 dw 31, -13
               times 4 dw -54, -82
               times 4 dw -90, -78
               times 4 dw -46, -4
               times 4 dw 38, 73
               times 4 dw 90, 85
               times 4 dw 61, 22

               times 4 dw 85, 46
               times 4 dw -13, -67
               times 4 dw -90, -73
               times 4 dw -22, 38
               times 4 dw 82, 88
               times 4 dw 54, -4
               times 4 dw -61, -90
               times 4 dw -78, -31

               times 4 dw 82, 22
               times 4 dw -54, -90
               times 4 dw -61, 13
               times 4 dw 78, 85
               times 4 dw 31, -46
               times 4 dw -90, -67
               times 4 dw 4, 73
               times 4 dw 88, 38

               times 4 dw 78, -4
               times 4 dw -82, -73
               times 4 dw 13, 85
               times 4 dw 67, -22
               times 4 dw -88, -61
               times 4 dw 31, 90
               times 4 dw 54, -38
               times 4 dw -90, -46

               times 4 dw 73, -31
               times 4 dw -90, -22
               times 4 dw 78, 67
               times 4 dw -38, -90
               times 4 dw -13, 82
               times 4 dw 61, -46
               times 4 dw -88, -4
               times 4 dw 85, 54

               times 4 dw 67, -54
               times 4 dw -78, 38
               times 4 dw 85, -22
               times 4 dw -90, 4
               times 4 dw 90, 13
               times 4 dw -88, -31
               times 4 dw 82, 46
               times 4 dw -73, -61

               times 4 dw 61, -73
               times 4 dw -46, 82
               times 4 dw 31, -88
               times 4 dw -13, 90
               times 4 dw -4, -90
               times 4 dw 22, 85
               times 4 dw -38, -78
               times 4 dw 54, 67

               times 4 dw 54, -85
               times 4 dw -4, 88
               times 4 dw -46, -61
               times 4 dw 82, 13
               times 4 dw -90, 38
               times 4 dw 67, -78
               times 4 dw -22, 90
               times 4 dw -31, -73

               times 4 dw 46, -90
               times 4 dw 38, 54
               times 4 dw -90, 31
               times 4 dw 61, -88
               times 4 dw 22, 67
               times 4 dw -85, 13
               times 4 dw 73, -82
               times 4 dw 4, 78

               times 4 dw 38, -88
               times 4 dw 73, -4
               times 4 dw -67, 90
               times 4 dw -46, -31
               times 4 dw 85, -78
               times 4 dw 13, 61
               times 4 dw -90, 54
               times 4 dw 22, -82

               times 4 dw 31, -78
               times 4 dw 90, -61
               times 4 dw 4, 54
               times 4 dw -88, 82
               times 4 dw -38, -22
               times 4 dw 73, -90
               times 4 dw 67, -13
               times 4 dw -46, 85

               times 4 dw 22, -61
               times 4 dw 85, -90
               times 4 dw 73, -38
               times 4 dw -4, 46
               times 4 dw -78, 90
               times 4 dw -82, 54
               times 4 dw -13, -31
               times 4 dw 67, -88

               times 4 dw 13, -38
               times 4 dw 61, -78
               times 4 dw 88, -90
               times 4 dw 85, -73
               times 4 dw 54, -31
               times 4 dw 4, 22
               times 4 dw -46, 67
               times 4 dw -82, 90

               times 4 dw 4, -13
               times 4 dw 22, -31
               times 4 dw 38, -46
               times 4 dw 54, -61
               times 4 dw 67, -73
               times 4 dw 78, -82
               times 4 dw 85, -88
               times 4 dw 90, -90

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
;
; Reads the DC coefficient coeffs[0], computes
;     (dc + (1 << (14 - bitdepth)) + 1) >> (15 - bitdepth)
; replicates the 16-bit result with SPLATW and overwrites the block with it.
; Each loop iteration writes 8 * mmsize bytes, so %2 * 8 * mmsize bytes are
; written in total.
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %3)) + 1
    sar               tmpd, (15 - %3)
    movd               xm0, tmpd
    SPLATW              m0, xm0
    ; tmp is dead from here on; reuse the second GPR as the loop counter
    DEFINE_ARGS coeff, cnt
    mov               cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    ; advance by the full 8 vectors first, then address the upper four
    ; with negative displacements
    add             coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec               cntd
    jg               .loop
    RET
%endmacro

; Loop-free variant of IDCT_DC for blocks small enough to be written with
; straight-line stores: 4 vectors, or 8 vectors when mmsize == 16.
; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx             tmpd, word [coeffq]
    add               tmpd, (1 << (14 - %2)) + 1
    sar               tmpd, (15 - %2)
    ; xm0 rather than m0: same form as IDCT_DC, and movd stays encodable
    ; when m0 is wider than 128 bits
    movd               xm0, tmpd
    SPLATW              m0, xm0
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - SCALE and Transpose or not
; %3 - 1/0 add constant or not
;
; With %2 == 1 the shifted, packed and transposed block is left in m0, m1
; (IDCT_4x4 stores exactly those two registers).
; With %2 == 0 the unscaled 32-bit sums are left in m0..m3 for the caller
; (TR_8x4 consumes them as e8).
; Uses m0-m4.
%macro TR_4x4 3
    ; interleaves src0 with src2 to m0
    ;         and src1 with src3 to m1
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13 -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33

    SBUTTERFLY wd, 0, 1, 2

    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1

%if %3 == 1
    ; rounding term 1 << (shift - 1), looked up by name (pd_64, pd_512, ...)
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]
    paddd m2, m4
    paddd m0, m4
%endif

    SUMSUB_BADC d, 3, 2, 1, 0, 4

%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ;clip16
    packssdw m3, m1
    packssdw m0, m2
    ; Transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro

; Derive the second-stage shift and its rounding constant from the bit depth.
; %1 - bit_depth
; Defines (assembly-time, visible to later code):
;   shift   = 20 - bit_depth
;   c_add   = 1 << (shift - 1)
;   arr_add = pd_<c_add>  (label of the matching rounding constant)
%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro

; %1 - bit_depth
; %2 - register add constant
; is loaded to
; shift = 20 - bit_depth
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro

; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
;
; Loads four 8-byte pieces relative to r0:
;   %1 = [%3 + %7] (low half) | [%5 + %7] (high half)
;   %2 = [%4 + %7] (low half) | [%6 + %7] (high half)
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro

; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
;
; Two TR_4x4 passes over the 32-byte block, done in place:
; the first with shift 7, the second with shift 20 - bitdepth.
; Only the coeffs argument is loaded (cglobal declares one argument).
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]
    mova m1, [coeffsq + 16]

    TR_4x4 7, 1, 1
    TR_4x4 20 - %1, 1, 1

    mova [coeffsq],      m0
    mova [coeffsq + 16], m1
    RET
%endmacro

; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
; 4 at one time (4 columns)                        1 e8[1] + o8[1]
; from %5: e8/16 + o8/16, with %1 offset                  ...
; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
; %4 - shift                                       7 e8[0] - o8[0] --> + %2
; %6, %7 - not used here; present so that STORE_8 and STORE_16 can be
;          invoked with the same argument list (see E16_O16)
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [coeffsq + %1], %5
    movhps   [coeffsq + %2], %5
%endmacro

; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block_size
; %9 - register to store e8 +o8
; %10 - register to store e8 - o8
;
; Expects the interleaved odd rows in m4, m5; uses m6, m7 as scratch.
; For block_size 8 the results are scaled and stored through STORE_8,
; otherwise they are handed back in %9 / %10.
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4

    paddd m6, m7
    paddd m7, m6, %7 ; o8 + e8
    psubd %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro

; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 add a constant in TR_4x4 or not
; I want to add a constant for 8x8 transform but not for 16x16 and 32x32
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK  m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; 00 01 02 03
    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33

    ; ...        -- >
    ;                  m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8,  m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9,  m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro

; Counterpart of LOAD_BLOCK: writes the halves of two registers to four
; 8-byte locations relative to r0.
; %1, %2 - registers holding packed 16 bit values
; %3, %4 - vertical offsets for the low / high half of %1
; %5, %6 - vertical offsets for the low / high half of %2
; %7 - horizontal offset
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro

; transpose 4x4 block packed
; in %1 and %2 registers
; %3 - temporary register
; (%1..%3 are register numbers, as SBUTTERFLY expects)
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro

; Transpose two 4x4 blocks and exchange their positions.
; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
; Uses m4-m7.
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transposed M_j goes where M_i was
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro

; Transpose one 4x4 block in place.
; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
; Uses m4-m6.
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro

; In-place transpose of an 8x8 block of 16-bit values (row width 16 bytes).
; Declared with no arguments: the block is addressed through r0, which
; the caller (IDCT_8x8) still holds its coeffs pointer in.
%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^t M3^t
    ; M3 M4      M2^t M4^t

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16

    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8

    ; M4
    TRANSPOSE_BLOCK 8, 64, 16

    ret
%endmacro

; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
;
; First pass with shift 7 over both 4-column halves, transpose,
; second pass with shift 20 - bitdepth, transpose again (tail call).
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro

; store intermediate e32 coeffs on stack
; as 16x4 matrix
; from %5: e16 + o16, with %6 offset
; and  %3: e16 - o16, with %7 offset
; %4 - shift, unused here
; %1, %2 - unused here as well (coeffsq offsets, only meaningful for STORE_8)
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro

; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 SWAP or add
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
;
; Computes %3 * %1 + %4 * %2 (pmaddwd). With %5 == 1 the sum becomes the
; new accumulator value, otherwise it is added to the accumulator.
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro

; %1 - transform coeffs
; %2, %3 offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add
; %6 - block_size
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
;
; Expects the interleaved odd rows in m0..m3; uses m4-m7 as scratch.
; %6 also selects the store macro (STORE_8 or STORE_16); the rounding
; constant %5 is only added for block_size 8.
%macro E16_O16 9
    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5
%endif

    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro

; One 4-column slice of the 16-point transform.
; %1  - horizontal offset
; %2  - shift used when the results are stored
; %3  - rounding constant (memory operand); copied to m8 after the first
;       E16_O16 has consumed the e16 value that m8 held
; %4  - offset of the even row, forwarded to TR_8x4
; %5  - step, forwarded to TR_8x4
; %6  - offset of the odd row for TR_8x4; also the row stride used for the
;       odd-row loads and for the coeffsq store offsets here
; %7  - block size handed to E16_O16 (selects STORE_8 / STORE_16)
; %8  - block size handed to TR_8x4
; %9  - multiplier applied to the odd-row offsets
; %10 - not referenced in this macro body
%macro TR_16x4 10
    ; produce the e16 values for these 4 columns; when %8 is not 8,
    ; E8_O8 leaves them in m8..m15 instead of storing them
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load 8 odd rows
    LOAD_BLOCK m0, m1, %9 * %6, %9 * 3 * %6, %9 * 5 * %6, %9 * 7 * %6, %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    E16_O16 trans_coeffs16,               0 + %1, 15 * %6 + %1, %2, %3, %7, m8,       0, 15 * 16
    mova m8, %3
    E16_O16 trans_coeffs16 +     64,     %6 + %1, 14 * %6 + %1, %2, m8, %7, m9,      16, 14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1,  9 * %6 + %1, %2, m8, %7, m14, 6 * 16,  9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1,  8 * %6 + %1, %2, m8, %7, m15, 7 * 16,  8 * 16
%endmacro

; In-place transpose of a 16x16 block of 16-bit values (row width 32 bytes),
; done on 4x4 tiles. Addresses the block through r0, like TRANSPOSE_8x8.
%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
; M1  M2  M3  M4 ^T      m1 m5 m9  m13   M_i^T = m_i
; M5  M6  M7  M8    -->  m2 m6 m10 m14
; M9  M10 M11 M12        m3 m7 m11 m15
; M13 M14 M15 M16        m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32

    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ;M6
    TRANSPOSE_BLOCK 8, 128, 32

    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ;M11
    TRANSPOSE_BLOCK 16, 256, 32

    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ;M16
    TRANSPOSE_BLOCK 24, 384, 32

    ret
%endmacro

; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
;
; Both passes walk the four 4-column slices from horizontal offset 24 down
; to 0 (r1 = 3..0, offset 8 * r1) and are each followed by a transpose.
; First pass: shift 7 with pd_64; second pass: shift / arr_add as set up by
; DEFINE_BIAS from the bit depth.
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro

; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
; 4 at one time (4 columns)                        1 e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %5 - reg with e32 + o32                                  ...
; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
; %4 - shift                                       31 e32[0] - o32[0] --> %2
%macro STORE_32 5
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq     [%1], %5
    movhps   [%2], %5
%endmacro

; %1 - transform coeffs
; %2 - address to store e32 + o32
; %3 - address to store e32 - o32
; %4 - shift
; %5 - stack offset of e32
;
; Expects the interleaved odd rows in m0..m7 and the rounding constant in
; m14 (both set up by TR_32x4); uses m8-m12 as scratch.
%macro E32_O32 5
    ADD_ROWS [%1],          [%1 +     16], m0, m1, 1, m8, m9, m10
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10

    paddd m11, m14, [rsp + %5] ; e32 + rounding constant
    paddd m12, m10, m11 ; o32 + e32
    psubd m11, m10      ; e32 - o32
    STORE_32 %2, %3, m11, %4, m12
%endmacro

; One 4-column slice of the 32-point transform.
; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1: use shift 7 with the pd_64 rounding constant;
;      otherwise: take shift and rounding constant from the bit depth %2
;      (LOAD_BIAS)
;
; Uses r2-r5 and the stack area written by STORE_16 (offsets 0 .. 15 * 16).
%macro TR_32x4 3
    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0

    ; odd rows 1, 3, ..., 31 (row stride 64 bytes)
    LOAD_BLOCK m0, m1,      64,  3 * 64,  5 * 64,  7 * 64, %1
    LOAD_BLOCK m2, m3,  9 * 64, 11 * 64, 13 * 64, 15 * 64, %1
    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1

    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8

%if %3 == 1
    %assign shift 7
    mova m14, [pd_64]
%else
    LOAD_BIAS %2, m14
%endif

    ; r2  - current coefficient group, from group 15 down to group 0
    ; r3  - base of this slice; e32 + o32 goes to r3 + r5 * 4
    ;       (r5 = k * 16, so this is row k)
    ; r4  - destination of e32 - o32, from row 16 upwards
    ; r5d - stack offset of the matching e32 value, 15 * 16 down to 0
    lea r2, [trans_coeff32 + 15 * 128]
    lea r3, [coeffsq + %1]
    lea r4, [r3 + 16 * 64]
    mov r5d, 15 * 16
%%loop:
    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
    sub r2, 128
    add r4, 64
    sub r5d, 16
    jge %%loop
%endmacro

%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
    ; M0  M1 ... M7
    ; M8         M15
    ;
    ; ...
    ;
    ; M56        M63

    TRANSPOSE_BLOCK 0, 0, 64 ; M0
    mov r1d, 7
    mov r2d, 7 * 256
.loop_transpose:
    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
    sub r2d, 256
    dec r1d
    jg .loop_transpose

    TRANSPOSE_BLOCK 8, 256, 64 ; M9
    mov r1d, 6
    mov r2d, 512
    mov r3d, 16
.loop_transpose2:
    SWAP_BLOCKS 8, r2, 64, 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose2

    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
    mov r1d, 5
    mov r2d, 768
    mov r3d, 24
.loop_transpose3:
    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose3

    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
    mov r1d, 4
    mov r2d, 1024
    mov r3d, 32
.loop_transpose4:
    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose4

    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
768cabdff1aSopenharmony_ci mov r1d, 3 769cabdff1aSopenharmony_ci mov r2d, 1280 770cabdff1aSopenharmony_ci mov r3d, 40 771cabdff1aSopenharmony_ci.loop_transpose5: 772cabdff1aSopenharmony_ci SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3 773cabdff1aSopenharmony_ci add r3d, 8 774cabdff1aSopenharmony_ci add r2d, 256 775cabdff1aSopenharmony_ci dec r1d 776cabdff1aSopenharmony_ci jg .loop_transpose5 777cabdff1aSopenharmony_ci 778cabdff1aSopenharmony_ci TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45 779cabdff1aSopenharmony_ci SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8 780cabdff1aSopenharmony_ci SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8 781cabdff1aSopenharmony_ci 782cabdff1aSopenharmony_ci TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54 783cabdff1aSopenharmony_ci SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8 784cabdff1aSopenharmony_ci 785cabdff1aSopenharmony_ci TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63 786cabdff1aSopenharmony_ci 787cabdff1aSopenharmony_ci ret 788cabdff1aSopenharmony_ci%endmacro 789cabdff1aSopenharmony_ci 790cabdff1aSopenharmony_ci; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit) 791cabdff1aSopenharmony_ci; %1 = bitdepth 792cabdff1aSopenharmony_ci%macro IDCT_32x32 1 793cabdff1aSopenharmony_cicglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs 794cabdff1aSopenharmony_ci mov r1d, 7 795cabdff1aSopenharmony_ci.loop32: 796cabdff1aSopenharmony_ci TR_32x4 8 * r1, %1, 1 797cabdff1aSopenharmony_ci dec r1d 798cabdff1aSopenharmony_ci jge .loop32 799cabdff1aSopenharmony_ci 800cabdff1aSopenharmony_ci call hevc_idct_transpose_32x32_ %+ cpuname 801cabdff1aSopenharmony_ci 802cabdff1aSopenharmony_ci mov r1d, 7 803cabdff1aSopenharmony_ci.loop32_2: 804cabdff1aSopenharmony_ci TR_32x4 8 * r1, %1, 0 805cabdff1aSopenharmony_ci dec r1d 806cabdff1aSopenharmony_ci jge .loop32_2 807cabdff1aSopenharmony_ci 808cabdff1aSopenharmony_ci TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1 809cabdff1aSopenharmony_ci%endmacro 810cabdff1aSopenharmony_ci 811cabdff1aSopenharmony_ci%macro 
INIT_IDCT_DC 1 812cabdff1aSopenharmony_ciINIT_MMX mmxext 813cabdff1aSopenharmony_ciIDCT_DC_NL 4, %1 814cabdff1aSopenharmony_ci 815cabdff1aSopenharmony_ciINIT_XMM sse2 816cabdff1aSopenharmony_ciIDCT_DC_NL 8, %1 817cabdff1aSopenharmony_ciIDCT_DC 16, 4, %1 818cabdff1aSopenharmony_ciIDCT_DC 32, 16, %1 819cabdff1aSopenharmony_ci 820cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL 821cabdff1aSopenharmony_ci INIT_YMM avx2 822cabdff1aSopenharmony_ci IDCT_DC 16, 2, %1 823cabdff1aSopenharmony_ci IDCT_DC 32, 8, %1 824cabdff1aSopenharmony_ci%endif ;HAVE_AVX2_EXTERNAL 825cabdff1aSopenharmony_ci%endmacro 826cabdff1aSopenharmony_ci 827cabdff1aSopenharmony_ci%macro INIT_IDCT 2 828cabdff1aSopenharmony_ciINIT_XMM %2 829cabdff1aSopenharmony_ci%if %1 == 8 830cabdff1aSopenharmony_ci TRANSPOSE_8x8 831cabdff1aSopenharmony_ci %if ARCH_X86_64 832cabdff1aSopenharmony_ci TRANSPOSE_16x16 833cabdff1aSopenharmony_ci TRANSPOSE_32x32 834cabdff1aSopenharmony_ci %endif 835cabdff1aSopenharmony_ci%endif 836cabdff1aSopenharmony_ci%if ARCH_X86_64 837cabdff1aSopenharmony_ci IDCT_32x32 %1 838cabdff1aSopenharmony_ci IDCT_16x16 %1 839cabdff1aSopenharmony_ci%endif 840cabdff1aSopenharmony_ciIDCT_8x8 %1 841cabdff1aSopenharmony_ciIDCT_4x4 %1 842cabdff1aSopenharmony_ci%endmacro 843cabdff1aSopenharmony_ci 844cabdff1aSopenharmony_ciINIT_IDCT_DC 8 845cabdff1aSopenharmony_ciINIT_IDCT_DC 10 846cabdff1aSopenharmony_ciINIT_IDCT_DC 12 847cabdff1aSopenharmony_ciINIT_IDCT 8, sse2 848cabdff1aSopenharmony_ciINIT_IDCT 8, avx 849cabdff1aSopenharmony_ciINIT_IDCT 10, sse2 850cabdff1aSopenharmony_ciINIT_IDCT 10, avx 851cabdff1aSopenharmony_ci;INIT_IDCT 12, sse2 852cabdff1aSopenharmony_ci;INIT_IDCT 12, avx 853