1cabdff1aSopenharmony_ci;******************************************************************************
2cabdff1aSopenharmony_ci;* X86 Optimized functions for Open Exr Decoder
3cabdff1aSopenharmony_ci;* Copyright (c) 2006 Industrial Light & Magic, a division of Lucas Digital Ltd. LLC
4cabdff1aSopenharmony_ci;*
5cabdff1aSopenharmony_ci;* reorder_pixels, predictor based on patch by John Loy
6cabdff1aSopenharmony_ci;* port to ASM by Jokyo Images support by CNC - French National Center for Cinema
7cabdff1aSopenharmony_ci;*
8cabdff1aSopenharmony_ci;* predictor AVX/AVX2 by Henrik Gramner
9cabdff1aSopenharmony_ci;*
10cabdff1aSopenharmony_ci;* This file is part of FFmpeg.
11cabdff1aSopenharmony_ci;*
12cabdff1aSopenharmony_ci;* FFmpeg is free software; you can redistribute it and/or
13cabdff1aSopenharmony_ci;* modify it under the terms of the GNU Lesser General Public
14cabdff1aSopenharmony_ci;* License as published by the Free Software Foundation; either
15cabdff1aSopenharmony_ci;* version 2.1 of the License, or (at your option) any later version.
16cabdff1aSopenharmony_ci;*
17cabdff1aSopenharmony_ci;* FFmpeg is distributed in the hope that it will be useful,
18cabdff1aSopenharmony_ci;* but WITHOUT ANY WARRANTY; without even the implied warranty of
19cabdff1aSopenharmony_ci;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20cabdff1aSopenharmony_ci;* Lesser General Public License for more details.
21cabdff1aSopenharmony_ci;*
22cabdff1aSopenharmony_ci;* You should have received a copy of the GNU Lesser General Public
23cabdff1aSopenharmony_ci;* License along with FFmpeg; if not, write to the Free Software
24cabdff1aSopenharmony_ci;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25cabdff1aSopenharmony_ci;******************************************************************************
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci%include "libavutil/x86/x86util.asm"
28cabdff1aSopenharmony_ci
29cabdff1aSopenharmony_cicextern pb_15
30cabdff1aSopenharmony_cicextern pb_80
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ciSECTION .text
33cabdff1aSopenharmony_ci
34cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
35cabdff1aSopenharmony_ci; void ff_reorder_pixels(uint8_t *dst, const uint8_t *src, ptrdiff_t size);
36cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
;
; Interleaves the bytes of the two halves of src into dst. With
; half_size = size >> 1, every loop iteration loads mmsize bytes from the
; first half and mmsize bytes from the second half and stores 2 * mmsize
; interleaved bytes, i.e. for an even size:
;     dst[2 * i]     = src[i]
;     dst[2 * i + 1] = src[half_size + i]
;
; Args:  dst, src (named src1 below), size; src2 is a derived scratch pointer.
; Regs:  cglobal requests 3 arguments, 4 GPRs and 3 vector registers (m0-m2).
; Notes: - The first-half load and all stores use mova and only the second-half
;          load uses movu, so dst and src are presumably expected to be
;          suitably aligned by the caller -- TODO confirm.
;        - The loop is bottom-tested and has no scalar tail: the body always
;          runs at least once and always processes whole mmsize chunks, so the
;          buffers presumably need padding when half_size is not a multiple of
;          mmsize -- TODO confirm against the caller.
37cabdff1aSopenharmony_ci
38cabdff1aSopenharmony_ci%macro REORDER_PIXELS 0
39cabdff1aSopenharmony_cicglobal reorder_pixels, 3,4,3, dst, src1, size, src2
; Move every pointer to the end of the region it walks and turn sizeq into a
; negative offset that counts up towards zero, so that a single add/jl pair
; both advances all three pointers and terminates the loop.
40cabdff1aSopenharmony_ci    lea                              src2q, [src1q+sizeq] ; src2 = src + 2 * half_size
41cabdff1aSopenharmony_ci    add                               dstq, sizeq         ; dst offset by size
42cabdff1aSopenharmony_ci    shr                              sizeq, 1             ; half_size
43cabdff1aSopenharmony_ci    add                              src1q, sizeq         ; offset src by half_size
44cabdff1aSopenharmony_ci    neg                              sizeq                ; size = offset for dst, src1, src2
45cabdff1aSopenharmony_ci.loop:
46cabdff1aSopenharmony_ci
47cabdff1aSopenharmony_ci    mova                                m0, [src1q+sizeq]        ; load first part
48cabdff1aSopenharmony_ci    movu                                m1, [src2q+sizeq]        ; load second part
; SBUTTERFLY is defined outside this file (x86util.asm). It is expected to
; leave the low-half byte interleave of m0/m1 in m0 and the high-half
; interleave in m1, using m2 as scratch -- confirm against its definition.
49cabdff1aSopenharmony_ci    SBUTTERFLY bw, 0, 1, 2                                       ; interleaved
; dst receives two bytes per source byte pair, hence the 2*sizeq offset.
50cabdff1aSopenharmony_ci    mova                 [dstq+2*sizeq   ], xm0                  ; copy to dst
51cabdff1aSopenharmony_ci    mova                 [dstq+2*sizeq+16], xm1
52cabdff1aSopenharmony_ci%if cpuflag(avx2)
; With ymm registers the byte unpack works within each 128-bit lane, so the
; two xmm stores above only covered the low lanes of m0 and m1. q0301 is
; expected to select the high lane of m0 followed by the high lane of m1,
; which form the remaining 32 output bytes.
53cabdff1aSopenharmony_ci    vperm2i128                          m0, m0, m1, q0301
54cabdff1aSopenharmony_ci    mova                 [dstq+2*sizeq+32], m0
55cabdff1aSopenharmony_ci%endif
; Advance the shared negative offset; keep looping while it is still below 0.
56cabdff1aSopenharmony_ci    add     sizeq, mmsize
57cabdff1aSopenharmony_ci    jl .loop
58cabdff1aSopenharmony_ci    RET
59cabdff1aSopenharmony_ci%endmacro
60cabdff1aSopenharmony_ci
; Instantiate the xmm (16-byte vector) version for SSE2.
61cabdff1aSopenharmony_ciINIT_XMM sse2
62cabdff1aSopenharmony_ciREORDER_PIXELS
63cabdff1aSopenharmony_ci
; Instantiate the ymm (32-byte vector) version for AVX2; only assembled when
; the build configuration sets HAVE_AVX2_EXTERNAL to a non-zero value.
64cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
65cabdff1aSopenharmony_ciINIT_YMM avx2
66cabdff1aSopenharmony_ciREORDER_PIXELS
67cabdff1aSopenharmony_ci%endif
68cabdff1aSopenharmony_ci
69cabdff1aSopenharmony_ci
70cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
71cabdff1aSopenharmony_ci; void ff_predictor(uint8_t *src, ptrdiff_t size);
72cabdff1aSopenharmony_ci;------------------------------------------------------------------------------
;
; In-place running byte sum (modulo 256) over src. Each loop iteration:
;   1. XORs mmsize source bytes with the pb_80 constant;
;   2. builds an inclusive prefix sum inside every 16-byte lane with log-step
;      shift-and-add (pslldq by 1, 2, 4, 8 combined with paddb);
;   3. adds the carry kept in xm2 and stores the result over the input;
;   4. shuffles the result with the pb_15 mask to form the carry for the next
;      16-byte group.
; xm2 starts as a copy of the low 16 bytes of pb_80, which therefore acts as
; the carry into the very first group.
;
; pb_80 and pb_15 are external constants (cextern). Judging by their names
; they are presumably the bytes 0x80 and 15 repeated across the vector, which
; would make the pshufb a broadcast of the last byte -- TODO confirm.
;
; Args:  src (read and written in place), size.
; Regs:  cglobal requests 2 arguments, 2 GPRs and 5 vector registers (m0-m4).
; Notes: - All stores use mova, so src is presumably expected to be suitably
;          aligned by the caller -- TODO confirm.
;        - The loop is bottom-tested and has no scalar tail: the body always
;          runs at least once and always processes whole mmsize chunks, so the
;          buffer presumably needs padding when size is not a multiple of
;          mmsize -- TODO confirm against the caller.
;        - Three-operand instruction forms are used for all targets; on the
;          non-AVX target this presumably relies on the x86inc instruction
;          wrappers -- TODO confirm.
73cabdff1aSopenharmony_ci
74cabdff1aSopenharmony_ci%macro PREDICTOR 0
75cabdff1aSopenharmony_cicglobal predictor, 2,2,5, src, size
; m0  = XOR constant applied to every input byte
; xm1 = pshufb mask used to derive the carry from the previous results
; xm2 = running carry, initialised from the low 16 bytes of m0
76cabdff1aSopenharmony_ci    mova             m0, [pb_80]
77cabdff1aSopenharmony_ci    mova            xm1, [pb_15]
78cabdff1aSopenharmony_ci    mova            xm2, xm0
; srcq now points at the end of the buffer and sizeq becomes a negative offset
; that counts up towards zero.
79cabdff1aSopenharmony_ci    add            srcq, sizeq
80cabdff1aSopenharmony_ci    neg           sizeq
81cabdff1aSopenharmony_ci.loop:
; m3 = input ^ m0, then a log-step inclusive prefix sum: after the step that
; shifts by k bytes, each byte holds the sum of itself and up to 2k-1 bytes
; preceding it in the same 16-byte lane (pslldq shifts in zeros and, for ymm
; registers, does not cross the 128-bit lane boundary).
82cabdff1aSopenharmony_ci    pxor             m3, m0, [srcq + sizeq]
83cabdff1aSopenharmony_ci    pslldq           m4, m3, 1
84cabdff1aSopenharmony_ci    paddb            m3, m4
85cabdff1aSopenharmony_ci    pslldq           m4, m3, 2
86cabdff1aSopenharmony_ci    paddb            m3, m4
87cabdff1aSopenharmony_ci    pslldq           m4, m3, 4
88cabdff1aSopenharmony_ci    paddb            m3, m4
89cabdff1aSopenharmony_ci    pslldq           m4, m3, 8
90cabdff1aSopenharmony_ci%if mmsize == 32
; ymm path: finish both lane-local prefix sums in m3, then chain the lanes.
; Low lane + carry gives the first 16 output bytes; those results shuffled
; with xm1 become the carry for the high lane, which gives the next 16 bytes.
91cabdff1aSopenharmony_ci    paddb            m3, m4
92cabdff1aSopenharmony_ci    paddb           xm2, xm3
93cabdff1aSopenharmony_ci    vextracti128    xm4, m3, 1
94cabdff1aSopenharmony_ci    mova [srcq + sizeq], xm2
95cabdff1aSopenharmony_ci    pshufb          xm2, xm1
96cabdff1aSopenharmony_ci    paddb           xm2, xm4
97cabdff1aSopenharmony_ci    mova [srcq + sizeq + 16], xm2
98cabdff1aSopenharmony_ci%else
; xmm path: the last prefix-sum step (m4) is not folded into m3 first; m3 and
; m4 are both added straight into the carry register, which then holds the
; 16 output bytes.
99cabdff1aSopenharmony_ci    paddb            m2, m3
100cabdff1aSopenharmony_ci    paddb            m2, m4
101cabdff1aSopenharmony_ci    mova [srcq + sizeq], m2
102cabdff1aSopenharmony_ci%endif
; Turn the 16 bytes just stored into the carry for the next iteration.
103cabdff1aSopenharmony_ci    pshufb          xm2, xm1
; Advance the negative offset; keep looping while it is still below zero.
104cabdff1aSopenharmony_ci    add           sizeq, mmsize
105cabdff1aSopenharmony_ci    jl .loop
106cabdff1aSopenharmony_ci    RET
107cabdff1aSopenharmony_ci%endmacro
108cabdff1aSopenharmony_ci
; Instantiate the xmm (16-byte vector) version for SSSE3; pshufb is an SSSE3
; instruction.
109cabdff1aSopenharmony_ciINIT_XMM ssse3
110cabdff1aSopenharmony_ciPREDICTOR
111cabdff1aSopenharmony_ci
; Instantiate the same xmm kernel for AVX.
112cabdff1aSopenharmony_ciINIT_XMM avx
113cabdff1aSopenharmony_ciPREDICTOR
114cabdff1aSopenharmony_ci
; Instantiate the ymm (32-byte vector) version for AVX2; only assembled when
; the build configuration sets HAVE_AVX2_EXTERNAL to a non-zero value.
115cabdff1aSopenharmony_ci%if HAVE_AVX2_EXTERNAL
116cabdff1aSopenharmony_ciINIT_YMM avx2
117cabdff1aSopenharmony_ciPREDICTOR
118cabdff1aSopenharmony_ci%endif
119