/*
 * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
 * and improved by Zdenek Kabelac <kabi@users.sf.net>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci#include <stddef.h>
28cabdff1aSopenharmony_ci#include <stdint.h>
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci#include "inline_asm.h"
31cabdff1aSopenharmony_ci
32cabdff1aSopenharmony_ci// put_pixels
33cabdff1aSopenharmony_ciav_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
34cabdff1aSopenharmony_ci                                  ptrdiff_t line_size, int h)
35cabdff1aSopenharmony_ci{
36cabdff1aSopenharmony_ci    MOVQ_ZERO(mm7);
37cabdff1aSopenharmony_ci    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
38cabdff1aSopenharmony_ci    __asm__ volatile(
39cabdff1aSopenharmony_ci        "movq   (%1), %%mm0             \n\t"
40cabdff1aSopenharmony_ci        "movq   1(%1), %%mm4            \n\t"
41cabdff1aSopenharmony_ci        "movq   %%mm0, %%mm1            \n\t"
42cabdff1aSopenharmony_ci        "movq   %%mm4, %%mm5            \n\t"
43cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm0         \n\t"
44cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm4         \n\t"
45cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm1         \n\t"
46cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm5         \n\t"
47cabdff1aSopenharmony_ci        "paddusw %%mm0, %%mm4           \n\t"
48cabdff1aSopenharmony_ci        "paddusw %%mm1, %%mm5           \n\t"
49cabdff1aSopenharmony_ci        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
50cabdff1aSopenharmony_ci        "add    %3, %1                  \n\t"
51cabdff1aSopenharmony_ci        ".p2align 3                     \n\t"
52cabdff1aSopenharmony_ci        "1:                             \n\t"
53cabdff1aSopenharmony_ci        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
54cabdff1aSopenharmony_ci        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
55cabdff1aSopenharmony_ci        "movq   %%mm0, %%mm1            \n\t"
56cabdff1aSopenharmony_ci        "movq   %%mm2, %%mm3            \n\t"
57cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm0         \n\t"
58cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm2         \n\t"
59cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm1         \n\t"
60cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm3         \n\t"
61cabdff1aSopenharmony_ci        "paddusw %%mm2, %%mm0           \n\t"
62cabdff1aSopenharmony_ci        "paddusw %%mm3, %%mm1           \n\t"
63cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm4           \n\t"
64cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm5           \n\t"
65cabdff1aSopenharmony_ci        "paddusw %%mm0, %%mm4           \n\t"
66cabdff1aSopenharmony_ci        "paddusw %%mm1, %%mm5           \n\t"
67cabdff1aSopenharmony_ci        "psrlw  $2, %%mm4               \n\t"
68cabdff1aSopenharmony_ci        "psrlw  $2, %%mm5               \n\t"
69cabdff1aSopenharmony_ci        "packuswb  %%mm5, %%mm4         \n\t"
70cabdff1aSopenharmony_ci        "movq   %%mm4, (%2, %%"FF_REG_a")  \n\t"
71cabdff1aSopenharmony_ci        "add    %3, %%"FF_REG_a"           \n\t"
72cabdff1aSopenharmony_ci
73cabdff1aSopenharmony_ci        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
74cabdff1aSopenharmony_ci        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
75cabdff1aSopenharmony_ci        "movq   %%mm2, %%mm3            \n\t"
76cabdff1aSopenharmony_ci        "movq   %%mm4, %%mm5            \n\t"
77cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm2         \n\t"
78cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm4         \n\t"
79cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm3         \n\t"
80cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm5         \n\t"
81cabdff1aSopenharmony_ci        "paddusw %%mm2, %%mm4           \n\t"
82cabdff1aSopenharmony_ci        "paddusw %%mm3, %%mm5           \n\t"
83cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm0           \n\t"
84cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm1           \n\t"
85cabdff1aSopenharmony_ci        "paddusw %%mm4, %%mm0           \n\t"
86cabdff1aSopenharmony_ci        "paddusw %%mm5, %%mm1           \n\t"
87cabdff1aSopenharmony_ci        "psrlw  $2, %%mm0               \n\t"
88cabdff1aSopenharmony_ci        "psrlw  $2, %%mm1               \n\t"
89cabdff1aSopenharmony_ci        "packuswb  %%mm1, %%mm0         \n\t"
90cabdff1aSopenharmony_ci        "movq   %%mm0, (%2, %%"FF_REG_a")  \n\t"
91cabdff1aSopenharmony_ci        "add    %3, %%"FF_REG_a"        \n\t"
92cabdff1aSopenharmony_ci
93cabdff1aSopenharmony_ci        "subl   $2, %0                  \n\t"
94cabdff1aSopenharmony_ci        "jnz    1b                      \n\t"
95cabdff1aSopenharmony_ci        :"+g"(h), "+S"(pixels)
96cabdff1aSopenharmony_ci        :"D"(block), "r"((x86_reg)line_size)
97cabdff1aSopenharmony_ci        :FF_REG_a, "memory");
98cabdff1aSopenharmony_ci}
99cabdff1aSopenharmony_ci
100cabdff1aSopenharmony_ci#ifndef NO_AVG
101cabdff1aSopenharmony_ci// avg_pixels
102cabdff1aSopenharmony_ci// this routine is 'slightly' suboptimal but mostly unused
103cabdff1aSopenharmony_ciav_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
104cabdff1aSopenharmony_ci                                  ptrdiff_t line_size, int h)
105cabdff1aSopenharmony_ci{
106cabdff1aSopenharmony_ci    MOVQ_ZERO(mm7);
107cabdff1aSopenharmony_ci    SET_RND(mm6); // =2 for rnd  and  =1 for no_rnd version
108cabdff1aSopenharmony_ci    __asm__ volatile(
109cabdff1aSopenharmony_ci        "movq   (%1), %%mm0             \n\t"
110cabdff1aSopenharmony_ci        "movq   1(%1), %%mm4            \n\t"
111cabdff1aSopenharmony_ci        "movq   %%mm0, %%mm1            \n\t"
112cabdff1aSopenharmony_ci        "movq   %%mm4, %%mm5            \n\t"
113cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm0         \n\t"
114cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm4         \n\t"
115cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm1         \n\t"
116cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm5         \n\t"
117cabdff1aSopenharmony_ci        "paddusw %%mm0, %%mm4           \n\t"
118cabdff1aSopenharmony_ci        "paddusw %%mm1, %%mm5           \n\t"
119cabdff1aSopenharmony_ci        "xor    %%"FF_REG_a", %%"FF_REG_a" \n\t"
120cabdff1aSopenharmony_ci        "add    %3, %1                  \n\t"
121cabdff1aSopenharmony_ci        ".p2align 3                     \n\t"
122cabdff1aSopenharmony_ci        "1:                             \n\t"
123cabdff1aSopenharmony_ci        "movq   (%1, %%"FF_REG_a"), %%mm0  \n\t"
124cabdff1aSopenharmony_ci        "movq   1(%1, %%"FF_REG_a"), %%mm2 \n\t"
125cabdff1aSopenharmony_ci        "movq   %%mm0, %%mm1            \n\t"
126cabdff1aSopenharmony_ci        "movq   %%mm2, %%mm3            \n\t"
127cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm0         \n\t"
128cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm2         \n\t"
129cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm1         \n\t"
130cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm3         \n\t"
131cabdff1aSopenharmony_ci        "paddusw %%mm2, %%mm0           \n\t"
132cabdff1aSopenharmony_ci        "paddusw %%mm3, %%mm1           \n\t"
133cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm4           \n\t"
134cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm5           \n\t"
135cabdff1aSopenharmony_ci        "paddusw %%mm0, %%mm4           \n\t"
136cabdff1aSopenharmony_ci        "paddusw %%mm1, %%mm5           \n\t"
137cabdff1aSopenharmony_ci        "psrlw  $2, %%mm4               \n\t"
138cabdff1aSopenharmony_ci        "psrlw  $2, %%mm5               \n\t"
139cabdff1aSopenharmony_ci                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
140cabdff1aSopenharmony_ci        "packuswb  %%mm5, %%mm4         \n\t"
141cabdff1aSopenharmony_ci                "pcmpeqd %%mm2, %%mm2   \n\t"
142cabdff1aSopenharmony_ci                "paddb %%mm2, %%mm2     \n\t"
143cabdff1aSopenharmony_ci                PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
144cabdff1aSopenharmony_ci                "movq   %%mm5, (%2, %%"FF_REG_a")  \n\t"
145cabdff1aSopenharmony_ci        "add    %3, %%"FF_REG_a"        \n\t"
146cabdff1aSopenharmony_ci
147cabdff1aSopenharmony_ci        "movq   (%1, %%"FF_REG_a"), %%mm2  \n\t" // 0 <-> 2   1 <-> 3
148cabdff1aSopenharmony_ci        "movq   1(%1, %%"FF_REG_a"), %%mm4 \n\t"
149cabdff1aSopenharmony_ci        "movq   %%mm2, %%mm3            \n\t"
150cabdff1aSopenharmony_ci        "movq   %%mm4, %%mm5            \n\t"
151cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm2         \n\t"
152cabdff1aSopenharmony_ci        "punpcklbw %%mm7, %%mm4         \n\t"
153cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm3         \n\t"
154cabdff1aSopenharmony_ci        "punpckhbw %%mm7, %%mm5         \n\t"
155cabdff1aSopenharmony_ci        "paddusw %%mm2, %%mm4           \n\t"
156cabdff1aSopenharmony_ci        "paddusw %%mm3, %%mm5           \n\t"
157cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm0           \n\t"
158cabdff1aSopenharmony_ci        "paddusw %%mm6, %%mm1           \n\t"
159cabdff1aSopenharmony_ci        "paddusw %%mm4, %%mm0           \n\t"
160cabdff1aSopenharmony_ci        "paddusw %%mm5, %%mm1           \n\t"
161cabdff1aSopenharmony_ci        "psrlw  $2, %%mm0               \n\t"
162cabdff1aSopenharmony_ci        "psrlw  $2, %%mm1               \n\t"
163cabdff1aSopenharmony_ci                "movq   (%2, %%"FF_REG_a"), %%mm3  \n\t"
164cabdff1aSopenharmony_ci        "packuswb  %%mm1, %%mm0         \n\t"
165cabdff1aSopenharmony_ci                "pcmpeqd %%mm2, %%mm2   \n\t"
166cabdff1aSopenharmony_ci                "paddb %%mm2, %%mm2     \n\t"
167cabdff1aSopenharmony_ci                PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
168cabdff1aSopenharmony_ci                "movq   %%mm1, (%2, %%"FF_REG_a")  \n\t"
169cabdff1aSopenharmony_ci        "add    %3, %%"FF_REG_a"           \n\t"
170cabdff1aSopenharmony_ci
171cabdff1aSopenharmony_ci        "subl   $2, %0                  \n\t"
172cabdff1aSopenharmony_ci        "jnz    1b                      \n\t"
173cabdff1aSopenharmony_ci        :"+g"(h), "+S"(pixels)
174cabdff1aSopenharmony_ci        :"D"(block), "r"((x86_reg)line_size)
175cabdff1aSopenharmony_ci        :FF_REG_a, "memory");
176cabdff1aSopenharmony_ci}
177cabdff1aSopenharmony_ci#endif
178