/*
 * software RGB to RGB converter
 * together with software PAL8 to RGB converter
 *               software YUV to YUV converter
 *               software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lots of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26cabdff1aSopenharmony_ci
27cabdff1aSopenharmony_ci#include <stddef.h>
28cabdff1aSopenharmony_ci#include <stdint.h>
29cabdff1aSopenharmony_ci
30cabdff1aSopenharmony_ci#include "libavutil/attributes.h"
31cabdff1aSopenharmony_ci#include "libavutil/x86/asm.h"
32cabdff1aSopenharmony_ci
/*
 * Instruction mnemonics that are pasted into the inline-assembly strings
 * of the converters below. Any definition that is already active is removed
 * first (presumably because this template may be included more than once,
 * each time with its own set of mnemonics -- confirm against the includer).
 */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#define MOVNTQ   "movntq"
#define SFENCE   "sfence"

#define EMMS     "emms"
45cabdff1aSopenharmony_ci
46cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_SSE2
47cabdff1aSopenharmony_ci
48cabdff1aSopenharmony_cistatic inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
49cabdff1aSopenharmony_ci{
50cabdff1aSopenharmony_ci    uint8_t *dest = dst;
51cabdff1aSopenharmony_ci    const uint8_t *s = src;
52cabdff1aSopenharmony_ci    const uint8_t *end;
53cabdff1aSopenharmony_ci    const uint8_t *mm_end;
54cabdff1aSopenharmony_ci    end = s + src_size;
55cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
56cabdff1aSopenharmony_ci    mm_end = end - 23;
57cabdff1aSopenharmony_ci    __asm__ volatile("movq        %0, %%mm7"::"m"(mask32a):"memory");
58cabdff1aSopenharmony_ci    while (s < mm_end) {
59cabdff1aSopenharmony_ci        __asm__ volatile(
60cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
61cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
62cabdff1aSopenharmony_ci            "punpckldq  3(%1), %%mm0    \n\t"
63cabdff1aSopenharmony_ci            "movd       6(%1), %%mm1    \n\t"
64cabdff1aSopenharmony_ci            "punpckldq  9(%1), %%mm1    \n\t"
65cabdff1aSopenharmony_ci            "movd      12(%1), %%mm2    \n\t"
66cabdff1aSopenharmony_ci            "punpckldq 15(%1), %%mm2    \n\t"
67cabdff1aSopenharmony_ci            "movd      18(%1), %%mm3    \n\t"
68cabdff1aSopenharmony_ci            "punpckldq 21(%1), %%mm3    \n\t"
69cabdff1aSopenharmony_ci            "por        %%mm7, %%mm0    \n\t"
70cabdff1aSopenharmony_ci            "por        %%mm7, %%mm1    \n\t"
71cabdff1aSopenharmony_ci            "por        %%mm7, %%mm2    \n\t"
72cabdff1aSopenharmony_ci            "por        %%mm7, %%mm3    \n\t"
73cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0,   (%0)   \n\t"
74cabdff1aSopenharmony_ci            MOVNTQ"     %%mm1,  8(%0)   \n\t"
75cabdff1aSopenharmony_ci            MOVNTQ"     %%mm2, 16(%0)   \n\t"
76cabdff1aSopenharmony_ci            MOVNTQ"     %%mm3, 24(%0)"
77cabdff1aSopenharmony_ci            :: "r"(dest), "r"(s)
78cabdff1aSopenharmony_ci            :"memory");
79cabdff1aSopenharmony_ci        dest += 32;
80cabdff1aSopenharmony_ci        s += 24;
81cabdff1aSopenharmony_ci    }
82cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
83cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
84cabdff1aSopenharmony_ci    while (s < end) {
85cabdff1aSopenharmony_ci        *dest++ = *s++;
86cabdff1aSopenharmony_ci        *dest++ = *s++;
87cabdff1aSopenharmony_ci        *dest++ = *s++;
88cabdff1aSopenharmony_ci        *dest++ = 255;
89cabdff1aSopenharmony_ci    }
90cabdff1aSopenharmony_ci}
91cabdff1aSopenharmony_ci
/*
 * Assembly fragment that packs four quadwords of 4-byte pixels into three
 * quadwords of 3-byte pixels and stores them non-temporally.
 *
 * Register contract, as used by the code in this macro:
 *   in:   mm0, mm1, mm4, mm5  - the four input quadwords
 *         mm2, mm3, mm6, mm7  - copies of mm0, mm1, mm4, mm5 respectively
 *         %0                  - destination address (24 bytes are written)
 *   out:  quadwords at 0(%0), 8(%0) and 16(%0)
 *   all eight MMX registers are overwritten.
 *
 * Step 1 shifts the copies right by 8 bits, masks originals with mask24l
 * and copies with mask24h, and ORs each pair together.
 * NOTE(review): mask24l/mask24h are defined outside this section;
 * presumably they select the low and the high pixel of each quadword so
 * that the OR yields 6 contiguous bytes -- confirm.
 * Step 2 shifts and ORs the four partial results into three full quadwords.
 */
#define STORE_BGR24_MMX \
            "psrlq         $8, %%mm2    \n\t" \
            "psrlq         $8, %%mm3    \n\t" \
            "psrlq         $8, %%mm6    \n\t" \
            "psrlq         $8, %%mm7    \n\t" \
            "pand "MANGLE(mask24l)", %%mm0\n\t" \
            "pand "MANGLE(mask24l)", %%mm1\n\t" \
            "pand "MANGLE(mask24l)", %%mm4\n\t" \
            "pand "MANGLE(mask24l)", %%mm5\n\t" \
            "pand "MANGLE(mask24h)", %%mm2\n\t" \
            "pand "MANGLE(mask24h)", %%mm3\n\t" \
            "pand "MANGLE(mask24h)", %%mm6\n\t" \
            "pand "MANGLE(mask24h)", %%mm7\n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm6, %%mm4    \n\t" \
            "por        %%mm7, %%mm5    \n\t" \
 \
            "movq       %%mm1, %%mm2    \n\t" \
            "movq       %%mm4, %%mm3    \n\t" \
            "psllq        $48, %%mm2    \n\t" \
            "psllq        $32, %%mm3    \n\t" \
            "por        %%mm2, %%mm0    \n\t" \
            "psrlq        $16, %%mm1    \n\t" \
            "psrlq        $32, %%mm4    \n\t" \
            "psllq        $16, %%mm5    \n\t" \
            "por        %%mm3, %%mm1    \n\t" \
            "por        %%mm5, %%mm4    \n\t" \
 \
            MOVNTQ"     %%mm0,   (%0)    \n\t" \
            MOVNTQ"     %%mm1,  8(%0)    \n\t" \
            MOVNTQ"     %%mm4, 16(%0)"
124cabdff1aSopenharmony_ci
125cabdff1aSopenharmony_ci
126cabdff1aSopenharmony_cistatic inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
127cabdff1aSopenharmony_ci{
128cabdff1aSopenharmony_ci    uint8_t *dest = dst;
129cabdff1aSopenharmony_ci    const uint8_t *s = src;
130cabdff1aSopenharmony_ci    const uint8_t *end;
131cabdff1aSopenharmony_ci    const uint8_t *mm_end;
132cabdff1aSopenharmony_ci    end = s + src_size;
133cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
134cabdff1aSopenharmony_ci    mm_end = end - 31;
135cabdff1aSopenharmony_ci    while (s < mm_end) {
136cabdff1aSopenharmony_ci        __asm__ volatile(
137cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
138cabdff1aSopenharmony_ci            "movq        (%1), %%mm0    \n\t"
139cabdff1aSopenharmony_ci            "movq       8(%1), %%mm1    \n\t"
140cabdff1aSopenharmony_ci            "movq      16(%1), %%mm4    \n\t"
141cabdff1aSopenharmony_ci            "movq      24(%1), %%mm5    \n\t"
142cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
143cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm3    \n\t"
144cabdff1aSopenharmony_ci            "movq       %%mm4, %%mm6    \n\t"
145cabdff1aSopenharmony_ci            "movq       %%mm5, %%mm7    \n\t"
146cabdff1aSopenharmony_ci            STORE_BGR24_MMX
147cabdff1aSopenharmony_ci            :: "r"(dest), "r"(s)
148cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
149cabdff1aSopenharmony_ci            :"memory");
150cabdff1aSopenharmony_ci        dest += 24;
151cabdff1aSopenharmony_ci        s += 32;
152cabdff1aSopenharmony_ci    }
153cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
154cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
155cabdff1aSopenharmony_ci    while (s < end) {
156cabdff1aSopenharmony_ci        *dest++ = *s++;
157cabdff1aSopenharmony_ci        *dest++ = *s++;
158cabdff1aSopenharmony_ci        *dest++ = *s++;
159cabdff1aSopenharmony_ci        s++;
160cabdff1aSopenharmony_ci    }
161cabdff1aSopenharmony_ci}
162cabdff1aSopenharmony_ci
163cabdff1aSopenharmony_ci/*
164cabdff1aSopenharmony_ci original by Strepto/Astral
165cabdff1aSopenharmony_ci ported to gcc & bugfixed: A'rpi
166cabdff1aSopenharmony_ci MMXEXT, 3DNOW optimization by Nick Kurshev
167cabdff1aSopenharmony_ci 32-bit C version, and and&add trick by Michael Niedermayer
168cabdff1aSopenharmony_ci*/
169cabdff1aSopenharmony_cistatic inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
170cabdff1aSopenharmony_ci{
171cabdff1aSopenharmony_ci    register const uint8_t* s=src;
172cabdff1aSopenharmony_ci    register uint8_t* d=dst;
173cabdff1aSopenharmony_ci    register const uint8_t *end;
174cabdff1aSopenharmony_ci    const uint8_t *mm_end;
175cabdff1aSopenharmony_ci    end = s + src_size;
176cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
177cabdff1aSopenharmony_ci    __asm__ volatile("movq        %0, %%mm4"::"m"(mask15s));
178cabdff1aSopenharmony_ci    mm_end = end - 15;
179cabdff1aSopenharmony_ci    while (s<mm_end) {
180cabdff1aSopenharmony_ci        __asm__ volatile(
181cabdff1aSopenharmony_ci            PREFETCH" 32(%1)        \n\t"
182cabdff1aSopenharmony_ci            "movq      (%1), %%mm0  \n\t"
183cabdff1aSopenharmony_ci            "movq     8(%1), %%mm2  \n\t"
184cabdff1aSopenharmony_ci            "movq     %%mm0, %%mm1  \n\t"
185cabdff1aSopenharmony_ci            "movq     %%mm2, %%mm3  \n\t"
186cabdff1aSopenharmony_ci            "pand     %%mm4, %%mm0  \n\t"
187cabdff1aSopenharmony_ci            "pand     %%mm4, %%mm2  \n\t"
188cabdff1aSopenharmony_ci            "paddw    %%mm1, %%mm0  \n\t"
189cabdff1aSopenharmony_ci            "paddw    %%mm3, %%mm2  \n\t"
190cabdff1aSopenharmony_ci            MOVNTQ"   %%mm0,  (%0)  \n\t"
191cabdff1aSopenharmony_ci            MOVNTQ"   %%mm2, 8(%0)"
192cabdff1aSopenharmony_ci            :: "r"(d), "r"(s)
193cabdff1aSopenharmony_ci        );
194cabdff1aSopenharmony_ci        d+=16;
195cabdff1aSopenharmony_ci        s+=16;
196cabdff1aSopenharmony_ci    }
197cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
198cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
199cabdff1aSopenharmony_ci    mm_end = end - 3;
200cabdff1aSopenharmony_ci    while (s < mm_end) {
201cabdff1aSopenharmony_ci        register unsigned x= *((const uint32_t *)s);
202cabdff1aSopenharmony_ci        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
203cabdff1aSopenharmony_ci        d+=4;
204cabdff1aSopenharmony_ci        s+=4;
205cabdff1aSopenharmony_ci    }
206cabdff1aSopenharmony_ci    if (s < end) {
207cabdff1aSopenharmony_ci        register unsigned short x= *((const uint16_t *)s);
208cabdff1aSopenharmony_ci        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
209cabdff1aSopenharmony_ci    }
210cabdff1aSopenharmony_ci}
211cabdff1aSopenharmony_ci
212cabdff1aSopenharmony_cistatic inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
213cabdff1aSopenharmony_ci{
214cabdff1aSopenharmony_ci    register const uint8_t* s=src;
215cabdff1aSopenharmony_ci    register uint8_t* d=dst;
216cabdff1aSopenharmony_ci    register const uint8_t *end;
217cabdff1aSopenharmony_ci    const uint8_t *mm_end;
218cabdff1aSopenharmony_ci    end = s + src_size;
219cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s));
220cabdff1aSopenharmony_ci    __asm__ volatile("movq        %0, %%mm7"::"m"(mask15rg));
221cabdff1aSopenharmony_ci    __asm__ volatile("movq        %0, %%mm6"::"m"(mask15b));
222cabdff1aSopenharmony_ci    mm_end = end - 15;
223cabdff1aSopenharmony_ci    while (s<mm_end) {
224cabdff1aSopenharmony_ci        __asm__ volatile(
225cabdff1aSopenharmony_ci            PREFETCH" 32(%1)        \n\t"
226cabdff1aSopenharmony_ci            "movq      (%1), %%mm0  \n\t"
227cabdff1aSopenharmony_ci            "movq     8(%1), %%mm2  \n\t"
228cabdff1aSopenharmony_ci            "movq     %%mm0, %%mm1  \n\t"
229cabdff1aSopenharmony_ci            "movq     %%mm2, %%mm3  \n\t"
230cabdff1aSopenharmony_ci            "psrlq       $1, %%mm0  \n\t"
231cabdff1aSopenharmony_ci            "psrlq       $1, %%mm2  \n\t"
232cabdff1aSopenharmony_ci            "pand     %%mm7, %%mm0  \n\t"
233cabdff1aSopenharmony_ci            "pand     %%mm7, %%mm2  \n\t"
234cabdff1aSopenharmony_ci            "pand     %%mm6, %%mm1  \n\t"
235cabdff1aSopenharmony_ci            "pand     %%mm6, %%mm3  \n\t"
236cabdff1aSopenharmony_ci            "por      %%mm1, %%mm0  \n\t"
237cabdff1aSopenharmony_ci            "por      %%mm3, %%mm2  \n\t"
238cabdff1aSopenharmony_ci            MOVNTQ"   %%mm0,  (%0)  \n\t"
239cabdff1aSopenharmony_ci            MOVNTQ"   %%mm2, 8(%0)"
240cabdff1aSopenharmony_ci            :: "r"(d), "r"(s)
241cabdff1aSopenharmony_ci        );
242cabdff1aSopenharmony_ci        d+=16;
243cabdff1aSopenharmony_ci        s+=16;
244cabdff1aSopenharmony_ci    }
245cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
246cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
247cabdff1aSopenharmony_ci    mm_end = end - 3;
248cabdff1aSopenharmony_ci    while (s < mm_end) {
249cabdff1aSopenharmony_ci        register uint32_t x= *((const uint32_t*)s);
250cabdff1aSopenharmony_ci        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
251cabdff1aSopenharmony_ci        s+=4;
252cabdff1aSopenharmony_ci        d+=4;
253cabdff1aSopenharmony_ci    }
254cabdff1aSopenharmony_ci    if (s < end) {
255cabdff1aSopenharmony_ci        register uint16_t x= *((const uint16_t*)s);
256cabdff1aSopenharmony_ci        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
257cabdff1aSopenharmony_ci    }
258cabdff1aSopenharmony_ci}
259cabdff1aSopenharmony_ci
260cabdff1aSopenharmony_cistatic inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
261cabdff1aSopenharmony_ci{
262cabdff1aSopenharmony_ci    const uint8_t *s = src;
263cabdff1aSopenharmony_ci    const uint8_t *end;
264cabdff1aSopenharmony_ci    const uint8_t *mm_end;
265cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
266cabdff1aSopenharmony_ci    end = s + src_size;
267cabdff1aSopenharmony_ci    mm_end = end - 15;
268cabdff1aSopenharmony_ci    __asm__ volatile(
269cabdff1aSopenharmony_ci        "movq           %3, %%mm5   \n\t"
270cabdff1aSopenharmony_ci        "movq           %4, %%mm6   \n\t"
271cabdff1aSopenharmony_ci        "movq           %5, %%mm7   \n\t"
272cabdff1aSopenharmony_ci        "jmp 2f                     \n\t"
273cabdff1aSopenharmony_ci        ".p2align        4          \n\t"
274cabdff1aSopenharmony_ci        "1:                         \n\t"
275cabdff1aSopenharmony_ci        PREFETCH"   32(%1)          \n\t"
276cabdff1aSopenharmony_ci        "movd         (%1), %%mm0   \n\t"
277cabdff1aSopenharmony_ci        "movd        4(%1), %%mm3   \n\t"
278cabdff1aSopenharmony_ci        "punpckldq   8(%1), %%mm0   \n\t"
279cabdff1aSopenharmony_ci        "punpckldq  12(%1), %%mm3   \n\t"
280cabdff1aSopenharmony_ci        "movq        %%mm0, %%mm1   \n\t"
281cabdff1aSopenharmony_ci        "movq        %%mm3, %%mm4   \n\t"
282cabdff1aSopenharmony_ci        "pand        %%mm6, %%mm0   \n\t"
283cabdff1aSopenharmony_ci        "pand        %%mm6, %%mm3   \n\t"
284cabdff1aSopenharmony_ci        "pmaddwd     %%mm7, %%mm0   \n\t"
285cabdff1aSopenharmony_ci        "pmaddwd     %%mm7, %%mm3   \n\t"
286cabdff1aSopenharmony_ci        "pand        %%mm5, %%mm1   \n\t"
287cabdff1aSopenharmony_ci        "pand        %%mm5, %%mm4   \n\t"
288cabdff1aSopenharmony_ci        "por         %%mm1, %%mm0   \n\t"
289cabdff1aSopenharmony_ci        "por         %%mm4, %%mm3   \n\t"
290cabdff1aSopenharmony_ci        "psrld          $5, %%mm0   \n\t"
291cabdff1aSopenharmony_ci        "pslld         $11, %%mm3   \n\t"
292cabdff1aSopenharmony_ci        "por         %%mm3, %%mm0   \n\t"
293cabdff1aSopenharmony_ci        MOVNTQ"      %%mm0, (%0)    \n\t"
294cabdff1aSopenharmony_ci        "add           $16,  %1     \n\t"
295cabdff1aSopenharmony_ci        "add            $8,  %0     \n\t"
296cabdff1aSopenharmony_ci        "2:                         \n\t"
297cabdff1aSopenharmony_ci        "cmp            %2,  %1     \n\t"
298cabdff1aSopenharmony_ci        " jb            1b          \n\t"
299cabdff1aSopenharmony_ci        : "+r" (d), "+r"(s)
300cabdff1aSopenharmony_ci        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
301cabdff1aSopenharmony_ci    );
302cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
303cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
304cabdff1aSopenharmony_ci    while (s < end) {
305cabdff1aSopenharmony_ci        register int rgb = *(const uint32_t*)s; s += 4;
306cabdff1aSopenharmony_ci        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
307cabdff1aSopenharmony_ci    }
308cabdff1aSopenharmony_ci}
309cabdff1aSopenharmony_ci
310cabdff1aSopenharmony_cistatic inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
311cabdff1aSopenharmony_ci{
312cabdff1aSopenharmony_ci    const uint8_t *s = src;
313cabdff1aSopenharmony_ci    const uint8_t *end;
314cabdff1aSopenharmony_ci    const uint8_t *mm_end;
315cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
316cabdff1aSopenharmony_ci    end = s + src_size;
317cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
318cabdff1aSopenharmony_ci    __asm__ volatile(
319cabdff1aSopenharmony_ci        "movq          %0, %%mm7    \n\t"
320cabdff1aSopenharmony_ci        "movq          %1, %%mm6    \n\t"
321cabdff1aSopenharmony_ci        ::"m"(red_16mask),"m"(green_16mask));
322cabdff1aSopenharmony_ci    mm_end = end - 15;
323cabdff1aSopenharmony_ci    while (s < mm_end) {
324cabdff1aSopenharmony_ci        __asm__ volatile(
325cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
326cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
327cabdff1aSopenharmony_ci            "movd       4(%1), %%mm3    \n\t"
328cabdff1aSopenharmony_ci            "punpckldq  8(%1), %%mm0    \n\t"
329cabdff1aSopenharmony_ci            "punpckldq 12(%1), %%mm3    \n\t"
330cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm1    \n\t"
331cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
332cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm4    \n\t"
333cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
334cabdff1aSopenharmony_ci            "psllq         $8, %%mm0    \n\t"
335cabdff1aSopenharmony_ci            "psllq         $8, %%mm3    \n\t"
336cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm0    \n\t"
337cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm3    \n\t"
338cabdff1aSopenharmony_ci            "psrlq         $5, %%mm1    \n\t"
339cabdff1aSopenharmony_ci            "psrlq         $5, %%mm4    \n\t"
340cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm1    \n\t"
341cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm4    \n\t"
342cabdff1aSopenharmony_ci            "psrlq        $19, %%mm2    \n\t"
343cabdff1aSopenharmony_ci            "psrlq        $19, %%mm5    \n\t"
344cabdff1aSopenharmony_ci            "pand          %2, %%mm2    \n\t"
345cabdff1aSopenharmony_ci            "pand          %2, %%mm5    \n\t"
346cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
347cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
348cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
349cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
350cabdff1aSopenharmony_ci            "psllq        $16, %%mm3    \n\t"
351cabdff1aSopenharmony_ci            "por        %%mm3, %%mm0    \n\t"
352cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0, (%0)     \n\t"
353cabdff1aSopenharmony_ci            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
354cabdff1aSopenharmony_ci        d += 4;
355cabdff1aSopenharmony_ci        s += 16;
356cabdff1aSopenharmony_ci    }
357cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
358cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
359cabdff1aSopenharmony_ci    while (s < end) {
360cabdff1aSopenharmony_ci        register int rgb = *(const uint32_t*)s; s += 4;
361cabdff1aSopenharmony_ci        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
362cabdff1aSopenharmony_ci    }
363cabdff1aSopenharmony_ci}
364cabdff1aSopenharmony_ci
365cabdff1aSopenharmony_cistatic inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
366cabdff1aSopenharmony_ci{
367cabdff1aSopenharmony_ci    const uint8_t *s = src;
368cabdff1aSopenharmony_ci    const uint8_t *end;
369cabdff1aSopenharmony_ci    const uint8_t *mm_end;
370cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
371cabdff1aSopenharmony_ci    end = s + src_size;
372cabdff1aSopenharmony_ci    mm_end = end - 15;
373cabdff1aSopenharmony_ci    __asm__ volatile(
374cabdff1aSopenharmony_ci        "movq           %3, %%mm5   \n\t"
375cabdff1aSopenharmony_ci        "movq           %4, %%mm6   \n\t"
376cabdff1aSopenharmony_ci        "movq           %5, %%mm7   \n\t"
377cabdff1aSopenharmony_ci        "jmp            2f          \n\t"
378cabdff1aSopenharmony_ci        ".p2align        4          \n\t"
379cabdff1aSopenharmony_ci        "1:                         \n\t"
380cabdff1aSopenharmony_ci        PREFETCH"   32(%1)          \n\t"
381cabdff1aSopenharmony_ci        "movd         (%1), %%mm0   \n\t"
382cabdff1aSopenharmony_ci        "movd        4(%1), %%mm3   \n\t"
383cabdff1aSopenharmony_ci        "punpckldq   8(%1), %%mm0   \n\t"
384cabdff1aSopenharmony_ci        "punpckldq  12(%1), %%mm3   \n\t"
385cabdff1aSopenharmony_ci        "movq        %%mm0, %%mm1   \n\t"
386cabdff1aSopenharmony_ci        "movq        %%mm3, %%mm4   \n\t"
387cabdff1aSopenharmony_ci        "pand        %%mm6, %%mm0   \n\t"
388cabdff1aSopenharmony_ci        "pand        %%mm6, %%mm3   \n\t"
389cabdff1aSopenharmony_ci        "pmaddwd     %%mm7, %%mm0   \n\t"
390cabdff1aSopenharmony_ci        "pmaddwd     %%mm7, %%mm3   \n\t"
391cabdff1aSopenharmony_ci        "pand        %%mm5, %%mm1   \n\t"
392cabdff1aSopenharmony_ci        "pand        %%mm5, %%mm4   \n\t"
393cabdff1aSopenharmony_ci        "por         %%mm1, %%mm0   \n\t"
394cabdff1aSopenharmony_ci        "por         %%mm4, %%mm3   \n\t"
395cabdff1aSopenharmony_ci        "psrld          $6, %%mm0   \n\t"
396cabdff1aSopenharmony_ci        "pslld         $10, %%mm3   \n\t"
397cabdff1aSopenharmony_ci        "por         %%mm3, %%mm0   \n\t"
398cabdff1aSopenharmony_ci        MOVNTQ"      %%mm0, (%0)    \n\t"
399cabdff1aSopenharmony_ci        "add           $16,  %1     \n\t"
400cabdff1aSopenharmony_ci        "add            $8,  %0     \n\t"
401cabdff1aSopenharmony_ci        "2:                         \n\t"
402cabdff1aSopenharmony_ci        "cmp            %2,  %1     \n\t"
403cabdff1aSopenharmony_ci        " jb            1b          \n\t"
404cabdff1aSopenharmony_ci        : "+r" (d), "+r"(s)
405cabdff1aSopenharmony_ci        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
406cabdff1aSopenharmony_ci    );
407cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
408cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
409cabdff1aSopenharmony_ci    while (s < end) {
410cabdff1aSopenharmony_ci        register int rgb = *(const uint32_t*)s; s += 4;
411cabdff1aSopenharmony_ci        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
412cabdff1aSopenharmony_ci    }
413cabdff1aSopenharmony_ci}
414cabdff1aSopenharmony_ci
415cabdff1aSopenharmony_cistatic inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
416cabdff1aSopenharmony_ci{
417cabdff1aSopenharmony_ci    const uint8_t *s = src;
418cabdff1aSopenharmony_ci    const uint8_t *end;
419cabdff1aSopenharmony_ci    const uint8_t *mm_end;
420cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
421cabdff1aSopenharmony_ci    end = s + src_size;
422cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
423cabdff1aSopenharmony_ci    __asm__ volatile(
424cabdff1aSopenharmony_ci        "movq          %0, %%mm7    \n\t"
425cabdff1aSopenharmony_ci        "movq          %1, %%mm6    \n\t"
426cabdff1aSopenharmony_ci        ::"m"(red_15mask),"m"(green_15mask));
427cabdff1aSopenharmony_ci    mm_end = end - 15;
428cabdff1aSopenharmony_ci    while (s < mm_end) {
429cabdff1aSopenharmony_ci        __asm__ volatile(
430cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
431cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
432cabdff1aSopenharmony_ci            "movd       4(%1), %%mm3    \n\t"
433cabdff1aSopenharmony_ci            "punpckldq  8(%1), %%mm0    \n\t"
434cabdff1aSopenharmony_ci            "punpckldq 12(%1), %%mm3    \n\t"
435cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm1    \n\t"
436cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
437cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm4    \n\t"
438cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
439cabdff1aSopenharmony_ci            "psllq         $7, %%mm0    \n\t"
440cabdff1aSopenharmony_ci            "psllq         $7, %%mm3    \n\t"
441cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm0    \n\t"
442cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm3    \n\t"
443cabdff1aSopenharmony_ci            "psrlq         $6, %%mm1    \n\t"
444cabdff1aSopenharmony_ci            "psrlq         $6, %%mm4    \n\t"
445cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm1    \n\t"
446cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm4    \n\t"
447cabdff1aSopenharmony_ci            "psrlq        $19, %%mm2    \n\t"
448cabdff1aSopenharmony_ci            "psrlq        $19, %%mm5    \n\t"
449cabdff1aSopenharmony_ci            "pand          %2, %%mm2    \n\t"
450cabdff1aSopenharmony_ci            "pand          %2, %%mm5    \n\t"
451cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
452cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
453cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
454cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
455cabdff1aSopenharmony_ci            "psllq        $16, %%mm3    \n\t"
456cabdff1aSopenharmony_ci            "por        %%mm3, %%mm0    \n\t"
457cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0, (%0)     \n\t"
458cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
459cabdff1aSopenharmony_ci        d += 4;
460cabdff1aSopenharmony_ci        s += 16;
461cabdff1aSopenharmony_ci    }
462cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
463cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
464cabdff1aSopenharmony_ci    while (s < end) {
465cabdff1aSopenharmony_ci        register int rgb = *(const uint32_t*)s; s += 4;
466cabdff1aSopenharmony_ci        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
467cabdff1aSopenharmony_ci    }
468cabdff1aSopenharmony_ci}
469cabdff1aSopenharmony_ci
470cabdff1aSopenharmony_cistatic inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
471cabdff1aSopenharmony_ci{
472cabdff1aSopenharmony_ci    const uint8_t *s = src;
473cabdff1aSopenharmony_ci    const uint8_t *end;
474cabdff1aSopenharmony_ci    const uint8_t *mm_end;
475cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
476cabdff1aSopenharmony_ci    end = s + src_size;
477cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
478cabdff1aSopenharmony_ci    __asm__ volatile(
479cabdff1aSopenharmony_ci        "movq         %0, %%mm7     \n\t"
480cabdff1aSopenharmony_ci        "movq         %1, %%mm6     \n\t"
481cabdff1aSopenharmony_ci        ::"m"(red_16mask),"m"(green_16mask));
482cabdff1aSopenharmony_ci    mm_end = end - 11;
483cabdff1aSopenharmony_ci    while (s < mm_end) {
484cabdff1aSopenharmony_ci        __asm__ volatile(
485cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
486cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
487cabdff1aSopenharmony_ci            "movd       3(%1), %%mm3    \n\t"
488cabdff1aSopenharmony_ci            "punpckldq  6(%1), %%mm0    \n\t"
489cabdff1aSopenharmony_ci            "punpckldq  9(%1), %%mm3    \n\t"
490cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm1    \n\t"
491cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
492cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm4    \n\t"
493cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
494cabdff1aSopenharmony_ci            "psrlq         $3, %%mm0    \n\t"
495cabdff1aSopenharmony_ci            "psrlq         $3, %%mm3    \n\t"
496cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
497cabdff1aSopenharmony_ci            "pand          %2, %%mm3    \n\t"
498cabdff1aSopenharmony_ci            "psrlq         $5, %%mm1    \n\t"
499cabdff1aSopenharmony_ci            "psrlq         $5, %%mm4    \n\t"
500cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm1    \n\t"
501cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm4    \n\t"
502cabdff1aSopenharmony_ci            "psrlq         $8, %%mm2    \n\t"
503cabdff1aSopenharmony_ci            "psrlq         $8, %%mm5    \n\t"
504cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm2    \n\t"
505cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm5    \n\t"
506cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
507cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
508cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
509cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
510cabdff1aSopenharmony_ci            "psllq        $16, %%mm3    \n\t"
511cabdff1aSopenharmony_ci            "por        %%mm3, %%mm0    \n\t"
512cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0, (%0)     \n\t"
513cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
514cabdff1aSopenharmony_ci        d += 4;
515cabdff1aSopenharmony_ci        s += 12;
516cabdff1aSopenharmony_ci    }
517cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
518cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
519cabdff1aSopenharmony_ci    while (s < end) {
520cabdff1aSopenharmony_ci        const int b = *s++;
521cabdff1aSopenharmony_ci        const int g = *s++;
522cabdff1aSopenharmony_ci        const int r = *s++;
523cabdff1aSopenharmony_ci        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
524cabdff1aSopenharmony_ci    }
525cabdff1aSopenharmony_ci}
526cabdff1aSopenharmony_ci
527cabdff1aSopenharmony_cistatic inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
528cabdff1aSopenharmony_ci{
529cabdff1aSopenharmony_ci    const uint8_t *s = src;
530cabdff1aSopenharmony_ci    const uint8_t *end;
531cabdff1aSopenharmony_ci    const uint8_t *mm_end;
532cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
533cabdff1aSopenharmony_ci    end = s + src_size;
534cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
535cabdff1aSopenharmony_ci    __asm__ volatile(
536cabdff1aSopenharmony_ci        "movq         %0, %%mm7     \n\t"
537cabdff1aSopenharmony_ci        "movq         %1, %%mm6     \n\t"
538cabdff1aSopenharmony_ci        ::"m"(red_16mask),"m"(green_16mask));
539cabdff1aSopenharmony_ci    mm_end = end - 15;
540cabdff1aSopenharmony_ci    while (s < mm_end) {
541cabdff1aSopenharmony_ci        __asm__ volatile(
542cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
543cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
544cabdff1aSopenharmony_ci            "movd       3(%1), %%mm3    \n\t"
545cabdff1aSopenharmony_ci            "punpckldq  6(%1), %%mm0    \n\t"
546cabdff1aSopenharmony_ci            "punpckldq  9(%1), %%mm3    \n\t"
547cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm1    \n\t"
548cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
549cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm4    \n\t"
550cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
551cabdff1aSopenharmony_ci            "psllq         $8, %%mm0    \n\t"
552cabdff1aSopenharmony_ci            "psllq         $8, %%mm3    \n\t"
553cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm0    \n\t"
554cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm3    \n\t"
555cabdff1aSopenharmony_ci            "psrlq         $5, %%mm1    \n\t"
556cabdff1aSopenharmony_ci            "psrlq         $5, %%mm4    \n\t"
557cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm1    \n\t"
558cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm4    \n\t"
559cabdff1aSopenharmony_ci            "psrlq        $19, %%mm2    \n\t"
560cabdff1aSopenharmony_ci            "psrlq        $19, %%mm5    \n\t"
561cabdff1aSopenharmony_ci            "pand          %2, %%mm2    \n\t"
562cabdff1aSopenharmony_ci            "pand          %2, %%mm5    \n\t"
563cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
564cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
565cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
566cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
567cabdff1aSopenharmony_ci            "psllq        $16, %%mm3    \n\t"
568cabdff1aSopenharmony_ci            "por        %%mm3, %%mm0    \n\t"
569cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0, (%0)     \n\t"
570cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
571cabdff1aSopenharmony_ci        d += 4;
572cabdff1aSopenharmony_ci        s += 12;
573cabdff1aSopenharmony_ci    }
574cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
575cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
576cabdff1aSopenharmony_ci    while (s < end) {
577cabdff1aSopenharmony_ci        const int r = *s++;
578cabdff1aSopenharmony_ci        const int g = *s++;
579cabdff1aSopenharmony_ci        const int b = *s++;
580cabdff1aSopenharmony_ci        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
581cabdff1aSopenharmony_ci    }
582cabdff1aSopenharmony_ci}
583cabdff1aSopenharmony_ci
584cabdff1aSopenharmony_cistatic inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
585cabdff1aSopenharmony_ci{
586cabdff1aSopenharmony_ci    const uint8_t *s = src;
587cabdff1aSopenharmony_ci    const uint8_t *end;
588cabdff1aSopenharmony_ci    const uint8_t *mm_end;
589cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
590cabdff1aSopenharmony_ci    end = s + src_size;
591cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
592cabdff1aSopenharmony_ci    __asm__ volatile(
593cabdff1aSopenharmony_ci        "movq          %0, %%mm7    \n\t"
594cabdff1aSopenharmony_ci        "movq          %1, %%mm6    \n\t"
595cabdff1aSopenharmony_ci        ::"m"(red_15mask),"m"(green_15mask));
596cabdff1aSopenharmony_ci    mm_end = end - 11;
597cabdff1aSopenharmony_ci    while (s < mm_end) {
598cabdff1aSopenharmony_ci        __asm__ volatile(
599cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
600cabdff1aSopenharmony_ci            "movd        (%1), %%mm0    \n\t"
601cabdff1aSopenharmony_ci            "movd       3(%1), %%mm3    \n\t"
602cabdff1aSopenharmony_ci            "punpckldq  6(%1), %%mm0    \n\t"
603cabdff1aSopenharmony_ci            "punpckldq  9(%1), %%mm3    \n\t"
604cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm1    \n\t"
605cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
606cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm4    \n\t"
607cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
608cabdff1aSopenharmony_ci            "psrlq         $3, %%mm0    \n\t"
609cabdff1aSopenharmony_ci            "psrlq         $3, %%mm3    \n\t"
610cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
611cabdff1aSopenharmony_ci            "pand          %2, %%mm3    \n\t"
612cabdff1aSopenharmony_ci            "psrlq         $6, %%mm1    \n\t"
613cabdff1aSopenharmony_ci            "psrlq         $6, %%mm4    \n\t"
614cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm1    \n\t"
615cabdff1aSopenharmony_ci            "pand       %%mm6, %%mm4    \n\t"
616cabdff1aSopenharmony_ci            "psrlq         $9, %%mm2    \n\t"
617cabdff1aSopenharmony_ci            "psrlq         $9, %%mm5    \n\t"
618cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm2    \n\t"
619cabdff1aSopenharmony_ci            "pand       %%mm7, %%mm5    \n\t"
620cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
621cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
622cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
623cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
624cabdff1aSopenharmony_ci            "psllq        $16, %%mm3    \n\t"
625cabdff1aSopenharmony_ci            "por        %%mm3, %%mm0    \n\t"
626cabdff1aSopenharmony_ci            MOVNTQ"     %%mm0, (%0)     \n\t"
627cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
628cabdff1aSopenharmony_ci        d += 4;
629cabdff1aSopenharmony_ci        s += 12;
630cabdff1aSopenharmony_ci    }
631cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
632cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
633cabdff1aSopenharmony_ci    while (s < end) {
634cabdff1aSopenharmony_ci        const int b = *s++;
635cabdff1aSopenharmony_ci        const int g = *s++;
636cabdff1aSopenharmony_ci        const int r = *s++;
637cabdff1aSopenharmony_ci        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
638cabdff1aSopenharmony_ci    }
639cabdff1aSopenharmony_ci}
640cabdff1aSopenharmony_ci
641cabdff1aSopenharmony_cistatic inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
642cabdff1aSopenharmony_ci{
643cabdff1aSopenharmony_ci    const uint8_t *s = src;
644cabdff1aSopenharmony_ci    const uint8_t *end;
645cabdff1aSopenharmony_ci    const uint8_t *mm_end;
646cabdff1aSopenharmony_ci    uint16_t *d = (uint16_t *)dst;
647cabdff1aSopenharmony_ci    end = s + src_size;
648cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*src):"memory");
649cabdff1aSopenharmony_ci    __asm__ volatile(
650cabdff1aSopenharmony_ci        "movq         %0, %%mm7     \n\t"
651cabdff1aSopenharmony_ci        "movq         %1, %%mm6     \n\t"
652cabdff1aSopenharmony_ci        ::"m"(red_15mask),"m"(green_15mask));
653cabdff1aSopenharmony_ci    mm_end = end - 15;
654cabdff1aSopenharmony_ci    while (s < mm_end) {
655cabdff1aSopenharmony_ci        __asm__ volatile(
656cabdff1aSopenharmony_ci            PREFETCH" 32(%1)            \n\t"
657cabdff1aSopenharmony_ci            "movd       (%1), %%mm0     \n\t"
658cabdff1aSopenharmony_ci            "movd      3(%1), %%mm3     \n\t"
659cabdff1aSopenharmony_ci            "punpckldq 6(%1), %%mm0     \n\t"
660cabdff1aSopenharmony_ci            "punpckldq 9(%1), %%mm3     \n\t"
661cabdff1aSopenharmony_ci            "movq      %%mm0, %%mm1     \n\t"
662cabdff1aSopenharmony_ci            "movq      %%mm0, %%mm2     \n\t"
663cabdff1aSopenharmony_ci            "movq      %%mm3, %%mm4     \n\t"
664cabdff1aSopenharmony_ci            "movq      %%mm3, %%mm5     \n\t"
665cabdff1aSopenharmony_ci            "psllq        $7, %%mm0     \n\t"
666cabdff1aSopenharmony_ci            "psllq        $7, %%mm3     \n\t"
667cabdff1aSopenharmony_ci            "pand      %%mm7, %%mm0     \n\t"
668cabdff1aSopenharmony_ci            "pand      %%mm7, %%mm3     \n\t"
669cabdff1aSopenharmony_ci            "psrlq        $6, %%mm1     \n\t"
670cabdff1aSopenharmony_ci            "psrlq        $6, %%mm4     \n\t"
671cabdff1aSopenharmony_ci            "pand      %%mm6, %%mm1     \n\t"
672cabdff1aSopenharmony_ci            "pand      %%mm6, %%mm4     \n\t"
673cabdff1aSopenharmony_ci            "psrlq       $19, %%mm2     \n\t"
674cabdff1aSopenharmony_ci            "psrlq       $19, %%mm5     \n\t"
675cabdff1aSopenharmony_ci            "pand         %2, %%mm2     \n\t"
676cabdff1aSopenharmony_ci            "pand         %2, %%mm5     \n\t"
677cabdff1aSopenharmony_ci            "por       %%mm1, %%mm0     \n\t"
678cabdff1aSopenharmony_ci            "por       %%mm4, %%mm3     \n\t"
679cabdff1aSopenharmony_ci            "por       %%mm2, %%mm0     \n\t"
680cabdff1aSopenharmony_ci            "por       %%mm5, %%mm3     \n\t"
681cabdff1aSopenharmony_ci            "psllq       $16, %%mm3     \n\t"
682cabdff1aSopenharmony_ci            "por       %%mm3, %%mm0     \n\t"
683cabdff1aSopenharmony_ci            MOVNTQ"    %%mm0, (%0)      \n\t"
684cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
685cabdff1aSopenharmony_ci        d += 4;
686cabdff1aSopenharmony_ci        s += 12;
687cabdff1aSopenharmony_ci    }
688cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
689cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
690cabdff1aSopenharmony_ci    while (s < end) {
691cabdff1aSopenharmony_ci        const int r = *s++;
692cabdff1aSopenharmony_ci        const int g = *s++;
693cabdff1aSopenharmony_ci        const int b = *s++;
694cabdff1aSopenharmony_ci        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
695cabdff1aSopenharmony_ci    }
696cabdff1aSopenharmony_ci}
697cabdff1aSopenharmony_ci
698cabdff1aSopenharmony_cistatic inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
699cabdff1aSopenharmony_ci{
700cabdff1aSopenharmony_ci    const uint16_t *end;
701cabdff1aSopenharmony_ci    const uint16_t *mm_end;
702cabdff1aSopenharmony_ci    uint8_t *d = dst;
703cabdff1aSopenharmony_ci    const uint16_t *s = (const uint16_t*)src;
704cabdff1aSopenharmony_ci    end = s + src_size/2;
705cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
706cabdff1aSopenharmony_ci    mm_end = end - 7;
707cabdff1aSopenharmony_ci    while (s < mm_end) {
708cabdff1aSopenharmony_ci        __asm__ volatile(
709cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
710cabdff1aSopenharmony_ci            "movq        (%1), %%mm0    \n\t"
711cabdff1aSopenharmony_ci            "movq        (%1), %%mm1    \n\t"
712cabdff1aSopenharmony_ci            "movq        (%1), %%mm2    \n\t"
713cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
714cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
715cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
716cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
717cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
718cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
719cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
720cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm3    \n\t"
721cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm4    \n\t"
722cabdff1aSopenharmony_ci            "movq       %%mm2, %%mm5    \n\t"
723cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm0    \n\t"
724cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm1    \n\t"
725cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm2    \n\t"
726cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm3    \n\t"
727cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm4    \n\t"
728cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm5    \n\t"
729cabdff1aSopenharmony_ci            "psllq         $8, %%mm1    \n\t"
730cabdff1aSopenharmony_ci            "psllq        $16, %%mm2    \n\t"
731cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
732cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
733cabdff1aSopenharmony_ci            "psllq         $8, %%mm4    \n\t"
734cabdff1aSopenharmony_ci            "psllq        $16, %%mm5    \n\t"
735cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
736cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
737cabdff1aSopenharmony_ci
738cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm6    \n\t"
739cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm7    \n\t"
740cabdff1aSopenharmony_ci
741cabdff1aSopenharmony_ci            "movq       8(%1), %%mm0    \n\t"
742cabdff1aSopenharmony_ci            "movq       8(%1), %%mm1    \n\t"
743cabdff1aSopenharmony_ci            "movq       8(%1), %%mm2    \n\t"
744cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
745cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
746cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
747cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
748cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
749cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm1    \n\t"
750cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
751cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm3    \n\t"
752cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm4    \n\t"
753cabdff1aSopenharmony_ci            "movq       %%mm2, %%mm5    \n\t"
754cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm0    \n\t"
755cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm1    \n\t"
756cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm2    \n\t"
757cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm3    \n\t"
758cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm4    \n\t"
759cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm5    \n\t"
760cabdff1aSopenharmony_ci            "psllq         $8, %%mm1    \n\t"
761cabdff1aSopenharmony_ci            "psllq        $16, %%mm2    \n\t"
762cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
763cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
764cabdff1aSopenharmony_ci            "psllq         $8, %%mm4    \n\t"
765cabdff1aSopenharmony_ci            "psllq        $16, %%mm5    \n\t"
766cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
767cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
768cabdff1aSopenharmony_ci
769cabdff1aSopenharmony_ci            :"=m"(*d)
770cabdff1aSopenharmony_ci            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
771cabdff1aSopenharmony_ci             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
772cabdff1aSopenharmony_ci            :"memory");
773cabdff1aSopenharmony_ci        /* borrowed 32 to 24 */
774cabdff1aSopenharmony_ci        __asm__ volatile(
775cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm4    \n\t"
776cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
777cabdff1aSopenharmony_ci            "movq       %%mm6, %%mm0    \n\t"
778cabdff1aSopenharmony_ci            "movq       %%mm7, %%mm1    \n\t"
779cabdff1aSopenharmony_ci
780cabdff1aSopenharmony_ci            "movq       %%mm4, %%mm6    \n\t"
781cabdff1aSopenharmony_ci            "movq       %%mm5, %%mm7    \n\t"
782cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
783cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm3    \n\t"
784cabdff1aSopenharmony_ci
785cabdff1aSopenharmony_ci            STORE_BGR24_MMX
786cabdff1aSopenharmony_ci
787cabdff1aSopenharmony_ci            :: "r"(d), "m"(*s)
788cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
789cabdff1aSopenharmony_ci            :"memory");
790cabdff1aSopenharmony_ci        d += 24;
791cabdff1aSopenharmony_ci        s += 8;
792cabdff1aSopenharmony_ci    }
793cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
794cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
795cabdff1aSopenharmony_ci    while (s < end) {
796cabdff1aSopenharmony_ci        register uint16_t bgr;
797cabdff1aSopenharmony_ci        bgr = *s++;
798cabdff1aSopenharmony_ci        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
799cabdff1aSopenharmony_ci        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
800cabdff1aSopenharmony_ci        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
801cabdff1aSopenharmony_ci    }
802cabdff1aSopenharmony_ci}
803cabdff1aSopenharmony_ci
804cabdff1aSopenharmony_cistatic inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
805cabdff1aSopenharmony_ci{
806cabdff1aSopenharmony_ci    const uint16_t *end;
807cabdff1aSopenharmony_ci    const uint16_t *mm_end;
808cabdff1aSopenharmony_ci    uint8_t *d = (uint8_t *)dst;
809cabdff1aSopenharmony_ci    const uint16_t *s = (const uint16_t *)src;
810cabdff1aSopenharmony_ci    end = s + src_size/2;
811cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
812cabdff1aSopenharmony_ci    mm_end = end - 7;
813cabdff1aSopenharmony_ci    while (s < mm_end) {
814cabdff1aSopenharmony_ci        __asm__ volatile(
815cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
816cabdff1aSopenharmony_ci            "movq        (%1), %%mm0    \n\t"
817cabdff1aSopenharmony_ci            "movq        (%1), %%mm1    \n\t"
818cabdff1aSopenharmony_ci            "movq        (%1), %%mm2    \n\t"
819cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
820cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
821cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
822cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
823cabdff1aSopenharmony_ci            "psrlq         $1, %%mm2    \n\t"
824cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
825cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
826cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
827cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm3    \n\t"
828cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm4    \n\t"
829cabdff1aSopenharmony_ci            "movq       %%mm2, %%mm5    \n\t"
830cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm0    \n\t"
831cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm1    \n\t"
832cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm2    \n\t"
833cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm3    \n\t"
834cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm4    \n\t"
835cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm5    \n\t"
836cabdff1aSopenharmony_ci            "psllq         $8, %%mm1    \n\t"
837cabdff1aSopenharmony_ci            "psllq        $16, %%mm2    \n\t"
838cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
839cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
840cabdff1aSopenharmony_ci            "psllq         $8, %%mm4    \n\t"
841cabdff1aSopenharmony_ci            "psllq        $16, %%mm5    \n\t"
842cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
843cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
844cabdff1aSopenharmony_ci
845cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm6    \n\t"
846cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm7    \n\t"
847cabdff1aSopenharmony_ci
848cabdff1aSopenharmony_ci            "movq       8(%1), %%mm0    \n\t"
849cabdff1aSopenharmony_ci            "movq       8(%1), %%mm1    \n\t"
850cabdff1aSopenharmony_ci            "movq       8(%1), %%mm2    \n\t"
851cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
852cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
853cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
854cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
855cabdff1aSopenharmony_ci            "psrlq         $1, %%mm2    \n\t"
856cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_mid)", %%mm0    \n\t"
857cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
858cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
859cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm3    \n\t"
860cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm4    \n\t"
861cabdff1aSopenharmony_ci            "movq       %%mm2, %%mm5    \n\t"
862cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm0    \n\t"
863cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm1    \n\t"
864cabdff1aSopenharmony_ci            "punpcklwd     %5, %%mm2    \n\t"
865cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm3    \n\t"
866cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm4    \n\t"
867cabdff1aSopenharmony_ci            "punpckhwd     %5, %%mm5    \n\t"
868cabdff1aSopenharmony_ci            "psllq         $8, %%mm1    \n\t"
869cabdff1aSopenharmony_ci            "psllq        $16, %%mm2    \n\t"
870cabdff1aSopenharmony_ci            "por        %%mm1, %%mm0    \n\t"
871cabdff1aSopenharmony_ci            "por        %%mm2, %%mm0    \n\t"
872cabdff1aSopenharmony_ci            "psllq         $8, %%mm4    \n\t"
873cabdff1aSopenharmony_ci            "psllq        $16, %%mm5    \n\t"
874cabdff1aSopenharmony_ci            "por        %%mm4, %%mm3    \n\t"
875cabdff1aSopenharmony_ci            "por        %%mm5, %%mm3    \n\t"
876cabdff1aSopenharmony_ci            :"=m"(*d)
877cabdff1aSopenharmony_ci            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
878cabdff1aSopenharmony_ci             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
879cabdff1aSopenharmony_ci            :"memory");
880cabdff1aSopenharmony_ci        /* borrowed 32 to 24 */
881cabdff1aSopenharmony_ci        __asm__ volatile(
882cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm4    \n\t"
883cabdff1aSopenharmony_ci            "movq       %%mm3, %%mm5    \n\t"
884cabdff1aSopenharmony_ci            "movq       %%mm6, %%mm0    \n\t"
885cabdff1aSopenharmony_ci            "movq       %%mm7, %%mm1    \n\t"
886cabdff1aSopenharmony_ci
887cabdff1aSopenharmony_ci            "movq       %%mm4, %%mm6    \n\t"
888cabdff1aSopenharmony_ci            "movq       %%mm5, %%mm7    \n\t"
889cabdff1aSopenharmony_ci            "movq       %%mm0, %%mm2    \n\t"
890cabdff1aSopenharmony_ci            "movq       %%mm1, %%mm3    \n\t"
891cabdff1aSopenharmony_ci
892cabdff1aSopenharmony_ci            STORE_BGR24_MMX
893cabdff1aSopenharmony_ci
894cabdff1aSopenharmony_ci            :: "r"(d), "m"(*s)
895cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
896cabdff1aSopenharmony_ci            :"memory");
897cabdff1aSopenharmony_ci        d += 24;
898cabdff1aSopenharmony_ci        s += 8;
899cabdff1aSopenharmony_ci    }
900cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
901cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
902cabdff1aSopenharmony_ci    while (s < end) {
903cabdff1aSopenharmony_ci        register uint16_t bgr;
904cabdff1aSopenharmony_ci        bgr = *s++;
905cabdff1aSopenharmony_ci        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
906cabdff1aSopenharmony_ci        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
907cabdff1aSopenharmony_ci        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
908cabdff1aSopenharmony_ci    }
909cabdff1aSopenharmony_ci}
910cabdff1aSopenharmony_ci
/*
 * PACK_RGB32: asm fragment that interleaves four pixels, held as one
 * zero-extended word per channel, into 4-byte pixels and stores them.
 *
 * Register contents expected on entry (most significant byte first):
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 *
 * Asm operand %0 must be the destination pointer: 16 bytes are written, to
 * (%0) and 8(%0). mm0-mm3 are overwritten. MOVNTQ has to expand to a string
 * literal, since it is concatenated with the operand text.
 *
 * The last line deliberately has no line continuation, so the macro ends
 * here regardless of what follows it in the file.
 */
#define PACK_RGB32 \
    "packuswb   %%mm7, %%mm0    \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb   %%mm7, %%mm1    \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb   %%mm7, %%mm2    \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw  %%mm1, %%mm0    \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw  %%mm6, %%mm2    \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq       %%mm0, %%mm3    \n\t"                               \
    "punpcklwd  %%mm2, %%mm0    \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd  %%mm2, %%mm3    \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ"     %%mm0,  (%0)    \n\t"                               \
    MOVNTQ"     %%mm3, 8(%0)    \n\t"
929cabdff1aSopenharmony_ci
930cabdff1aSopenharmony_cistatic inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
931cabdff1aSopenharmony_ci{
932cabdff1aSopenharmony_ci    const uint16_t *end;
933cabdff1aSopenharmony_ci    const uint16_t *mm_end;
934cabdff1aSopenharmony_ci    uint8_t *d = dst;
935cabdff1aSopenharmony_ci    const uint16_t *s = (const uint16_t *)src;
936cabdff1aSopenharmony_ci    end = s + src_size/2;
937cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
938cabdff1aSopenharmony_ci    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
939cabdff1aSopenharmony_ci    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
940cabdff1aSopenharmony_ci    mm_end = end - 3;
941cabdff1aSopenharmony_ci    while (s < mm_end) {
942cabdff1aSopenharmony_ci        __asm__ volatile(
943cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
944cabdff1aSopenharmony_ci            "movq        (%1), %%mm0    \n\t"
945cabdff1aSopenharmony_ci            "movq        (%1), %%mm1    \n\t"
946cabdff1aSopenharmony_ci            "movq        (%1), %%mm2    \n\t"
947cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
948cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
949cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
950cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
951cabdff1aSopenharmony_ci            "pmulhw        %5, %%mm0    \n\t"
952cabdff1aSopenharmony_ci            "pmulhw        %5, %%mm1    \n\t"
953cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
954cabdff1aSopenharmony_ci            PACK_RGB32
955cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
956cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(mul15_hi)
957cabdff1aSopenharmony_ci            :"memory");
958cabdff1aSopenharmony_ci        d += 16;
959cabdff1aSopenharmony_ci        s += 4;
960cabdff1aSopenharmony_ci    }
961cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
962cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
963cabdff1aSopenharmony_ci    while (s < end) {
964cabdff1aSopenharmony_ci        register uint16_t bgr;
965cabdff1aSopenharmony_ci        bgr = *s++;
966cabdff1aSopenharmony_ci        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
967cabdff1aSopenharmony_ci        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
968cabdff1aSopenharmony_ci        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
969cabdff1aSopenharmony_ci        *d++ = 255;
970cabdff1aSopenharmony_ci    }
971cabdff1aSopenharmony_ci}
972cabdff1aSopenharmony_ci
973cabdff1aSopenharmony_cistatic inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
974cabdff1aSopenharmony_ci{
975cabdff1aSopenharmony_ci    const uint16_t *end;
976cabdff1aSopenharmony_ci    const uint16_t *mm_end;
977cabdff1aSopenharmony_ci    uint8_t *d = dst;
978cabdff1aSopenharmony_ci    const uint16_t *s = (const uint16_t*)src;
979cabdff1aSopenharmony_ci    end = s + src_size/2;
980cabdff1aSopenharmony_ci    __asm__ volatile(PREFETCH"    %0"::"m"(*s):"memory");
981cabdff1aSopenharmony_ci    __asm__ volatile("pxor    %%mm7,%%mm7    \n\t":::"memory");
982cabdff1aSopenharmony_ci    __asm__ volatile("pcmpeqd %%mm6,%%mm6    \n\t":::"memory");
983cabdff1aSopenharmony_ci    mm_end = end - 3;
984cabdff1aSopenharmony_ci    while (s < mm_end) {
985cabdff1aSopenharmony_ci        __asm__ volatile(
986cabdff1aSopenharmony_ci            PREFETCH"  32(%1)           \n\t"
987cabdff1aSopenharmony_ci            "movq        (%1), %%mm0    \n\t"
988cabdff1aSopenharmony_ci            "movq        (%1), %%mm1    \n\t"
989cabdff1aSopenharmony_ci            "movq        (%1), %%mm2    \n\t"
990cabdff1aSopenharmony_ci            "pand          %2, %%mm0    \n\t"
991cabdff1aSopenharmony_ci            "pand          %3, %%mm1    \n\t"
992cabdff1aSopenharmony_ci            "pand          %4, %%mm2    \n\t"
993cabdff1aSopenharmony_ci            "psllq         $5, %%mm0    \n\t"
994cabdff1aSopenharmony_ci            "psrlq         $1, %%mm2    \n\t"
995cabdff1aSopenharmony_ci            "pmulhw        %5, %%mm0    \n\t"
996cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul16_mid)", %%mm1    \n\t"
997cabdff1aSopenharmony_ci            "pmulhw        "MANGLE(mul15_hi)", %%mm2    \n\t"
998cabdff1aSopenharmony_ci            PACK_RGB32
999cabdff1aSopenharmony_ci            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
1000cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
1001cabdff1aSopenharmony_ci            :"memory");
1002cabdff1aSopenharmony_ci        d += 16;
1003cabdff1aSopenharmony_ci        s += 4;
1004cabdff1aSopenharmony_ci    }
1005cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
1006cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
1007cabdff1aSopenharmony_ci    while (s < end) {
1008cabdff1aSopenharmony_ci        register uint16_t bgr;
1009cabdff1aSopenharmony_ci        bgr = *s++;
1010cabdff1aSopenharmony_ci        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
1011cabdff1aSopenharmony_ci        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
1012cabdff1aSopenharmony_ci        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1013cabdff1aSopenharmony_ci        *d++ = 255;
1014cabdff1aSopenharmony_ci    }
1015cabdff1aSopenharmony_ci}
1016cabdff1aSopenharmony_ci
1017cabdff1aSopenharmony_cistatic inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
1018cabdff1aSopenharmony_ci{
1019cabdff1aSopenharmony_ci    unsigned i;
1020cabdff1aSopenharmony_ci    x86_reg mmx_size= 23 - src_size;
1021cabdff1aSopenharmony_ci    __asm__ volatile (
1022cabdff1aSopenharmony_ci        "test             %%"FF_REG_a", %%"FF_REG_a"    \n\t"
1023cabdff1aSopenharmony_ci        "jns                     2f                     \n\t"
1024cabdff1aSopenharmony_ci        "movq     "MANGLE(mask24r)", %%mm5              \n\t"
1025cabdff1aSopenharmony_ci        "movq     "MANGLE(mask24g)", %%mm6              \n\t"
1026cabdff1aSopenharmony_ci        "movq     "MANGLE(mask24b)", %%mm7              \n\t"
1027cabdff1aSopenharmony_ci        ".p2align                 4                     \n\t"
1028cabdff1aSopenharmony_ci        "1:                                             \n\t"
1029cabdff1aSopenharmony_ci        PREFETCH" 32(%1, %%"FF_REG_a")                  \n\t"
1030cabdff1aSopenharmony_ci        "movq    (%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1031cabdff1aSopenharmony_ci        "movq    (%1, %%"FF_REG_a"), %%mm1              \n\t" // BGR BGR BG
1032cabdff1aSopenharmony_ci        "movq   2(%1, %%"FF_REG_a"), %%mm2              \n\t" // R BGR BGR B
1033cabdff1aSopenharmony_ci        "psllq                  $16, %%mm0              \n\t" // 00 BGR BGR
1034cabdff1aSopenharmony_ci        "pand                 %%mm5, %%mm0              \n\t"
1035cabdff1aSopenharmony_ci        "pand                 %%mm6, %%mm1              \n\t"
1036cabdff1aSopenharmony_ci        "pand                 %%mm7, %%mm2              \n\t"
1037cabdff1aSopenharmony_ci        "por                  %%mm0, %%mm1              \n\t"
1038cabdff1aSopenharmony_ci        "por                  %%mm2, %%mm1              \n\t"
1039cabdff1aSopenharmony_ci        "movq   6(%1, %%"FF_REG_a"), %%mm0              \n\t" // BGR BGR BG
1040cabdff1aSopenharmony_ci        MOVNTQ"               %%mm1,(%2, %%"FF_REG_a")  \n\t" // RGB RGB RG
1041cabdff1aSopenharmony_ci        "movq   8(%1, %%"FF_REG_a"), %%mm1              \n\t" // R BGR BGR B
1042cabdff1aSopenharmony_ci        "movq  10(%1, %%"FF_REG_a"), %%mm2              \n\t" // GR BGR BGR
1043cabdff1aSopenharmony_ci        "pand                 %%mm7, %%mm0              \n\t"
1044cabdff1aSopenharmony_ci        "pand                 %%mm5, %%mm1              \n\t"
1045cabdff1aSopenharmony_ci        "pand                 %%mm6, %%mm2              \n\t"
1046cabdff1aSopenharmony_ci        "por                  %%mm0, %%mm1              \n\t"
1047cabdff1aSopenharmony_ci        "por                  %%mm2, %%mm1              \n\t"
1048cabdff1aSopenharmony_ci        "movq  14(%1, %%"FF_REG_a"), %%mm0              \n\t" // R BGR BGR B
1049cabdff1aSopenharmony_ci        MOVNTQ"               %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R
1050cabdff1aSopenharmony_ci        "movq  16(%1, %%"FF_REG_a"), %%mm1              \n\t" // GR BGR BGR
1051cabdff1aSopenharmony_ci        "movq  18(%1, %%"FF_REG_a"), %%mm2              \n\t" // BGR BGR BG
1052cabdff1aSopenharmony_ci        "pand                 %%mm6, %%mm0              \n\t"
1053cabdff1aSopenharmony_ci        "pand                 %%mm7, %%mm1              \n\t"
1054cabdff1aSopenharmony_ci        "pand                 %%mm5, %%mm2              \n\t"
1055cabdff1aSopenharmony_ci        "por                  %%mm0, %%mm1              \n\t"
1056cabdff1aSopenharmony_ci        "por                  %%mm2, %%mm1              \n\t"
1057cabdff1aSopenharmony_ci        MOVNTQ"               %%mm1, 16(%2, %%"FF_REG_a") \n\t"
1058cabdff1aSopenharmony_ci        "add                    $24, %%"FF_REG_a"       \n\t"
1059cabdff1aSopenharmony_ci        " js                     1b                     \n\t"
1060cabdff1aSopenharmony_ci        "2:                                             \n\t"
1061cabdff1aSopenharmony_ci        : "+a" (mmx_size)
1062cabdff1aSopenharmony_ci        : "r" (src-mmx_size), "r"(dst-mmx_size)
1063cabdff1aSopenharmony_ci          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
1064cabdff1aSopenharmony_ci    );
1065cabdff1aSopenharmony_ci
1066cabdff1aSopenharmony_ci    __asm__ volatile(SFENCE:::"memory");
1067cabdff1aSopenharmony_ci    __asm__ volatile(EMMS:::"memory");
1068cabdff1aSopenharmony_ci
1069cabdff1aSopenharmony_ci    if (mmx_size==23) return; //finished, was multiple of 8
1070cabdff1aSopenharmony_ci
1071cabdff1aSopenharmony_ci    src+= src_size;
1072cabdff1aSopenharmony_ci    dst+= src_size;
1073cabdff1aSopenharmony_ci    src_size= 23-mmx_size;
1074cabdff1aSopenharmony_ci    src-= src_size;
1075cabdff1aSopenharmony_ci    dst-= src_size;
1076cabdff1aSopenharmony_ci    for (i=0; i<src_size; i+=3) {
1077cabdff1aSopenharmony_ci        register uint8_t x;
1078cabdff1aSopenharmony_ci        x          = src[i + 2];
1079cabdff1aSopenharmony_ci        dst[i + 1] = src[i + 1];
1080cabdff1aSopenharmony_ci        dst[i + 2] = src[i + 0];
1081cabdff1aSopenharmony_ci        dst[i + 0] = x;
1082cabdff1aSopenharmony_ci    }
1083cabdff1aSopenharmony_ci}
1084cabdff1aSopenharmony_ci
1085cabdff1aSopenharmony_cistatic inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1086cabdff1aSopenharmony_ci                                           int width, int height,
1087cabdff1aSopenharmony_ci                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1088cabdff1aSopenharmony_ci{
1089cabdff1aSopenharmony_ci    int y;
1090cabdff1aSopenharmony_ci    const x86_reg chromWidth= width>>1;
1091cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
1092cabdff1aSopenharmony_ci        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1093cabdff1aSopenharmony_ci        __asm__ volatile(
1094cabdff1aSopenharmony_ci            "xor                 %%"FF_REG_a", %%"FF_REG_a" \n\t"
1095cabdff1aSopenharmony_ci            ".p2align                    4              \n\t"
1096cabdff1aSopenharmony_ci            "1:                                         \n\t"
1097cabdff1aSopenharmony_ci            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1098cabdff1aSopenharmony_ci            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1099cabdff1aSopenharmony_ci            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1100cabdff1aSopenharmony_ci            "movq       (%2, %%"FF_REG_a"), %%mm0       \n\t" // U(0)
1101cabdff1aSopenharmony_ci            "movq                    %%mm0, %%mm2       \n\t" // U(0)
1102cabdff1aSopenharmony_ci            "movq       (%3, %%"FF_REG_a"), %%mm1       \n\t" // V(0)
1103cabdff1aSopenharmony_ci            "punpcklbw               %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1104cabdff1aSopenharmony_ci            "punpckhbw               %%mm1, %%mm2       \n\t" // UVUV UVUV(8)
1105cabdff1aSopenharmony_ci
1106cabdff1aSopenharmony_ci            "movq     (%1, %%"FF_REG_a",2), %%mm3       \n\t" // Y(0)
1107cabdff1aSopenharmony_ci            "movq    8(%1, %%"FF_REG_a",2), %%mm5       \n\t" // Y(8)
1108cabdff1aSopenharmony_ci            "movq                    %%mm3, %%mm4       \n\t" // Y(0)
1109cabdff1aSopenharmony_ci            "movq                    %%mm5, %%mm6       \n\t" // Y(8)
1110cabdff1aSopenharmony_ci            "punpcklbw               %%mm0, %%mm3       \n\t" // YUYV YUYV(0)
1111cabdff1aSopenharmony_ci            "punpckhbw               %%mm0, %%mm4       \n\t" // YUYV YUYV(4)
1112cabdff1aSopenharmony_ci            "punpcklbw               %%mm2, %%mm5       \n\t" // YUYV YUYV(8)
1113cabdff1aSopenharmony_ci            "punpckhbw               %%mm2, %%mm6       \n\t" // YUYV YUYV(12)
1114cabdff1aSopenharmony_ci
1115cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm3,   (%0, %%"FF_REG_a", 4)    \n\t"
1116cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm4,  8(%0, %%"FF_REG_a", 4)    \n\t"
1117cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm5, 16(%0, %%"FF_REG_a", 4)    \n\t"
1118cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm6, 24(%0, %%"FF_REG_a", 4)    \n\t"
1119cabdff1aSopenharmony_ci
1120cabdff1aSopenharmony_ci            "add                        $8, %%"FF_REG_a" \n\t"
1121cabdff1aSopenharmony_ci            "cmp                        %4, %%"FF_REG_a" \n\t"
1122cabdff1aSopenharmony_ci            " jb                        1b               \n\t"
1123cabdff1aSopenharmony_ci            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1124cabdff1aSopenharmony_ci            : "%"FF_REG_a
1125cabdff1aSopenharmony_ci        );
1126cabdff1aSopenharmony_ci        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1127cabdff1aSopenharmony_ci            usrc += chromStride;
1128cabdff1aSopenharmony_ci            vsrc += chromStride;
1129cabdff1aSopenharmony_ci        }
1130cabdff1aSopenharmony_ci        ysrc += lumStride;
1131cabdff1aSopenharmony_ci        dst  += dstStride;
1132cabdff1aSopenharmony_ci    }
1133cabdff1aSopenharmony_ci    __asm__(EMMS"       \n\t"
1134cabdff1aSopenharmony_ci            SFENCE"     \n\t"
1135cabdff1aSopenharmony_ci            :::"memory");
1136cabdff1aSopenharmony_ci}
1137cabdff1aSopenharmony_ci
1138cabdff1aSopenharmony_ci/**
1139cabdff1aSopenharmony_ci * Height should be a multiple of 2 and width should be a multiple of 16.
1140cabdff1aSopenharmony_ci * (If this is a problem for anyone then tell me, and I will fix it.)
1141cabdff1aSopenharmony_ci */
1142cabdff1aSopenharmony_cistatic inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1143cabdff1aSopenharmony_ci                                      int width, int height,
1144cabdff1aSopenharmony_ci                                      int lumStride, int chromStride, int dstStride)
1145cabdff1aSopenharmony_ci{
1146cabdff1aSopenharmony_ci    //FIXME interpolate chroma
1147cabdff1aSopenharmony_ci    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1148cabdff1aSopenharmony_ci}
1149cabdff1aSopenharmony_ci
1150cabdff1aSopenharmony_cistatic inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1151cabdff1aSopenharmony_ci                                           int width, int height,
1152cabdff1aSopenharmony_ci                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1153cabdff1aSopenharmony_ci{
1154cabdff1aSopenharmony_ci    int y;
1155cabdff1aSopenharmony_ci    const x86_reg chromWidth= width>>1;
1156cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
1157cabdff1aSopenharmony_ci        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1158cabdff1aSopenharmony_ci        __asm__ volatile(
1159cabdff1aSopenharmony_ci            "xor             %%"FF_REG_a", %%"FF_REG_a" \n\t"
1160cabdff1aSopenharmony_ci            ".p2align                   4               \n\t"
1161cabdff1aSopenharmony_ci            "1:                                         \n\t"
1162cabdff1aSopenharmony_ci            PREFETCH" 32(%1, %%"FF_REG_a", 2)           \n\t"
1163cabdff1aSopenharmony_ci            PREFETCH" 32(%2, %%"FF_REG_a")              \n\t"
1164cabdff1aSopenharmony_ci            PREFETCH" 32(%3, %%"FF_REG_a")              \n\t"
1165cabdff1aSopenharmony_ci            "movq      (%2, %%"FF_REG_a"), %%mm0        \n\t" // U(0)
1166cabdff1aSopenharmony_ci            "movq                   %%mm0, %%mm2        \n\t" // U(0)
1167cabdff1aSopenharmony_ci            "movq      (%3, %%"FF_REG_a"), %%mm1        \n\t" // V(0)
1168cabdff1aSopenharmony_ci            "punpcklbw              %%mm1, %%mm0        \n\t" // UVUV UVUV(0)
1169cabdff1aSopenharmony_ci            "punpckhbw              %%mm1, %%mm2        \n\t" // UVUV UVUV(8)
1170cabdff1aSopenharmony_ci
1171cabdff1aSopenharmony_ci            "movq    (%1, %%"FF_REG_a",2), %%mm3        \n\t" // Y(0)
1172cabdff1aSopenharmony_ci            "movq   8(%1, %%"FF_REG_a",2), %%mm5        \n\t" // Y(8)
1173cabdff1aSopenharmony_ci            "movq                   %%mm0, %%mm4        \n\t" // Y(0)
1174cabdff1aSopenharmony_ci            "movq                   %%mm2, %%mm6        \n\t" // Y(8)
1175cabdff1aSopenharmony_ci            "punpcklbw              %%mm3, %%mm0        \n\t" // YUYV YUYV(0)
1176cabdff1aSopenharmony_ci            "punpckhbw              %%mm3, %%mm4        \n\t" // YUYV YUYV(4)
1177cabdff1aSopenharmony_ci            "punpcklbw              %%mm5, %%mm2        \n\t" // YUYV YUYV(8)
1178cabdff1aSopenharmony_ci            "punpckhbw              %%mm5, %%mm6        \n\t" // YUYV YUYV(12)
1179cabdff1aSopenharmony_ci
1180cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm0,   (%0, %%"FF_REG_a", 4)     \n\t"
1181cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm4,  8(%0, %%"FF_REG_a", 4)     \n\t"
1182cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm2, 16(%0, %%"FF_REG_a", 4)     \n\t"
1183cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm6, 24(%0, %%"FF_REG_a", 4)     \n\t"
1184cabdff1aSopenharmony_ci
1185cabdff1aSopenharmony_ci            "add                       $8, %%"FF_REG_a" \n\t"
1186cabdff1aSopenharmony_ci            "cmp                       %4, %%"FF_REG_a" \n\t"
1187cabdff1aSopenharmony_ci            " jb                       1b               \n\t"
1188cabdff1aSopenharmony_ci            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1189cabdff1aSopenharmony_ci            : "%"FF_REG_a
1190cabdff1aSopenharmony_ci        );
1191cabdff1aSopenharmony_ci        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1192cabdff1aSopenharmony_ci            usrc += chromStride;
1193cabdff1aSopenharmony_ci            vsrc += chromStride;
1194cabdff1aSopenharmony_ci        }
1195cabdff1aSopenharmony_ci        ysrc += lumStride;
1196cabdff1aSopenharmony_ci        dst += dstStride;
1197cabdff1aSopenharmony_ci    }
1198cabdff1aSopenharmony_ci    __asm__(EMMS"       \n\t"
1199cabdff1aSopenharmony_ci            SFENCE"     \n\t"
1200cabdff1aSopenharmony_ci            :::"memory");
1201cabdff1aSopenharmony_ci}
1202cabdff1aSopenharmony_ci
1203cabdff1aSopenharmony_ci/**
1204cabdff1aSopenharmony_ci * Height should be a multiple of 2 and width should be a multiple of 16
1205cabdff1aSopenharmony_ci * (If this is a problem for anyone then tell me, and I will fix it.)
1206cabdff1aSopenharmony_ci */
1207cabdff1aSopenharmony_cistatic inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1208cabdff1aSopenharmony_ci                                      int width, int height,
1209cabdff1aSopenharmony_ci                                      int lumStride, int chromStride, int dstStride)
1210cabdff1aSopenharmony_ci{
1211cabdff1aSopenharmony_ci    //FIXME interpolate chroma
1212cabdff1aSopenharmony_ci    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1213cabdff1aSopenharmony_ci}
1214cabdff1aSopenharmony_ci
1215cabdff1aSopenharmony_ci/**
1216cabdff1aSopenharmony_ci * Width should be a multiple of 16.
1217cabdff1aSopenharmony_ci */
1218cabdff1aSopenharmony_cistatic inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1219cabdff1aSopenharmony_ci                                         int width, int height,
1220cabdff1aSopenharmony_ci                                         int lumStride, int chromStride, int dstStride)
1221cabdff1aSopenharmony_ci{
1222cabdff1aSopenharmony_ci    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1223cabdff1aSopenharmony_ci}
1224cabdff1aSopenharmony_ci
1225cabdff1aSopenharmony_ci/**
1226cabdff1aSopenharmony_ci * Width should be a multiple of 16.
1227cabdff1aSopenharmony_ci */
1228cabdff1aSopenharmony_cistatic inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1229cabdff1aSopenharmony_ci                                         int width, int height,
1230cabdff1aSopenharmony_ci                                         int lumStride, int chromStride, int dstStride)
1231cabdff1aSopenharmony_ci{
1232cabdff1aSopenharmony_ci    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1233cabdff1aSopenharmony_ci}
1234cabdff1aSopenharmony_ci
1235cabdff1aSopenharmony_ci/**
1236cabdff1aSopenharmony_ci * Height should be a multiple of 2 and width should be a multiple of 16.
1237cabdff1aSopenharmony_ci * (If this is a problem for anyone then tell me, and I will fix it.)
1238cabdff1aSopenharmony_ci */
1239cabdff1aSopenharmony_cistatic inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1240cabdff1aSopenharmony_ci                                      int width, int height,
1241cabdff1aSopenharmony_ci                                      int lumStride, int chromStride, int srcStride)
1242cabdff1aSopenharmony_ci{
1243cabdff1aSopenharmony_ci    int y;
1244cabdff1aSopenharmony_ci    const x86_reg chromWidth= width>>1;
1245cabdff1aSopenharmony_ci    for (y=0; y<height; y+=2) {
1246cabdff1aSopenharmony_ci        __asm__ volatile(
1247cabdff1aSopenharmony_ci            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
1248cabdff1aSopenharmony_ci            "pcmpeqw                 %%mm7, %%mm7       \n\t"
1249cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm7       \n\t" // FF,00,FF,00...
1250cabdff1aSopenharmony_ci            ".p2align                    4              \n\t"
1251cabdff1aSopenharmony_ci            "1:                \n\t"
1252cabdff1aSopenharmony_ci            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
1253cabdff1aSopenharmony_ci            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1254cabdff1aSopenharmony_ci            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1255cabdff1aSopenharmony_ci            "movq                    %%mm0, %%mm2       \n\t" // YUYV YUYV(0)
1256cabdff1aSopenharmony_ci            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(4)
1257cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm0       \n\t" // U0V0 U0V0(0)
1258cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(4)
1259cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(0)
1260cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(4)
1261cabdff1aSopenharmony_ci            "packuswb                %%mm1, %%mm0       \n\t" // UVUV UVUV(0)
1262cabdff1aSopenharmony_ci            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(0)
1263cabdff1aSopenharmony_ci
1264cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm2, (%1, %%"FF_REG_a", 2) \n\t"
1265cabdff1aSopenharmony_ci
1266cabdff1aSopenharmony_ci            "movq  16(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
1267cabdff1aSopenharmony_ci            "movq  24(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
1268cabdff1aSopenharmony_ci            "movq                    %%mm1, %%mm3       \n\t" // YUYV YUYV(8)
1269cabdff1aSopenharmony_ci            "movq                    %%mm2, %%mm4       \n\t" // YUYV YUYV(12)
1270cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm1       \n\t" // U0V0 U0V0(8)
1271cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm2       \n\t" // U0V0 U0V0(12)
1272cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(8)
1273cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm4       \n\t" // Y0Y0 Y0Y0(12)
1274cabdff1aSopenharmony_ci            "packuswb                %%mm2, %%mm1       \n\t" // UVUV UVUV(8)
1275cabdff1aSopenharmony_ci            "packuswb                %%mm4, %%mm3       \n\t" // YYYY YYYY(8)
1276cabdff1aSopenharmony_ci
1277cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
1278cabdff1aSopenharmony_ci
1279cabdff1aSopenharmony_ci            "movq                    %%mm0, %%mm2       \n\t" // UVUV UVUV(0)
1280cabdff1aSopenharmony_ci            "movq                    %%mm1, %%mm3       \n\t" // UVUV UVUV(8)
1281cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm0       \n\t" // V0V0 V0V0(0)
1282cabdff1aSopenharmony_ci            "psrlw                      $8, %%mm1       \n\t" // V0V0 V0V0(8)
1283cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm2       \n\t" // U0U0 U0U0(0)
1284cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm3       \n\t" // U0U0 U0U0(8)
1285cabdff1aSopenharmony_ci            "packuswb                %%mm1, %%mm0       \n\t" // VVVV VVVV(0)
1286cabdff1aSopenharmony_ci            "packuswb                %%mm3, %%mm2       \n\t" // UUUU UUUU(0)
1287cabdff1aSopenharmony_ci
1288cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm0, (%3, %%"FF_REG_a")     \n\t"
1289cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm2, (%2, %%"FF_REG_a")     \n\t"
1290cabdff1aSopenharmony_ci
1291cabdff1aSopenharmony_ci            "add                        $8, %%"FF_REG_a" \n\t"
1292cabdff1aSopenharmony_ci            "cmp                        %4, %%"FF_REG_a" \n\t"
1293cabdff1aSopenharmony_ci            " jb                        1b               \n\t"
1294cabdff1aSopenharmony_ci            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1295cabdff1aSopenharmony_ci            : "memory", "%"FF_REG_a
1296cabdff1aSopenharmony_ci        );
1297cabdff1aSopenharmony_ci
1298cabdff1aSopenharmony_ci        ydst += lumStride;
1299cabdff1aSopenharmony_ci        src  += srcStride;
1300cabdff1aSopenharmony_ci
1301cabdff1aSopenharmony_ci        __asm__ volatile(
1302cabdff1aSopenharmony_ci            "xor              %%"FF_REG_a", %%"FF_REG_a"\n\t"
1303cabdff1aSopenharmony_ci            ".p2align                    4              \n\t"
1304cabdff1aSopenharmony_ci            "1:                                         \n\t"
1305cabdff1aSopenharmony_ci            PREFETCH" 64(%0, %%"FF_REG_a", 4)           \n\t"
1306cabdff1aSopenharmony_ci            "movq    (%0, %%"FF_REG_a", 4), %%mm0       \n\t" // YUYV YUYV(0)
1307cabdff1aSopenharmony_ci            "movq   8(%0, %%"FF_REG_a", 4), %%mm1       \n\t" // YUYV YUYV(4)
1308cabdff1aSopenharmony_ci            "movq  16(%0, %%"FF_REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
1309cabdff1aSopenharmony_ci            "movq  24(%0, %%"FF_REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
1310cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm0       \n\t" // Y0Y0 Y0Y0(0)
1311cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm1       \n\t" // Y0Y0 Y0Y0(4)
1312cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm2       \n\t" // Y0Y0 Y0Y0(8)
1313cabdff1aSopenharmony_ci            "pand                    %%mm7, %%mm3       \n\t" // Y0Y0 Y0Y0(12)
1314cabdff1aSopenharmony_ci            "packuswb                %%mm1, %%mm0       \n\t" // YYYY YYYY(0)
1315cabdff1aSopenharmony_ci            "packuswb                %%mm3, %%mm2       \n\t" // YYYY YYYY(8)
1316cabdff1aSopenharmony_ci
1317cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm0,  (%1, %%"FF_REG_a", 2) \n\t"
1318cabdff1aSopenharmony_ci            MOVNTQ"                  %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
1319cabdff1aSopenharmony_ci
1320cabdff1aSopenharmony_ci            "add                        $8, %%"FF_REG_a"\n\t"
1321cabdff1aSopenharmony_ci            "cmp                        %4, %%"FF_REG_a"\n\t"
1322cabdff1aSopenharmony_ci            " jb                        1b              \n\t"
1323cabdff1aSopenharmony_ci
1324cabdff1aSopenharmony_ci            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1325cabdff1aSopenharmony_ci            : "memory", "%"FF_REG_a
1326cabdff1aSopenharmony_ci        );
1327cabdff1aSopenharmony_ci        udst += chromStride;
1328cabdff1aSopenharmony_ci        vdst += chromStride;
1329cabdff1aSopenharmony_ci        ydst += lumStride;
1330cabdff1aSopenharmony_ci        src  += srcStride;
1331cabdff1aSopenharmony_ci    }
1332cabdff1aSopenharmony_ci    __asm__ volatile(EMMS"       \n\t"
1333cabdff1aSopenharmony_ci                     SFENCE"     \n\t"
1334cabdff1aSopenharmony_ci                     :::"memory");
1335cabdff1aSopenharmony_ci}
1336cabdff1aSopenharmony_ci
1337cabdff1aSopenharmony_cistatic inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1338cabdff1aSopenharmony_ci{
1339cabdff1aSopenharmony_ci    int x,y;
1340cabdff1aSopenharmony_ci
1341cabdff1aSopenharmony_ci    dst[0]= src[0];
1342cabdff1aSopenharmony_ci
1343cabdff1aSopenharmony_ci    // first line
1344cabdff1aSopenharmony_ci    for (x=0; x<srcWidth-1; x++) {
1345cabdff1aSopenharmony_ci        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1346cabdff1aSopenharmony_ci        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1347cabdff1aSopenharmony_ci    }
1348cabdff1aSopenharmony_ci    dst[2*srcWidth-1]= src[srcWidth-1];
1349cabdff1aSopenharmony_ci
1350cabdff1aSopenharmony_ci    dst+= dstStride;
1351cabdff1aSopenharmony_ci
1352cabdff1aSopenharmony_ci    for (y=1; y<srcHeight; y++) {
1353cabdff1aSopenharmony_ci        x86_reg mmxSize= srcWidth&~15;
1354cabdff1aSopenharmony_ci
1355cabdff1aSopenharmony_ci        if (mmxSize) {
1356cabdff1aSopenharmony_ci        __asm__ volatile(
1357cabdff1aSopenharmony_ci            "mov                       %4, %%"FF_REG_a" \n\t"
1358cabdff1aSopenharmony_ci            "movq        "MANGLE(mmx_ff)", %%mm0    \n\t"
1359cabdff1aSopenharmony_ci            "movq      (%0, %%"FF_REG_a"), %%mm4    \n\t"
1360cabdff1aSopenharmony_ci            "movq                   %%mm4, %%mm2    \n\t"
1361cabdff1aSopenharmony_ci            "psllq                     $8, %%mm4    \n\t"
1362cabdff1aSopenharmony_ci            "pand                   %%mm0, %%mm2    \n\t"
1363cabdff1aSopenharmony_ci            "por                    %%mm2, %%mm4    \n\t"
1364cabdff1aSopenharmony_ci            "movq      (%1, %%"FF_REG_a"), %%mm5    \n\t"
1365cabdff1aSopenharmony_ci            "movq                   %%mm5, %%mm3    \n\t"
1366cabdff1aSopenharmony_ci            "psllq                     $8, %%mm5    \n\t"
1367cabdff1aSopenharmony_ci            "pand                   %%mm0, %%mm3    \n\t"
1368cabdff1aSopenharmony_ci            "por                    %%mm3, %%mm5    \n\t"
1369cabdff1aSopenharmony_ci            "1:                                     \n\t"
1370cabdff1aSopenharmony_ci            "movq      (%0, %%"FF_REG_a"), %%mm0    \n\t"
1371cabdff1aSopenharmony_ci            "movq      (%1, %%"FF_REG_a"), %%mm1    \n\t"
1372cabdff1aSopenharmony_ci            "movq     1(%0, %%"FF_REG_a"), %%mm2    \n\t"
1373cabdff1aSopenharmony_ci            "movq     1(%1, %%"FF_REG_a"), %%mm3    \n\t"
1374cabdff1aSopenharmony_ci            PAVGB"                  %%mm0, %%mm5    \n\t"
1375cabdff1aSopenharmony_ci            PAVGB"                  %%mm0, %%mm3    \n\t"
1376cabdff1aSopenharmony_ci            PAVGB"                  %%mm0, %%mm5    \n\t"
1377cabdff1aSopenharmony_ci            PAVGB"                  %%mm0, %%mm3    \n\t"
1378cabdff1aSopenharmony_ci            PAVGB"                  %%mm1, %%mm4    \n\t"
1379cabdff1aSopenharmony_ci            PAVGB"                  %%mm1, %%mm2    \n\t"
1380cabdff1aSopenharmony_ci            PAVGB"                  %%mm1, %%mm4    \n\t"
1381cabdff1aSopenharmony_ci            PAVGB"                  %%mm1, %%mm2    \n\t"
1382cabdff1aSopenharmony_ci            "movq                   %%mm5, %%mm7    \n\t"
1383cabdff1aSopenharmony_ci            "movq                   %%mm4, %%mm6    \n\t"
1384cabdff1aSopenharmony_ci            "punpcklbw              %%mm3, %%mm5    \n\t"
1385cabdff1aSopenharmony_ci            "punpckhbw              %%mm3, %%mm7    \n\t"
1386cabdff1aSopenharmony_ci            "punpcklbw              %%mm2, %%mm4    \n\t"
1387cabdff1aSopenharmony_ci            "punpckhbw              %%mm2, %%mm6    \n\t"
1388cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm5,  (%2, %%"FF_REG_a", 2)  \n\t"
1389cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm7, 8(%2, %%"FF_REG_a", 2)  \n\t"
1390cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm4,  (%3, %%"FF_REG_a", 2)  \n\t"
1391cabdff1aSopenharmony_ci            MOVNTQ"                 %%mm6, 8(%3, %%"FF_REG_a", 2)  \n\t"
1392cabdff1aSopenharmony_ci            "add                       $8, %%"FF_REG_a"            \n\t"
1393cabdff1aSopenharmony_ci            "movq    -1(%0, %%"FF_REG_a"), %%mm4    \n\t"
1394cabdff1aSopenharmony_ci            "movq    -1(%1, %%"FF_REG_a"), %%mm5    \n\t"
1395cabdff1aSopenharmony_ci            " js                       1b           \n\t"
1396cabdff1aSopenharmony_ci            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1397cabdff1aSopenharmony_ci               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1398cabdff1aSopenharmony_ci               "g" (-mmxSize)
1399cabdff1aSopenharmony_ci               NAMED_CONSTRAINTS_ADD(mmx_ff)
1400cabdff1aSopenharmony_ci            : "%"FF_REG_a
1401cabdff1aSopenharmony_ci        );
1402cabdff1aSopenharmony_ci        } else {
1403cabdff1aSopenharmony_ci            mmxSize = 1;
1404cabdff1aSopenharmony_ci            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
1405cabdff1aSopenharmony_ci            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
1406cabdff1aSopenharmony_ci        }
1407cabdff1aSopenharmony_ci
1408cabdff1aSopenharmony_ci        for (x=mmxSize-1; x<srcWidth-1; x++) {
1409cabdff1aSopenharmony_ci            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1410cabdff1aSopenharmony_ci            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1411cabdff1aSopenharmony_ci            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1412cabdff1aSopenharmony_ci            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1413cabdff1aSopenharmony_ci        }
1414cabdff1aSopenharmony_ci        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1415cabdff1aSopenharmony_ci        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1416cabdff1aSopenharmony_ci
1417cabdff1aSopenharmony_ci        dst+=dstStride*2;
1418cabdff1aSopenharmony_ci        src+=srcStride;
1419cabdff1aSopenharmony_ci    }
1420cabdff1aSopenharmony_ci
1421cabdff1aSopenharmony_ci    // last line
1422cabdff1aSopenharmony_ci    dst[0]= src[0];
1423cabdff1aSopenharmony_ci
1424cabdff1aSopenharmony_ci    for (x=0; x<srcWidth-1; x++) {
1425cabdff1aSopenharmony_ci        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1426cabdff1aSopenharmony_ci        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1427cabdff1aSopenharmony_ci    }
1428cabdff1aSopenharmony_ci    dst[2*srcWidth-1]= src[srcWidth-1];
1429cabdff1aSopenharmony_ci
1430cabdff1aSopenharmony_ci    __asm__ volatile(EMMS"       \n\t"
1431cabdff1aSopenharmony_ci                     SFENCE"     \n\t"
1432cabdff1aSopenharmony_ci                     :::"memory");
1433cabdff1aSopenharmony_ci}
1434cabdff1aSopenharmony_ci
1435cabdff1aSopenharmony_ci/**
 * Convert packed UYVY (bytes ordered U Y V Y) into separate Y, U and V
 * planes using MMX.
 *
 * Rows are handled in pairs. The first row of each pair writes luma to ydst
 * and chroma to udst/vdst; the second row writes luma only, so its chroma
 * bytes are discarded.
 *
 * Each pass of either asm loop reads 32 source bytes and stores 16 luma
 * bytes (plus 8 U and 8 V bytes on the first row of a pair). The loop bound
 * is compared after the body, so every row runs at least one pass.
 *
 * src advances by srcStride and ydst by lumStride per row; udst and vdst
 * advance by chromStride per row pair.
 *
 * PREFETCH, MOVNTQ, EMMS, SFENCE, FF_REG_a and RENAME are macros defined
 * outside this chunk.
 *
1436cabdff1aSopenharmony_ci * Height should be a multiple of 2 and width should be a multiple of 16.
1437cabdff1aSopenharmony_ci * (If this is a problem for anyone then tell me, and I will fix it.)
1438cabdff1aSopenharmony_ci * Chrominance data is only taken from every second line, others are ignored.
1439cabdff1aSopenharmony_ci * FIXME: Write HQ version.
1440cabdff1aSopenharmony_ci */
1441cabdff1aSopenharmony_cistatic inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1442cabdff1aSopenharmony_ci                                      int width, int height,
1443cabdff1aSopenharmony_ci                                      int lumStride, int chromStride, int srcStride)
1444cabdff1aSopenharmony_ci{
1445cabdff1aSopenharmony_ci    int y;
    // Chroma samples per row; used as the bound (operand %4) of both asm loops.
1446cabdff1aSopenharmony_ci    const x86_reg chromWidth= width>>1;
1447cabdff1aSopenharmony_ci    for (y=0; y<height; y+=2) {
        // First row of the pair: split the packed data into Y, U and V.
1448cabdff1aSopenharmony_ci        __asm__ volatile(
1449cabdff1aSopenharmony_ci            "xor          %%"FF_REG_a", %%"FF_REG_a" \n\t"
1450cabdff1aSopenharmony_ci            "pcmpeqw             %%mm7, %%mm7   \n\t"
1451cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm7   \n\t" // FF,00,FF,00...
1452cabdff1aSopenharmony_ci            ".p2align                4          \n\t"
1453cabdff1aSopenharmony_ci            "1:                                 \n\t"
1454cabdff1aSopenharmony_ci            PREFETCH" 64(%0, %%"FF_REG_a", 4)          \n\t"
1455cabdff1aSopenharmony_ci            "movq       (%0, %%"FF_REG_a", 4), %%mm0   \n\t" // UYVY UYVY(0)
1456cabdff1aSopenharmony_ci            "movq      8(%0, %%"FF_REG_a", 4), %%mm1   \n\t" // UYVY UYVY(4)
1457cabdff1aSopenharmony_ci            "movq                %%mm0, %%mm2   \n\t" // UYVY UYVY(0)
1458cabdff1aSopenharmony_ci            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(4)
            // The mask in mm7 keeps the even (chroma) bytes; the shift keeps the odd (luma) bytes.
1459cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm0   \n\t" // U0V0 U0V0(0)
1460cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(4)
1461cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(0)
1462cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(4)
1463cabdff1aSopenharmony_ci            "packuswb            %%mm1, %%mm0   \n\t" // UVUV UVUV(0)
1464cabdff1aSopenharmony_ci            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(0)
1465cabdff1aSopenharmony_ci
1466cabdff1aSopenharmony_ci            MOVNTQ"              %%mm2,  (%1, %%"FF_REG_a", 2) \n\t"
1467cabdff1aSopenharmony_ci
1468cabdff1aSopenharmony_ci            "movq     16(%0, %%"FF_REG_a", 4), %%mm1   \n\t" // UYVY UYVY(8)
1469cabdff1aSopenharmony_ci            "movq     24(%0, %%"FF_REG_a", 4), %%mm2   \n\t" // UYVY UYVY(12)
1470cabdff1aSopenharmony_ci            "movq                %%mm1, %%mm3   \n\t" // UYVY UYVY(8)
1471cabdff1aSopenharmony_ci            "movq                %%mm2, %%mm4   \n\t" // UYVY UYVY(12)
1472cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm1   \n\t" // U0V0 U0V0(8)
1473cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm2   \n\t" // U0V0 U0V0(12)
1474cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(8)
1475cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm4   \n\t" // Y0Y0 Y0Y0(12)
1476cabdff1aSopenharmony_ci            "packuswb            %%mm2, %%mm1   \n\t" // UVUV UVUV(8)
1477cabdff1aSopenharmony_ci            "packuswb            %%mm4, %%mm3   \n\t" // YYYY YYYY(8)
1478cabdff1aSopenharmony_ci
1479cabdff1aSopenharmony_ci            MOVNTQ"              %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"
1480cabdff1aSopenharmony_ci
            // mm0 and mm1 now hold the interleaved U,V bytes of 16 pixels; separate them.
1481cabdff1aSopenharmony_ci            "movq                %%mm0, %%mm2   \n\t" // UVUV UVUV(0)
1482cabdff1aSopenharmony_ci            "movq                %%mm1, %%mm3   \n\t" // UVUV UVUV(8)
1483cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm0   \n\t" // V0V0 V0V0(0)
1484cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm1   \n\t" // V0V0 V0V0(8)
1485cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm2   \n\t" // U0U0 U0U0(0)
1486cabdff1aSopenharmony_ci            "pand                %%mm7, %%mm3   \n\t" // U0U0 U0U0(8)
1487cabdff1aSopenharmony_ci            "packuswb            %%mm1, %%mm0   \n\t" // VVVV VVVV(0)
1488cabdff1aSopenharmony_ci            "packuswb            %%mm3, %%mm2   \n\t" // UUUU UUUU(0)
1489cabdff1aSopenharmony_ci
            // Operand %3 is vdst and operand %2 is udst.
1490cabdff1aSopenharmony_ci            MOVNTQ"              %%mm0, (%3, %%"FF_REG_a") \n\t"
1491cabdff1aSopenharmony_ci            MOVNTQ"              %%mm2, (%2, %%"FF_REG_a") \n\t"
1492cabdff1aSopenharmony_ci
            // FF_REG_a counts chroma samples: 8 per pass, looping while below chromWidth.
1493cabdff1aSopenharmony_ci            "add                    $8, %%"FF_REG_a" \n\t"
1494cabdff1aSopenharmony_ci            "cmp                    %4, %%"FF_REG_a" \n\t"
1495cabdff1aSopenharmony_ci            " jb                    1b               \n\t"
1496cabdff1aSopenharmony_ci            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1497cabdff1aSopenharmony_ci            : "memory", "%"FF_REG_a
1498cabdff1aSopenharmony_ci        );
1499cabdff1aSopenharmony_ci
1500cabdff1aSopenharmony_ci        ydst += lumStride;
1501cabdff1aSopenharmony_ci        src  += srcStride;
1502cabdff1aSopenharmony_ci
        // Second row of the pair: luma only. Operands %2 and %3 are passed
        // but never referenced, so no chroma is written for this row.
1503cabdff1aSopenharmony_ci        __asm__ volatile(
1504cabdff1aSopenharmony_ci            "xor          %%"FF_REG_a", %%"FF_REG_a"  \n\t"
1505cabdff1aSopenharmony_ci            ".p2align                4                \n\t"
1506cabdff1aSopenharmony_ci            "1:                                       \n\t"
1507cabdff1aSopenharmony_ci            PREFETCH" 64(%0, %%"FF_REG_a", 4)         \n\t"
1508cabdff1aSopenharmony_ci            "movq       (%0, %%"FF_REG_a", 4), %%mm0  \n\t" // UYVY UYVY(0)
1509cabdff1aSopenharmony_ci            "movq      8(%0, %%"FF_REG_a", 4), %%mm1  \n\t" // UYVY UYVY(4)
1510cabdff1aSopenharmony_ci            "movq     16(%0, %%"FF_REG_a", 4), %%mm2  \n\t" // UYVY UYVY(8)
1511cabdff1aSopenharmony_ci            "movq     24(%0, %%"FF_REG_a", 4), %%mm3  \n\t" // UYVY UYVY(12)
1512cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm0   \n\t" // Y0Y0 Y0Y0(0)
1513cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm1   \n\t" // Y0Y0 Y0Y0(4)
1514cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm2   \n\t" // Y0Y0 Y0Y0(8)
1515cabdff1aSopenharmony_ci            "psrlw                  $8, %%mm3   \n\t" // Y0Y0 Y0Y0(12)
1516cabdff1aSopenharmony_ci            "packuswb            %%mm1, %%mm0   \n\t" // YYYY YYYY(0)
1517cabdff1aSopenharmony_ci            "packuswb            %%mm3, %%mm2   \n\t" // YYYY YYYY(8)
1518cabdff1aSopenharmony_ci
1519cabdff1aSopenharmony_ci            MOVNTQ"              %%mm0,  (%1, %%"FF_REG_a", 2) \n\t"
1520cabdff1aSopenharmony_ci            MOVNTQ"              %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"
1521cabdff1aSopenharmony_ci
1522cabdff1aSopenharmony_ci            "add                    $8, %%"FF_REG_a" \n\t"
1523cabdff1aSopenharmony_ci            "cmp                    %4, %%"FF_REG_a" \n\t"
1524cabdff1aSopenharmony_ci            " jb                    1b               \n\t"
1525cabdff1aSopenharmony_ci
1526cabdff1aSopenharmony_ci            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1527cabdff1aSopenharmony_ci            : "memory", "%"FF_REG_a
1528cabdff1aSopenharmony_ci        );
        // One chroma row has been produced for the two source rows.
1529cabdff1aSopenharmony_ci        udst += chromStride;
1530cabdff1aSopenharmony_ci        vdst += chromStride;
1531cabdff1aSopenharmony_ci        ydst += lumStride;
1532cabdff1aSopenharmony_ci        src  += srcStride;
1533cabdff1aSopenharmony_ci    }
    // EMMS and SFENCE presumably expand to the instructions of the same name
    // (leave MMX state, order the MOVNTQ stores) - macros are defined elsewhere.
1534cabdff1aSopenharmony_ci    __asm__ volatile(EMMS"       \n\t"
1535cabdff1aSopenharmony_ci                     SFENCE"     \n\t"
1536cabdff1aSopenharmony_ci                     :::"memory");
1537cabdff1aSopenharmony_ci}
1538cabdff1aSopenharmony_ci
1539cabdff1aSopenharmony_ci/**
 * Convert packed 3-bytes-per-pixel input rows into a luma plane and two
 * chroma planes, where each chroma sample is derived from a 2x2 pixel group.
 *
 * Work is split in three parts:
 *  - when height > 2, the first two rows are converted by ff_rgb24toyv12_c;
 *  - the MMX loop converts row pairs while y < height-2;
 *  - the remaining height-y rows are converted by ff_rgb24toyv12_c.
 *
 * Coefficients are read from rgb2yuv at the byte offsets named by
 * BGR2Y_IDX, BGR2U_IDX and BGR2V_IDX; the layout of that table is not
 * visible in this chunk.
 *
 * The luma loop handles 8 pixels per pass and the chroma loop 4 chroma
 * samples per pass. NOTE(review): widths that are not a multiple of 8 look
 * like they would run past the row end in the asm loops - confirm what the
 * callers guarantee.
 *
1540cabdff1aSopenharmony_ci * Height should be a multiple of 2 and width should be a multiple of 2.
1541cabdff1aSopenharmony_ci * (If this is a problem for anyone then tell me, and I will fix it.)
1542cabdff1aSopenharmony_ci * Chrominance data is only taken from every second line,
1543cabdff1aSopenharmony_ci * others are ignored in the C version.
1544cabdff1aSopenharmony_ci * FIXME: Write HQ version.
1545cabdff1aSopenharmony_ci */
1546cabdff1aSopenharmony_ci#if HAVE_7REGS
1547cabdff1aSopenharmony_cistatic inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1548cabdff1aSopenharmony_ci                                       int width, int height,
1549cabdff1aSopenharmony_ci                                       int lumStride, int chromStride, int srcStride,
1550cabdff1aSopenharmony_ci                                       int32_t *rgb2yuv)
1551cabdff1aSopenharmony_ci{
// Byte offsets into rgb2yuv, pasted into the asm text as displacements.
1552cabdff1aSopenharmony_ci#define BGR2Y_IDX "16*4+16*32"
1553cabdff1aSopenharmony_ci#define BGR2U_IDX "16*4+16*33"
1554cabdff1aSopenharmony_ci#define BGR2V_IDX "16*4+16*34"
1555cabdff1aSopenharmony_ci    int y;
1556cabdff1aSopenharmony_ci    const x86_reg chromWidth= width>>1;
1557cabdff1aSopenharmony_ci
    // Leading two rows go through the C implementation; all plane pointers
    // and height are adjusted to describe what is left.
1558cabdff1aSopenharmony_ci    if (height > 2) {
1559cabdff1aSopenharmony_ci        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
1560cabdff1aSopenharmony_ci        src  += 2*srcStride;
1561cabdff1aSopenharmony_ci        ydst += 2*lumStride;
1562cabdff1aSopenharmony_ci        udst += chromStride;
1563cabdff1aSopenharmony_ci        vdst += chromStride;
1564cabdff1aSopenharmony_ci        height -= 2;
1565cabdff1aSopenharmony_ci    }
1566cabdff1aSopenharmony_ci
    // The bound height-2 leaves the trailing rows for the C call at the end.
1567cabdff1aSopenharmony_ci    for (y=0; y<height-2; y+=2) {
1568cabdff1aSopenharmony_ci        int i;
        // Luma for both rows of the pair.
1569cabdff1aSopenharmony_ci        for (i=0; i<2; i++) {
            // %0 and %1 point at the row ends; FF_REG_a starts at -width
            // (operand %2) and FF_REG_d at 3*FF_REG_a, so both count up
            // towards zero.
            // NOTE(review): the clobber list below has no memory entry
            // although the loop stores through %1, unlike the other asm
            // statements in this chunk - confirm this is intentional.
1570cabdff1aSopenharmony_ci            __asm__ volatile(
1571cabdff1aSopenharmony_ci                "mov                        %2, %%"FF_REG_a"\n\t"
1572cabdff1aSopenharmony_ci                "movq          "BGR2Y_IDX"(%3), %%mm6       \n\t"
1573cabdff1aSopenharmony_ci                "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1574cabdff1aSopenharmony_ci                "pxor                    %%mm7, %%mm7       \n\t"
1575cabdff1aSopenharmony_ci                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
1576cabdff1aSopenharmony_ci                ".p2align                    4              \n\t"
1577cabdff1aSopenharmony_ci                "1:                                         \n\t"
1578cabdff1aSopenharmony_ci                PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
                // Pixels 0..3: each movd loads 4 bytes at a 3-byte step and
                // punpcklbw with the zeroed mm7 widens the bytes to words.
1579cabdff1aSopenharmony_ci                "movd       (%0, %%"FF_REG_d"), %%mm0       \n\t"
1580cabdff1aSopenharmony_ci                "movd      3(%0, %%"FF_REG_d"), %%mm1       \n\t"
1581cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm0       \n\t"
1582cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm1       \n\t"
1583cabdff1aSopenharmony_ci                "movd      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
1584cabdff1aSopenharmony_ci                "movd      9(%0, %%"FF_REG_d"), %%mm3       \n\t"
1585cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm2       \n\t"
1586cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm3       \n\t"
                // Multiply by the coefficients held in mm6, then scale down.
1587cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm0       \n\t"
1588cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm1       \n\t"
1589cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm2       \n\t"
1590cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm3       \n\t"
1591cabdff1aSopenharmony_ci                "psrad                      $8, %%mm0       \n\t"
1592cabdff1aSopenharmony_ci                "psrad                      $8, %%mm1       \n\t"
1593cabdff1aSopenharmony_ci                "psrad                      $8, %%mm2       \n\t"
1594cabdff1aSopenharmony_ci                "psrad                      $8, %%mm3       \n\t"
1595cabdff1aSopenharmony_ci                "packssdw                %%mm1, %%mm0       \n\t"
1596cabdff1aSopenharmony_ci                "packssdw                %%mm3, %%mm2       \n\t"
                // mm5 holds ff_w1111 - presumably four words of 1 so that this
                // pmaddwd adds adjacent partial sums; TODO confirm.
1597cabdff1aSopenharmony_ci                "pmaddwd                 %%mm5, %%mm0       \n\t"
1598cabdff1aSopenharmony_ci                "pmaddwd                 %%mm5, %%mm2       \n\t"
1599cabdff1aSopenharmony_ci                "packssdw                %%mm2, %%mm0       \n\t"
1600cabdff1aSopenharmony_ci                "psraw                      $7, %%mm0       \n\t"
1601cabdff1aSopenharmony_ci
                // Pixels 4..7: same sequence, result collected in mm4.
1602cabdff1aSopenharmony_ci                "movd     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
1603cabdff1aSopenharmony_ci                "movd     15(%0, %%"FF_REG_d"), %%mm1       \n\t"
1604cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm4       \n\t"
1605cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm1       \n\t"
1606cabdff1aSopenharmony_ci                "movd     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
1607cabdff1aSopenharmony_ci                "movd     21(%0, %%"FF_REG_d"), %%mm3       \n\t"
1608cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm2       \n\t"
1609cabdff1aSopenharmony_ci                "punpcklbw               %%mm7, %%mm3       \n\t"
1610cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm4       \n\t"
1611cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm1       \n\t"
1612cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm2       \n\t"
1613cabdff1aSopenharmony_ci                "pmaddwd                 %%mm6, %%mm3       \n\t"
1614cabdff1aSopenharmony_ci                "psrad                      $8, %%mm4       \n\t"
1615cabdff1aSopenharmony_ci                "psrad                      $8, %%mm1       \n\t"
1616cabdff1aSopenharmony_ci                "psrad                      $8, %%mm2       \n\t"
1617cabdff1aSopenharmony_ci                "psrad                      $8, %%mm3       \n\t"
1618cabdff1aSopenharmony_ci                "packssdw                %%mm1, %%mm4       \n\t"
1619cabdff1aSopenharmony_ci                "packssdw                %%mm3, %%mm2       \n\t"
1620cabdff1aSopenharmony_ci                "pmaddwd                 %%mm5, %%mm4       \n\t"
1621cabdff1aSopenharmony_ci                "pmaddwd                 %%mm5, %%mm2       \n\t"
                // 8 pixels consumed = 24 source bytes.
1622cabdff1aSopenharmony_ci                "add                       $24, %%"FF_REG_d"\n\t"
1623cabdff1aSopenharmony_ci                "packssdw                %%mm2, %%mm4       \n\t"
1624cabdff1aSopenharmony_ci                "psraw                      $7, %%mm4       \n\t"
1625cabdff1aSopenharmony_ci
                // Pack the 8 results to bytes and add the luma offset constant.
1626cabdff1aSopenharmony_ci                "packuswb                %%mm4, %%mm0       \n\t"
1627cabdff1aSopenharmony_ci                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0    \n\t"
1628cabdff1aSopenharmony_ci
                // Store 8 luma bytes; js repeats while the index is still negative.
1629cabdff1aSopenharmony_ci                MOVNTQ"                  %%mm0, (%1, %%"FF_REG_a") \n\t"
1630cabdff1aSopenharmony_ci                "add                        $8,      %%"FF_REG_a"  \n\t"
1631cabdff1aSopenharmony_ci                " js                        1b                     \n\t"
1632cabdff1aSopenharmony_ci                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
1633cabdff1aSopenharmony_ci                  NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
1634cabdff1aSopenharmony_ci                : "%"FF_REG_a, "%"FF_REG_d
1635cabdff1aSopenharmony_ci            );
1636cabdff1aSopenharmony_ci            ydst += lumStride;
1637cabdff1aSopenharmony_ci            src  += srcStride;
1638cabdff1aSopenharmony_ci        }
        // Rewind to the first row of the pair for the chroma pass.
1639cabdff1aSopenharmony_ci        src -= srcStride*2;
        // Chroma: %0 and %1 are the ends of the two rows, %2 and %3 the ends
        // of the U and V rows. FF_REG_a starts at -chromWidth (operand %4)
        // and FF_REG_d at 6*FF_REG_a (3*a, then doubled), i.e. 6 source
        // bytes per chroma sample.
        // NOTE(review): as above, no memory clobber although the loop
        // stores through %2 and %3 - confirm.
1640cabdff1aSopenharmony_ci        __asm__ volatile(
1641cabdff1aSopenharmony_ci            "mov                        %4, %%"FF_REG_a"\n\t"
1642cabdff1aSopenharmony_ci            "movq       "MANGLE(ff_w1111)", %%mm5       \n\t"
1643cabdff1aSopenharmony_ci            "movq          "BGR2U_IDX"(%5), %%mm6       \n\t"
1644cabdff1aSopenharmony_ci            "pxor                    %%mm7, %%mm7       \n\t"
1645cabdff1aSopenharmony_ci            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
1646cabdff1aSopenharmony_ci            "add              %%"FF_REG_d", %%"FF_REG_d"\n\t"
1647cabdff1aSopenharmony_ci            ".p2align                    4              \n\t"
1648cabdff1aSopenharmony_ci            "1:                                         \n\t"
1649cabdff1aSopenharmony_ci            PREFETCH" 64(%0, %%"FF_REG_d")              \n\t"
1650cabdff1aSopenharmony_ci            PREFETCH" 64(%1, %%"FF_REG_d")              \n\t"
            // Combine the two rows with PAVGB (presumably a byte average -
            // macro defined elsewhere), then combine each pixel with its
            // right neighbour by shifting one pixel (24 bits) and applying
            // PAVGB again.
1651cabdff1aSopenharmony_ci            "movq       (%0, %%"FF_REG_d"), %%mm0       \n\t"
1652cabdff1aSopenharmony_ci            "movq       (%1, %%"FF_REG_d"), %%mm1       \n\t"
1653cabdff1aSopenharmony_ci            "movq      6(%0, %%"FF_REG_d"), %%mm2       \n\t"
1654cabdff1aSopenharmony_ci            "movq      6(%1, %%"FF_REG_d"), %%mm3       \n\t"
1655cabdff1aSopenharmony_ci            PAVGB"                   %%mm1, %%mm0       \n\t"
1656cabdff1aSopenharmony_ci            PAVGB"                   %%mm3, %%mm2       \n\t"
1657cabdff1aSopenharmony_ci            "movq                    %%mm0, %%mm1       \n\t"
1658cabdff1aSopenharmony_ci            "movq                    %%mm2, %%mm3       \n\t"
1659cabdff1aSopenharmony_ci            "psrlq                     $24, %%mm0       \n\t"
1660cabdff1aSopenharmony_ci            "psrlq                     $24, %%mm2       \n\t"
1661cabdff1aSopenharmony_ci            PAVGB"                   %%mm1, %%mm0       \n\t"
1662cabdff1aSopenharmony_ci            PAVGB"                   %%mm3, %%mm2       \n\t"
1663cabdff1aSopenharmony_ci            "punpcklbw               %%mm7, %%mm0       \n\t"
1664cabdff1aSopenharmony_ci            "punpcklbw               %%mm7, %%mm2       \n\t"
            // mm6 holds the coefficients from BGR2U_IDX; mm1 and mm3 are
            // loaded with those from BGR2V_IDX.
1665cabdff1aSopenharmony_ci            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1666cabdff1aSopenharmony_ci            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1667cabdff1aSopenharmony_ci
1668cabdff1aSopenharmony_ci            "pmaddwd                 %%mm0, %%mm1       \n\t"
1669cabdff1aSopenharmony_ci            "pmaddwd                 %%mm2, %%mm3       \n\t"
1670cabdff1aSopenharmony_ci            "pmaddwd                 %%mm6, %%mm0       \n\t"
1671cabdff1aSopenharmony_ci            "pmaddwd                 %%mm6, %%mm2       \n\t"
1672cabdff1aSopenharmony_ci            "psrad                      $8, %%mm0       \n\t"
1673cabdff1aSopenharmony_ci            "psrad                      $8, %%mm1       \n\t"
1674cabdff1aSopenharmony_ci            "psrad                      $8, %%mm2       \n\t"
1675cabdff1aSopenharmony_ci            "psrad                      $8, %%mm3       \n\t"
1676cabdff1aSopenharmony_ci            "packssdw                %%mm2, %%mm0       \n\t"
1677cabdff1aSopenharmony_ci            "packssdw                %%mm3, %%mm1       \n\t"
1678cabdff1aSopenharmony_ci            "pmaddwd                 %%mm5, %%mm0       \n\t"
1679cabdff1aSopenharmony_ci            "pmaddwd                 %%mm5, %%mm1       \n\t"
1680cabdff1aSopenharmony_ci            "packssdw                %%mm1, %%mm0       \n\t" // V1 V0 U1 U0
1681cabdff1aSopenharmony_ci            "psraw                      $7, %%mm0       \n\t"
1682cabdff1aSopenharmony_ci
            // Same sequence for the next two chroma samples, collected in mm4.
1683cabdff1aSopenharmony_ci            "movq     12(%0, %%"FF_REG_d"), %%mm4       \n\t"
1684cabdff1aSopenharmony_ci            "movq     12(%1, %%"FF_REG_d"), %%mm1       \n\t"
1685cabdff1aSopenharmony_ci            "movq     18(%0, %%"FF_REG_d"), %%mm2       \n\t"
1686cabdff1aSopenharmony_ci            "movq     18(%1, %%"FF_REG_d"), %%mm3       \n\t"
1687cabdff1aSopenharmony_ci            PAVGB"                   %%mm1, %%mm4       \n\t"
1688cabdff1aSopenharmony_ci            PAVGB"                   %%mm3, %%mm2       \n\t"
1689cabdff1aSopenharmony_ci            "movq                    %%mm4, %%mm1       \n\t"
1690cabdff1aSopenharmony_ci            "movq                    %%mm2, %%mm3       \n\t"
1691cabdff1aSopenharmony_ci            "psrlq                     $24, %%mm4       \n\t"
1692cabdff1aSopenharmony_ci            "psrlq                     $24, %%mm2       \n\t"
1693cabdff1aSopenharmony_ci            PAVGB"                   %%mm1, %%mm4       \n\t"
1694cabdff1aSopenharmony_ci            PAVGB"                   %%mm3, %%mm2       \n\t"
1695cabdff1aSopenharmony_ci            "punpcklbw               %%mm7, %%mm4       \n\t"
1696cabdff1aSopenharmony_ci            "punpcklbw               %%mm7, %%mm2       \n\t"
1697cabdff1aSopenharmony_ci            "movq          "BGR2V_IDX"(%5), %%mm1       \n\t"
1698cabdff1aSopenharmony_ci            "movq          "BGR2V_IDX"(%5), %%mm3       \n\t"
1699cabdff1aSopenharmony_ci
1700cabdff1aSopenharmony_ci            "pmaddwd                 %%mm4, %%mm1       \n\t"
1701cabdff1aSopenharmony_ci            "pmaddwd                 %%mm2, %%mm3       \n\t"
1702cabdff1aSopenharmony_ci            "pmaddwd                 %%mm6, %%mm4       \n\t"
1703cabdff1aSopenharmony_ci            "pmaddwd                 %%mm6, %%mm2       \n\t"
1704cabdff1aSopenharmony_ci            "psrad                      $8, %%mm4       \n\t"
1705cabdff1aSopenharmony_ci            "psrad                      $8, %%mm1       \n\t"
1706cabdff1aSopenharmony_ci            "psrad                      $8, %%mm2       \n\t"
1707cabdff1aSopenharmony_ci            "psrad                      $8, %%mm3       \n\t"
1708cabdff1aSopenharmony_ci            "packssdw                %%mm2, %%mm4       \n\t"
1709cabdff1aSopenharmony_ci            "packssdw                %%mm3, %%mm1       \n\t"
1710cabdff1aSopenharmony_ci            "pmaddwd                 %%mm5, %%mm4       \n\t"
1711cabdff1aSopenharmony_ci            "pmaddwd                 %%mm5, %%mm1       \n\t"
            // 4 chroma samples consumed = 24 source bytes per row.
1712cabdff1aSopenharmony_ci            "add                       $24, %%"FF_REG_d"\n\t"
1713cabdff1aSopenharmony_ci            "packssdw                %%mm1, %%mm4       \n\t" // V3 V2 U3 U2
1714cabdff1aSopenharmony_ci            "psraw                      $7, %%mm4       \n\t"
1715cabdff1aSopenharmony_ci
            // Regroup so the low dword carries the four U values and the high
            // dword the four V values, pack to bytes and add the chroma offset.
1716cabdff1aSopenharmony_ci            "movq                    %%mm0, %%mm1           \n\t"
1717cabdff1aSopenharmony_ci            "punpckldq               %%mm4, %%mm0           \n\t"
1718cabdff1aSopenharmony_ci            "punpckhdq               %%mm4, %%mm1           \n\t"
1719cabdff1aSopenharmony_ci            "packsswb                %%mm1, %%mm0           \n\t"
1720cabdff1aSopenharmony_ci            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0         \n\t"
            // Low dword goes to the U row (%2), high dword to the V row (%3).
1721cabdff1aSopenharmony_ci            "movd                    %%mm0, (%2, %%"FF_REG_a") \n\t"
1722cabdff1aSopenharmony_ci            "punpckhdq               %%mm0, %%mm0              \n\t"
1723cabdff1aSopenharmony_ci            "movd                    %%mm0, (%3, %%"FF_REG_a") \n\t"
1724cabdff1aSopenharmony_ci            "add                        $4, %%"FF_REG_a"       \n\t"
1725cabdff1aSopenharmony_ci            " js                        1b              \n\t"
1726cabdff1aSopenharmony_ci            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
1727cabdff1aSopenharmony_ci              NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
1728cabdff1aSopenharmony_ci            : "%"FF_REG_a, "%"FF_REG_d
1729cabdff1aSopenharmony_ci        );
1730cabdff1aSopenharmony_ci
1731cabdff1aSopenharmony_ci        udst += chromStride;
1732cabdff1aSopenharmony_ci        vdst += chromStride;
1733cabdff1aSopenharmony_ci        src  += srcStride*2;
1734cabdff1aSopenharmony_ci    }
1735cabdff1aSopenharmony_ci
    // Leave MMX state before calling back into C code.
1736cabdff1aSopenharmony_ci    __asm__ volatile(EMMS"       \n\t"
1737cabdff1aSopenharmony_ci                     SFENCE"     \n\t"
1738cabdff1aSopenharmony_ci                     :::"memory");
1739cabdff1aSopenharmony_ci
     // Rows not covered by the loop above (height-y of them) use the C version.
1740cabdff1aSopenharmony_ci     ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
1741cabdff1aSopenharmony_ci}
1742cabdff1aSopenharmony_ci#endif /* HAVE_7REGS */
1743cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_SSE2 */
1744cabdff1aSopenharmony_ci
1745cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2
/**
 * Interleave two byte planes into one: for every row,
 * dest[2*w] = src1[w] and dest[2*w+1] = src2[w] for w in [0, width).
 *
 * Per row, when width >= 16 the bulk is done in asm, 16 bytes of each source
 * per pass: with 16-byte registers if src1, src2 and dest are all 16-byte
 * aligned for that row, otherwise with 8-byte MMX registers. The bytes from
 * width&~15 onwards are copied by the scalar loop.
 *
 * After each row dest advances by dstStride, src1 by src1Stride and src2 by
 * src2Stride.
 */
1746cabdff1aSopenharmony_cistatic void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
1747cabdff1aSopenharmony_ci                                    int width, int height, int src1Stride,
1748cabdff1aSopenharmony_ci                                    int src2Stride, int dstStride)
1749cabdff1aSopenharmony_ci{
1750cabdff1aSopenharmony_ci    int h;
1751cabdff1aSopenharmony_ci
1752cabdff1aSopenharmony_ci    for (h=0; h < height; h++) {
1753cabdff1aSopenharmony_ci        int w;
1754cabdff1aSopenharmony_ci
1755cabdff1aSopenharmony_ci        if (width >= 16) {
            // Alignment is re-checked for every row because the strides may change it.
1756cabdff1aSopenharmony_ci            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
        // Aligned path. Operand %3 is width-15, so the loop runs while a
        // complete 16-byte group remains.
1757cabdff1aSopenharmony_ci        __asm__(
1758cabdff1aSopenharmony_ci            "xor              %%"FF_REG_a", %%"FF_REG_a"  \n\t"
1759cabdff1aSopenharmony_ci            "1:                                     \n\t"
1760cabdff1aSopenharmony_ci            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
1761cabdff1aSopenharmony_ci            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
            // src1 is loaded twice: one copy for the low-half interleave, one for the high half.
1762cabdff1aSopenharmony_ci            "movdqa  (%1, %%"FF_REG_a"), %%xmm0     \n\t"
1763cabdff1aSopenharmony_ci            "movdqa  (%1, %%"FF_REG_a"), %%xmm1     \n\t"
1764cabdff1aSopenharmony_ci            "movdqa  (%2, %%"FF_REG_a"), %%xmm2     \n\t"
1765cabdff1aSopenharmony_ci            "punpcklbw           %%xmm2, %%xmm0     \n\t"
1766cabdff1aSopenharmony_ci            "punpckhbw           %%xmm2, %%xmm1     \n\t"
            // 32 output bytes at dest + 2*index.
1767cabdff1aSopenharmony_ci            "movntdq             %%xmm0,   (%0, %%"FF_REG_a", 2) \n\t"
1768cabdff1aSopenharmony_ci            "movntdq             %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
1769cabdff1aSopenharmony_ci            "add                    $16, %%"FF_REG_a"            \n\t"
1770cabdff1aSopenharmony_ci            "cmp                     %3, %%"FF_REG_a"            \n\t"
1771cabdff1aSopenharmony_ci            " jb                     1b             \n\t"
1772cabdff1aSopenharmony_ci            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1773cabdff1aSopenharmony_ci            : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
1774cabdff1aSopenharmony_ci        );
1775cabdff1aSopenharmony_ci            } else
        // Unaligned path: the asm statement below is the whole else branch.
        // Same interleave using 8-byte MMX registers, still 16 source bytes per pass.
1776cabdff1aSopenharmony_ci        __asm__(
1777cabdff1aSopenharmony_ci            "xor %%"FF_REG_a", %%"FF_REG_a"         \n\t"
1778cabdff1aSopenharmony_ci            "1:                                     \n\t"
1779cabdff1aSopenharmony_ci            PREFETCH" 64(%1, %%"FF_REG_a")          \n\t"
1780cabdff1aSopenharmony_ci            PREFETCH" 64(%2, %%"FF_REG_a")          \n\t"
1781cabdff1aSopenharmony_ci            "movq    (%1, %%"FF_REG_a"), %%mm0      \n\t"
1782cabdff1aSopenharmony_ci            "movq   8(%1, %%"FF_REG_a"), %%mm2      \n\t"
1783cabdff1aSopenharmony_ci            "movq                 %%mm0, %%mm1      \n\t"
1784cabdff1aSopenharmony_ci            "movq                 %%mm2, %%mm3      \n\t"
1785cabdff1aSopenharmony_ci            "movq    (%2, %%"FF_REG_a"), %%mm4      \n\t"
1786cabdff1aSopenharmony_ci            "movq   8(%2, %%"FF_REG_a"), %%mm5      \n\t"
1787cabdff1aSopenharmony_ci            "punpcklbw            %%mm4, %%mm0      \n\t"
1788cabdff1aSopenharmony_ci            "punpckhbw            %%mm4, %%mm1      \n\t"
1789cabdff1aSopenharmony_ci            "punpcklbw            %%mm5, %%mm2      \n\t"
1790cabdff1aSopenharmony_ci            "punpckhbw            %%mm5, %%mm3      \n\t"
1791cabdff1aSopenharmony_ci            MOVNTQ"               %%mm0,   (%0, %%"FF_REG_a", 2) \n\t"
1792cabdff1aSopenharmony_ci            MOVNTQ"               %%mm1,  8(%0, %%"FF_REG_a", 2) \n\t"
1793cabdff1aSopenharmony_ci            MOVNTQ"               %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
1794cabdff1aSopenharmony_ci            MOVNTQ"               %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
1795cabdff1aSopenharmony_ci            "add                    $16, %%"FF_REG_a"            \n\t"
1796cabdff1aSopenharmony_ci            "cmp                     %3, %%"FF_REG_a"            \n\t"
1797cabdff1aSopenharmony_ci            " jb                     1b                          \n\t"
1798cabdff1aSopenharmony_ci            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
1799cabdff1aSopenharmony_ci            : "memory", "%"FF_REG_a
1800cabdff1aSopenharmony_ci        );
1801cabdff1aSopenharmony_ci
        // This brace closes the width >= 16 test, not the else above.
1802cabdff1aSopenharmony_ci        }
        // Scalar tail: everything past the last complete group of 16
        // (the entire row when width < 16).
1803cabdff1aSopenharmony_ci        for (w= (width&(~15)); w < width; w++) {
1804cabdff1aSopenharmony_ci            dest[2*w+0] = src1[w];
1805cabdff1aSopenharmony_ci            dest[2*w+1] = src2[w];
1806cabdff1aSopenharmony_ci        }
1807cabdff1aSopenharmony_ci        dest += dstStride;
1808cabdff1aSopenharmony_ci        src1 += src1Stride;
1809cabdff1aSopenharmony_ci        src2 += src2Stride;
1810cabdff1aSopenharmony_ci    }
    // Issued once after all rows; EMMS and SFENCE are macros defined elsewhere.
1811cabdff1aSopenharmony_ci    __asm__(
1812cabdff1aSopenharmony_ci            EMMS"       \n\t"
1813cabdff1aSopenharmony_ci            SFENCE"     \n\t"
1814cabdff1aSopenharmony_ci            ::: "memory"
1815cabdff1aSopenharmony_ci            );
1816cabdff1aSopenharmony_ci}
1817cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */
1818cabdff1aSopenharmony_ci
1819cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
1820cabdff1aSopenharmony_ci#if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM
/*
 * Row routine implemented outside this file (only the prototype is visible
 * here). Presumably splits the interleaved bytes of src1 into dstU and dstV
 * for w output samples - TODO confirm against its implementation.
 */
1821cabdff1aSopenharmony_civoid RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
1822cabdff1aSopenharmony_ci                         const uint8_t *unused,
1823cabdff1aSopenharmony_ci                         const uint8_t *src1,
1824cabdff1aSopenharmony_ci                         const uint8_t *src2,
1825cabdff1aSopenharmony_ci                         int w,
1826cabdff1aSopenharmony_ci                         uint32_t *unused2);
/**
 * Split an interleaved byte plane into two planes, one row at a time.
 *
 * Each of the height rows is handed to RENAME(ff_nv12ToUV) with dst1 and
 * dst2 as the two outputs, src as its src1 argument and width as its w
 * argument; the remaining arguments are passed as NULL. After each row src
 * advances by srcStride, dst1 by dst1Stride and dst2 by dst2Stride.
 */
1827cabdff1aSopenharmony_cistatic void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
1828cabdff1aSopenharmony_ci                                      int width, int height, int srcStride,
1829cabdff1aSopenharmony_ci                                      int dst1Stride, int dst2Stride)
1830cabdff1aSopenharmony_ci{
1831cabdff1aSopenharmony_ci    int h;
1832cabdff1aSopenharmony_ci
1833cabdff1aSopenharmony_ci    for (h = 0; h < height; h++) {
1834cabdff1aSopenharmony_ci        RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
1835cabdff1aSopenharmony_ci        src  += srcStride;
1836cabdff1aSopenharmony_ci        dst1 += dst1Stride;
1837cabdff1aSopenharmony_ci        dst2 += dst2Stride;
1838cabdff1aSopenharmony_ci    }
    // Only SFENCE here, no EMMS as in the other functions of this chunk.
    // NOTE(review): presumably the row routine does not use MMX registers - confirm.
1839cabdff1aSopenharmony_ci    __asm__(
1840cabdff1aSopenharmony_ci            SFENCE"     \n\t"
1841cabdff1aSopenharmony_ci            ::: "memory"
1842cabdff1aSopenharmony_ci            );
1843cabdff1aSopenharmony_ci}
1844cabdff1aSopenharmony_ci#endif /* COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM */
1845cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
1846cabdff1aSopenharmony_ci
1847cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_SSE2
1848cabdff1aSopenharmony_cistatic inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
1849cabdff1aSopenharmony_ci                                       uint8_t *dst1, uint8_t *dst2,
1850cabdff1aSopenharmony_ci                                       int width, int height,
1851cabdff1aSopenharmony_ci                                       int srcStride1, int srcStride2,
1852cabdff1aSopenharmony_ci                                       int dstStride1, int dstStride2)
1853cabdff1aSopenharmony_ci{
1854cabdff1aSopenharmony_ci    x86_reg x, y;
1855cabdff1aSopenharmony_ci    int w,h;
1856cabdff1aSopenharmony_ci    w=width/2; h=height/2;
1857cabdff1aSopenharmony_ci    __asm__ volatile(
1858cabdff1aSopenharmony_ci        PREFETCH" %0    \n\t"
1859cabdff1aSopenharmony_ci        PREFETCH" %1    \n\t"
1860cabdff1aSopenharmony_ci        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1861cabdff1aSopenharmony_ci    for (y=0;y<h;y++) {
1862cabdff1aSopenharmony_ci        const uint8_t* s1=src1+srcStride1*(y>>1);
1863cabdff1aSopenharmony_ci        uint8_t* d=dst1+dstStride1*y;
1864cabdff1aSopenharmony_ci        x=0;
1865cabdff1aSopenharmony_ci        for (;x<w-31;x+=32) {
1866cabdff1aSopenharmony_ci            __asm__ volatile(
1867cabdff1aSopenharmony_ci                PREFETCH"   32(%1,%2)        \n\t"
1868cabdff1aSopenharmony_ci                "movq         (%1,%2), %%mm0 \n\t"
1869cabdff1aSopenharmony_ci                "movq        8(%1,%2), %%mm2 \n\t"
1870cabdff1aSopenharmony_ci                "movq       16(%1,%2), %%mm4 \n\t"
1871cabdff1aSopenharmony_ci                "movq       24(%1,%2), %%mm6 \n\t"
1872cabdff1aSopenharmony_ci                "movq      %%mm0, %%mm1 \n\t"
1873cabdff1aSopenharmony_ci                "movq      %%mm2, %%mm3 \n\t"
1874cabdff1aSopenharmony_ci                "movq      %%mm4, %%mm5 \n\t"
1875cabdff1aSopenharmony_ci                "movq      %%mm6, %%mm7 \n\t"
1876cabdff1aSopenharmony_ci                "punpcklbw %%mm0, %%mm0 \n\t"
1877cabdff1aSopenharmony_ci                "punpckhbw %%mm1, %%mm1 \n\t"
1878cabdff1aSopenharmony_ci                "punpcklbw %%mm2, %%mm2 \n\t"
1879cabdff1aSopenharmony_ci                "punpckhbw %%mm3, %%mm3 \n\t"
1880cabdff1aSopenharmony_ci                "punpcklbw %%mm4, %%mm4 \n\t"
1881cabdff1aSopenharmony_ci                "punpckhbw %%mm5, %%mm5 \n\t"
1882cabdff1aSopenharmony_ci                "punpcklbw %%mm6, %%mm6 \n\t"
1883cabdff1aSopenharmony_ci                "punpckhbw %%mm7, %%mm7 \n\t"
1884cabdff1aSopenharmony_ci                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
1885cabdff1aSopenharmony_ci                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
1886cabdff1aSopenharmony_ci                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
1887cabdff1aSopenharmony_ci                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
1888cabdff1aSopenharmony_ci                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
1889cabdff1aSopenharmony_ci                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
1890cabdff1aSopenharmony_ci                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
1891cabdff1aSopenharmony_ci                MOVNTQ"    %%mm7, 56(%0,%2,2)"
1892cabdff1aSopenharmony_ci                :: "r"(d), "r"(s1), "r"(x)
1893cabdff1aSopenharmony_ci                :"memory");
1894cabdff1aSopenharmony_ci        }
1895cabdff1aSopenharmony_ci        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1896cabdff1aSopenharmony_ci    }
1897cabdff1aSopenharmony_ci    for (y=0;y<h;y++) {
1898cabdff1aSopenharmony_ci        const uint8_t* s2=src2+srcStride2*(y>>1);
1899cabdff1aSopenharmony_ci        uint8_t* d=dst2+dstStride2*y;
1900cabdff1aSopenharmony_ci        x=0;
1901cabdff1aSopenharmony_ci        for (;x<w-31;x+=32) {
1902cabdff1aSopenharmony_ci            __asm__ volatile(
1903cabdff1aSopenharmony_ci                PREFETCH"   32(%1,%2)        \n\t"
1904cabdff1aSopenharmony_ci                "movq         (%1,%2), %%mm0 \n\t"
1905cabdff1aSopenharmony_ci                "movq        8(%1,%2), %%mm2 \n\t"
1906cabdff1aSopenharmony_ci                "movq       16(%1,%2), %%mm4 \n\t"
1907cabdff1aSopenharmony_ci                "movq       24(%1,%2), %%mm6 \n\t"
1908cabdff1aSopenharmony_ci                "movq      %%mm0, %%mm1 \n\t"
1909cabdff1aSopenharmony_ci                "movq      %%mm2, %%mm3 \n\t"
1910cabdff1aSopenharmony_ci                "movq      %%mm4, %%mm5 \n\t"
1911cabdff1aSopenharmony_ci                "movq      %%mm6, %%mm7 \n\t"
1912cabdff1aSopenharmony_ci                "punpcklbw %%mm0, %%mm0 \n\t"
1913cabdff1aSopenharmony_ci                "punpckhbw %%mm1, %%mm1 \n\t"
1914cabdff1aSopenharmony_ci                "punpcklbw %%mm2, %%mm2 \n\t"
1915cabdff1aSopenharmony_ci                "punpckhbw %%mm3, %%mm3 \n\t"
1916cabdff1aSopenharmony_ci                "punpcklbw %%mm4, %%mm4 \n\t"
1917cabdff1aSopenharmony_ci                "punpckhbw %%mm5, %%mm5 \n\t"
1918cabdff1aSopenharmony_ci                "punpcklbw %%mm6, %%mm6 \n\t"
1919cabdff1aSopenharmony_ci                "punpckhbw %%mm7, %%mm7 \n\t"
1920cabdff1aSopenharmony_ci                MOVNTQ"    %%mm0,   (%0,%2,2)  \n\t"
1921cabdff1aSopenharmony_ci                MOVNTQ"    %%mm1,  8(%0,%2,2)  \n\t"
1922cabdff1aSopenharmony_ci                MOVNTQ"    %%mm2, 16(%0,%2,2)  \n\t"
1923cabdff1aSopenharmony_ci                MOVNTQ"    %%mm3, 24(%0,%2,2)  \n\t"
1924cabdff1aSopenharmony_ci                MOVNTQ"    %%mm4, 32(%0,%2,2)  \n\t"
1925cabdff1aSopenharmony_ci                MOVNTQ"    %%mm5, 40(%0,%2,2)  \n\t"
1926cabdff1aSopenharmony_ci                MOVNTQ"    %%mm6, 48(%0,%2,2)  \n\t"
1927cabdff1aSopenharmony_ci                MOVNTQ"    %%mm7, 56(%0,%2,2)"
1928cabdff1aSopenharmony_ci                :: "r"(d), "r"(s2), "r"(x)
1929cabdff1aSopenharmony_ci                :"memory");
1930cabdff1aSopenharmony_ci        }
1931cabdff1aSopenharmony_ci        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
1932cabdff1aSopenharmony_ci    }
1933cabdff1aSopenharmony_ci    __asm__(
1934cabdff1aSopenharmony_ci            EMMS"       \n\t"
1935cabdff1aSopenharmony_ci            SFENCE"     \n\t"
1936cabdff1aSopenharmony_ci            ::: "memory"
1937cabdff1aSopenharmony_ci        );
1938cabdff1aSopenharmony_ci}
1939cabdff1aSopenharmony_ci
1940cabdff1aSopenharmony_cistatic inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
1941cabdff1aSopenharmony_ci                                        uint8_t *dst,
1942cabdff1aSopenharmony_ci                                        int width, int height,
1943cabdff1aSopenharmony_ci                                        int srcStride1, int srcStride2,
1944cabdff1aSopenharmony_ci                                        int srcStride3, int dstStride)
1945cabdff1aSopenharmony_ci{
1946cabdff1aSopenharmony_ci    x86_reg x;
1947cabdff1aSopenharmony_ci    int y,w,h;
1948cabdff1aSopenharmony_ci    w=width/2; h=height;
1949cabdff1aSopenharmony_ci    for (y=0;y<h;y++) {
1950cabdff1aSopenharmony_ci        const uint8_t* yp=src1+srcStride1*y;
1951cabdff1aSopenharmony_ci        const uint8_t* up=src2+srcStride2*(y>>2);
1952cabdff1aSopenharmony_ci        const uint8_t* vp=src3+srcStride3*(y>>2);
1953cabdff1aSopenharmony_ci        uint8_t* d=dst+dstStride*y;
1954cabdff1aSopenharmony_ci        x=0;
1955cabdff1aSopenharmony_ci        for (;x<w-7;x+=8) {
1956cabdff1aSopenharmony_ci            __asm__ volatile(
1957cabdff1aSopenharmony_ci                PREFETCH"   32(%1, %0)          \n\t"
1958cabdff1aSopenharmony_ci                PREFETCH"   32(%2, %0)          \n\t"
1959cabdff1aSopenharmony_ci                PREFETCH"   32(%3, %0)          \n\t"
1960cabdff1aSopenharmony_ci                "movq      (%1, %0, 4), %%mm0   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1961cabdff1aSopenharmony_ci                "movq         (%2, %0), %%mm1   \n\t" /* U0U1U2U3U4U5U6U7 */
1962cabdff1aSopenharmony_ci                "movq         (%3, %0), %%mm2   \n\t" /* V0V1V2V3V4V5V6V7 */
1963cabdff1aSopenharmony_ci                "movq            %%mm0, %%mm3   \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1964cabdff1aSopenharmony_ci                "movq            %%mm1, %%mm4   \n\t" /* U0U1U2U3U4U5U6U7 */
1965cabdff1aSopenharmony_ci                "movq            %%mm2, %%mm5   \n\t" /* V0V1V2V3V4V5V6V7 */
1966cabdff1aSopenharmony_ci                "punpcklbw       %%mm1, %%mm1   \n\t" /* U0U0 U1U1 U2U2 U3U3 */
1967cabdff1aSopenharmony_ci                "punpcklbw       %%mm2, %%mm2   \n\t" /* V0V0 V1V1 V2V2 V3V3 */
1968cabdff1aSopenharmony_ci                "punpckhbw       %%mm4, %%mm4   \n\t" /* U4U4 U5U5 U6U6 U7U7 */
1969cabdff1aSopenharmony_ci                "punpckhbw       %%mm5, %%mm5   \n\t" /* V4V4 V5V5 V6V6 V7V7 */
1970cabdff1aSopenharmony_ci
1971cabdff1aSopenharmony_ci                "movq            %%mm1, %%mm6   \n\t"
1972cabdff1aSopenharmony_ci                "punpcklbw       %%mm2, %%mm1   \n\t" /* U0V0 U0V0 U1V1 U1V1*/
1973cabdff1aSopenharmony_ci                "punpcklbw       %%mm1, %%mm0   \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
1974cabdff1aSopenharmony_ci                "punpckhbw       %%mm1, %%mm3   \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
1975cabdff1aSopenharmony_ci                MOVNTQ"          %%mm0,  (%4, %0, 8)    \n\t"
1976cabdff1aSopenharmony_ci                MOVNTQ"          %%mm3, 8(%4, %0, 8)    \n\t"
1977cabdff1aSopenharmony_ci
1978cabdff1aSopenharmony_ci                "punpckhbw       %%mm2, %%mm6   \n\t" /* U2V2 U2V2 U3V3 U3V3*/
1979cabdff1aSopenharmony_ci                "movq     8(%1, %0, 4), %%mm0   \n\t"
1980cabdff1aSopenharmony_ci                "movq            %%mm0, %%mm3   \n\t"
1981cabdff1aSopenharmony_ci                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U2 Y V2 Y U2 Y V2*/
1982cabdff1aSopenharmony_ci                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U3 Y V3 Y U3 Y V3*/
1983cabdff1aSopenharmony_ci                MOVNTQ"          %%mm0, 16(%4, %0, 8)   \n\t"
1984cabdff1aSopenharmony_ci                MOVNTQ"          %%mm3, 24(%4, %0, 8)   \n\t"
1985cabdff1aSopenharmony_ci
1986cabdff1aSopenharmony_ci                "movq            %%mm4, %%mm6   \n\t"
1987cabdff1aSopenharmony_ci                "movq    16(%1, %0, 4), %%mm0   \n\t"
1988cabdff1aSopenharmony_ci                "movq            %%mm0, %%mm3   \n\t"
1989cabdff1aSopenharmony_ci                "punpcklbw       %%mm5, %%mm4   \n\t"
1990cabdff1aSopenharmony_ci                "punpcklbw       %%mm4, %%mm0   \n\t" /* Y U4 Y V4 Y U4 Y V4*/
1991cabdff1aSopenharmony_ci                "punpckhbw       %%mm4, %%mm3   \n\t" /* Y U5 Y V5 Y U5 Y V5*/
1992cabdff1aSopenharmony_ci                MOVNTQ"          %%mm0, 32(%4, %0, 8)   \n\t"
1993cabdff1aSopenharmony_ci                MOVNTQ"          %%mm3, 40(%4, %0, 8)   \n\t"
1994cabdff1aSopenharmony_ci
1995cabdff1aSopenharmony_ci                "punpckhbw       %%mm5, %%mm6   \n\t"
1996cabdff1aSopenharmony_ci                "movq    24(%1, %0, 4), %%mm0   \n\t"
1997cabdff1aSopenharmony_ci                "movq            %%mm0, %%mm3   \n\t"
1998cabdff1aSopenharmony_ci                "punpcklbw       %%mm6, %%mm0   \n\t" /* Y U6 Y V6 Y U6 Y V6*/
1999cabdff1aSopenharmony_ci                "punpckhbw       %%mm6, %%mm3   \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2000cabdff1aSopenharmony_ci                MOVNTQ"          %%mm0, 48(%4, %0, 8)   \n\t"
2001cabdff1aSopenharmony_ci                MOVNTQ"          %%mm3, 56(%4, %0, 8)   \n\t"
2002cabdff1aSopenharmony_ci
2003cabdff1aSopenharmony_ci                : "+r" (x)
2004cabdff1aSopenharmony_ci                : "r"(yp), "r" (up), "r"(vp), "r"(d)
2005cabdff1aSopenharmony_ci                :"memory");
2006cabdff1aSopenharmony_ci        }
2007cabdff1aSopenharmony_ci        for (; x<w; x++) {
2008cabdff1aSopenharmony_ci            const int x2 = x<<2;
2009cabdff1aSopenharmony_ci            d[8*x+0] = yp[x2];
2010cabdff1aSopenharmony_ci            d[8*x+1] = up[x];
2011cabdff1aSopenharmony_ci            d[8*x+2] = yp[x2+1];
2012cabdff1aSopenharmony_ci            d[8*x+3] = vp[x];
2013cabdff1aSopenharmony_ci            d[8*x+4] = yp[x2+2];
2014cabdff1aSopenharmony_ci            d[8*x+5] = up[x];
2015cabdff1aSopenharmony_ci            d[8*x+6] = yp[x2+3];
2016cabdff1aSopenharmony_ci            d[8*x+7] = vp[x];
2017cabdff1aSopenharmony_ci        }
2018cabdff1aSopenharmony_ci    }
2019cabdff1aSopenharmony_ci    __asm__(
2020cabdff1aSopenharmony_ci            EMMS"       \n\t"
2021cabdff1aSopenharmony_ci            SFENCE"     \n\t"
2022cabdff1aSopenharmony_ci            ::: "memory"
2023cabdff1aSopenharmony_ci        );
2024cabdff1aSopenharmony_ci}
2025cabdff1aSopenharmony_ci
2026cabdff1aSopenharmony_cistatic void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2027cabdff1aSopenharmony_ci{
2028cabdff1aSopenharmony_ci    dst +=   count;
2029cabdff1aSopenharmony_ci    src += 2*count;
2030cabdff1aSopenharmony_ci    count= - count;
2031cabdff1aSopenharmony_ci
2032cabdff1aSopenharmony_ci    if(count <= -16) {
2033cabdff1aSopenharmony_ci        count += 15;
2034cabdff1aSopenharmony_ci        __asm__ volatile(
2035cabdff1aSopenharmony_ci            "pcmpeqw       %%mm7, %%mm7        \n\t"
2036cabdff1aSopenharmony_ci            "psrlw            $8, %%mm7        \n\t"
2037cabdff1aSopenharmony_ci            "1:                                \n\t"
2038cabdff1aSopenharmony_ci            "movq -30(%1, %0, 2), %%mm0        \n\t"
2039cabdff1aSopenharmony_ci            "movq -22(%1, %0, 2), %%mm1        \n\t"
2040cabdff1aSopenharmony_ci            "movq -14(%1, %0, 2), %%mm2        \n\t"
2041cabdff1aSopenharmony_ci            "movq  -6(%1, %0, 2), %%mm3        \n\t"
2042cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm0        \n\t"
2043cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm1        \n\t"
2044cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm2        \n\t"
2045cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm3        \n\t"
2046cabdff1aSopenharmony_ci            "packuswb      %%mm1, %%mm0        \n\t"
2047cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm2        \n\t"
2048cabdff1aSopenharmony_ci            MOVNTQ"        %%mm0,-15(%2, %0)   \n\t"
2049cabdff1aSopenharmony_ci            MOVNTQ"        %%mm2,- 7(%2, %0)   \n\t"
2050cabdff1aSopenharmony_ci            "add             $16, %0           \n\t"
2051cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2052cabdff1aSopenharmony_ci            : "+r"(count)
2053cabdff1aSopenharmony_ci            : "r"(src), "r"(dst)
2054cabdff1aSopenharmony_ci        );
2055cabdff1aSopenharmony_ci        count -= 15;
2056cabdff1aSopenharmony_ci    }
2057cabdff1aSopenharmony_ci    while(count<0) {
2058cabdff1aSopenharmony_ci        dst[count]= src[2*count];
2059cabdff1aSopenharmony_ci        count++;
2060cabdff1aSopenharmony_ci    }
2061cabdff1aSopenharmony_ci}
2062cabdff1aSopenharmony_ci
2063cabdff1aSopenharmony_cistatic void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
2064cabdff1aSopenharmony_ci{
2065cabdff1aSopenharmony_ci    src ++;
2066cabdff1aSopenharmony_ci    dst +=   count;
2067cabdff1aSopenharmony_ci    src += 2*count;
2068cabdff1aSopenharmony_ci    count= - count;
2069cabdff1aSopenharmony_ci
2070cabdff1aSopenharmony_ci    if(count < -16) {
2071cabdff1aSopenharmony_ci        count += 16;
2072cabdff1aSopenharmony_ci        __asm__ volatile(
2073cabdff1aSopenharmony_ci            "pcmpeqw       %%mm7, %%mm7        \n\t"
2074cabdff1aSopenharmony_ci            "psrlw            $8, %%mm7        \n\t"
2075cabdff1aSopenharmony_ci            "1:                                \n\t"
2076cabdff1aSopenharmony_ci            "movq -32(%1, %0, 2), %%mm0        \n\t"
2077cabdff1aSopenharmony_ci            "movq -24(%1, %0, 2), %%mm1        \n\t"
2078cabdff1aSopenharmony_ci            "movq -16(%1, %0, 2), %%mm2        \n\t"
2079cabdff1aSopenharmony_ci            "movq  -8(%1, %0, 2), %%mm3        \n\t"
2080cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm0        \n\t"
2081cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm1        \n\t"
2082cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm2        \n\t"
2083cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm3        \n\t"
2084cabdff1aSopenharmony_ci            "packuswb      %%mm1, %%mm0        \n\t"
2085cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm2        \n\t"
2086cabdff1aSopenharmony_ci            MOVNTQ"        %%mm0,-16(%2, %0)   \n\t"
2087cabdff1aSopenharmony_ci            MOVNTQ"        %%mm2,- 8(%2, %0)   \n\t"
2088cabdff1aSopenharmony_ci            "add             $16, %0           \n\t"
2089cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2090cabdff1aSopenharmony_ci            : "+r"(count)
2091cabdff1aSopenharmony_ci            : "r"(src), "r"(dst)
2092cabdff1aSopenharmony_ci        );
2093cabdff1aSopenharmony_ci        count -= 16;
2094cabdff1aSopenharmony_ci    }
2095cabdff1aSopenharmony_ci    while(count<0) {
2096cabdff1aSopenharmony_ci        dst[count]= src[2*count];
2097cabdff1aSopenharmony_ci        count++;
2098cabdff1aSopenharmony_ci    }
2099cabdff1aSopenharmony_ci}
2100cabdff1aSopenharmony_ci
2101cabdff1aSopenharmony_ci#if ARCH_X86_32
2102cabdff1aSopenharmony_cistatic void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2103cabdff1aSopenharmony_ci{
2104cabdff1aSopenharmony_ci    dst0+=   count;
2105cabdff1aSopenharmony_ci    dst1+=   count;
2106cabdff1aSopenharmony_ci    src += 4*count;
2107cabdff1aSopenharmony_ci    count= - count;
2108cabdff1aSopenharmony_ci    if(count <= -8) {
2109cabdff1aSopenharmony_ci        count += 7;
2110cabdff1aSopenharmony_ci        __asm__ volatile(
2111cabdff1aSopenharmony_ci            "pcmpeqw       %%mm7, %%mm7        \n\t"
2112cabdff1aSopenharmony_ci            "psrlw            $8, %%mm7        \n\t"
2113cabdff1aSopenharmony_ci            "1:                                \n\t"
2114cabdff1aSopenharmony_ci            "movq -28(%1, %0, 4), %%mm0        \n\t"
2115cabdff1aSopenharmony_ci            "movq -20(%1, %0, 4), %%mm1        \n\t"
2116cabdff1aSopenharmony_ci            "movq -12(%1, %0, 4), %%mm2        \n\t"
2117cabdff1aSopenharmony_ci            "movq  -4(%1, %0, 4), %%mm3        \n\t"
2118cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm0        \n\t"
2119cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm1        \n\t"
2120cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm2        \n\t"
2121cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm3        \n\t"
2122cabdff1aSopenharmony_ci            "packuswb      %%mm1, %%mm0        \n\t"
2123cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm2        \n\t"
2124cabdff1aSopenharmony_ci            "movq          %%mm0, %%mm1        \n\t"
2125cabdff1aSopenharmony_ci            "movq          %%mm2, %%mm3        \n\t"
2126cabdff1aSopenharmony_ci            "psrlw            $8, %%mm0        \n\t"
2127cabdff1aSopenharmony_ci            "psrlw            $8, %%mm2        \n\t"
2128cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm1        \n\t"
2129cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm3        \n\t"
2130cabdff1aSopenharmony_ci            "packuswb      %%mm2, %%mm0        \n\t"
2131cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm1        \n\t"
2132cabdff1aSopenharmony_ci            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
2133cabdff1aSopenharmony_ci            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
2134cabdff1aSopenharmony_ci            "add              $8, %0           \n\t"
2135cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2136cabdff1aSopenharmony_ci            : "+r"(count)
2137cabdff1aSopenharmony_ci            : "r"(src), "r"(dst0), "r"(dst1)
2138cabdff1aSopenharmony_ci        );
2139cabdff1aSopenharmony_ci        count -= 7;
2140cabdff1aSopenharmony_ci    }
2141cabdff1aSopenharmony_ci    while(count<0) {
2142cabdff1aSopenharmony_ci        dst0[count]= src[4*count+0];
2143cabdff1aSopenharmony_ci        dst1[count]= src[4*count+2];
2144cabdff1aSopenharmony_ci        count++;
2145cabdff1aSopenharmony_ci    }
2146cabdff1aSopenharmony_ci}
2147cabdff1aSopenharmony_ci#endif /* ARCH_X86_32 */
2148cabdff1aSopenharmony_ci
2149cabdff1aSopenharmony_cistatic void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2150cabdff1aSopenharmony_ci{
2151cabdff1aSopenharmony_ci    dst0 +=   count;
2152cabdff1aSopenharmony_ci    dst1 +=   count;
2153cabdff1aSopenharmony_ci    src0 += 4*count;
2154cabdff1aSopenharmony_ci    src1 += 4*count;
2155cabdff1aSopenharmony_ci    count= - count;
2156cabdff1aSopenharmony_ci#ifdef PAVGB
2157cabdff1aSopenharmony_ci    if(count <= -8) {
2158cabdff1aSopenharmony_ci        count += 7;
2159cabdff1aSopenharmony_ci        __asm__ volatile(
2160cabdff1aSopenharmony_ci            "pcmpeqw        %%mm7, %%mm7        \n\t"
2161cabdff1aSopenharmony_ci            "psrlw             $8, %%mm7        \n\t"
2162cabdff1aSopenharmony_ci            "1:                                \n\t"
2163cabdff1aSopenharmony_ci            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2164cabdff1aSopenharmony_ci            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2165cabdff1aSopenharmony_ci            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2166cabdff1aSopenharmony_ci            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2167cabdff1aSopenharmony_ci            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2168cabdff1aSopenharmony_ci            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2169cabdff1aSopenharmony_ci            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2170cabdff1aSopenharmony_ci            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2171cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm0        \n\t"
2172cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm1        \n\t"
2173cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm2        \n\t"
2174cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm3        \n\t"
2175cabdff1aSopenharmony_ci            "packuswb       %%mm1, %%mm0        \n\t"
2176cabdff1aSopenharmony_ci            "packuswb       %%mm3, %%mm2        \n\t"
2177cabdff1aSopenharmony_ci            "movq           %%mm0, %%mm1        \n\t"
2178cabdff1aSopenharmony_ci            "movq           %%mm2, %%mm3        \n\t"
2179cabdff1aSopenharmony_ci            "psrlw             $8, %%mm0        \n\t"
2180cabdff1aSopenharmony_ci            "psrlw             $8, %%mm2        \n\t"
2181cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm1        \n\t"
2182cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm3        \n\t"
2183cabdff1aSopenharmony_ci            "packuswb       %%mm2, %%mm0        \n\t"
2184cabdff1aSopenharmony_ci            "packuswb       %%mm3, %%mm1        \n\t"
2185cabdff1aSopenharmony_ci            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2186cabdff1aSopenharmony_ci            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2187cabdff1aSopenharmony_ci            "add               $8, %0           \n\t"
2188cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2189cabdff1aSopenharmony_ci            : "+r"(count)
2190cabdff1aSopenharmony_ci            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2191cabdff1aSopenharmony_ci        );
2192cabdff1aSopenharmony_ci        count -= 7;
2193cabdff1aSopenharmony_ci    }
2194cabdff1aSopenharmony_ci#endif
2195cabdff1aSopenharmony_ci    while(count<0) {
2196cabdff1aSopenharmony_ci        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2197cabdff1aSopenharmony_ci        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2198cabdff1aSopenharmony_ci        count++;
2199cabdff1aSopenharmony_ci    }
2200cabdff1aSopenharmony_ci}
2201cabdff1aSopenharmony_ci
2202cabdff1aSopenharmony_cistatic void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2203cabdff1aSopenharmony_ci{
2204cabdff1aSopenharmony_ci    dst0+=   count;
2205cabdff1aSopenharmony_ci    dst1+=   count;
2206cabdff1aSopenharmony_ci    src += 4*count;
2207cabdff1aSopenharmony_ci    count= - count;
2208cabdff1aSopenharmony_ci    if(count <= -8) {
2209cabdff1aSopenharmony_ci        count += 7;
2210cabdff1aSopenharmony_ci        __asm__ volatile(
2211cabdff1aSopenharmony_ci            "pcmpeqw       %%mm7, %%mm7        \n\t"
2212cabdff1aSopenharmony_ci            "psrlw            $8, %%mm7        \n\t"
2213cabdff1aSopenharmony_ci            "1:                                \n\t"
2214cabdff1aSopenharmony_ci            "movq -28(%1, %0, 4), %%mm0        \n\t"
2215cabdff1aSopenharmony_ci            "movq -20(%1, %0, 4), %%mm1        \n\t"
2216cabdff1aSopenharmony_ci            "movq -12(%1, %0, 4), %%mm2        \n\t"
2217cabdff1aSopenharmony_ci            "movq  -4(%1, %0, 4), %%mm3        \n\t"
2218cabdff1aSopenharmony_ci            "psrlw            $8, %%mm0        \n\t"
2219cabdff1aSopenharmony_ci            "psrlw            $8, %%mm1        \n\t"
2220cabdff1aSopenharmony_ci            "psrlw            $8, %%mm2        \n\t"
2221cabdff1aSopenharmony_ci            "psrlw            $8, %%mm3        \n\t"
2222cabdff1aSopenharmony_ci            "packuswb      %%mm1, %%mm0        \n\t"
2223cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm2        \n\t"
2224cabdff1aSopenharmony_ci            "movq          %%mm0, %%mm1        \n\t"
2225cabdff1aSopenharmony_ci            "movq          %%mm2, %%mm3        \n\t"
2226cabdff1aSopenharmony_ci            "psrlw            $8, %%mm0        \n\t"
2227cabdff1aSopenharmony_ci            "psrlw            $8, %%mm2        \n\t"
2228cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm1        \n\t"
2229cabdff1aSopenharmony_ci            "pand          %%mm7, %%mm3        \n\t"
2230cabdff1aSopenharmony_ci            "packuswb      %%mm2, %%mm0        \n\t"
2231cabdff1aSopenharmony_ci            "packuswb      %%mm3, %%mm1        \n\t"
2232cabdff1aSopenharmony_ci            MOVNTQ"        %%mm0,- 7(%3, %0)   \n\t"
2233cabdff1aSopenharmony_ci            MOVNTQ"        %%mm1,- 7(%2, %0)   \n\t"
2234cabdff1aSopenharmony_ci            "add              $8, %0           \n\t"
2235cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2236cabdff1aSopenharmony_ci            : "+r"(count)
2237cabdff1aSopenharmony_ci            : "r"(src), "r"(dst0), "r"(dst1)
2238cabdff1aSopenharmony_ci        );
2239cabdff1aSopenharmony_ci        count -= 7;
2240cabdff1aSopenharmony_ci    }
2241cabdff1aSopenharmony_ci    src++;
2242cabdff1aSopenharmony_ci    while(count<0) {
2243cabdff1aSopenharmony_ci        dst0[count]= src[4*count+0];
2244cabdff1aSopenharmony_ci        dst1[count]= src[4*count+2];
2245cabdff1aSopenharmony_ci        count++;
2246cabdff1aSopenharmony_ci    }
2247cabdff1aSopenharmony_ci}
2248cabdff1aSopenharmony_ci
2249cabdff1aSopenharmony_cistatic void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2250cabdff1aSopenharmony_ci{
2251cabdff1aSopenharmony_ci    dst0 +=   count;
2252cabdff1aSopenharmony_ci    dst1 +=   count;
2253cabdff1aSopenharmony_ci    src0 += 4*count;
2254cabdff1aSopenharmony_ci    src1 += 4*count;
2255cabdff1aSopenharmony_ci    count= - count;
2256cabdff1aSopenharmony_ci#ifdef PAVGB
2257cabdff1aSopenharmony_ci    if(count <= -8) {
2258cabdff1aSopenharmony_ci        count += 7;
2259cabdff1aSopenharmony_ci        __asm__ volatile(
2260cabdff1aSopenharmony_ci            "pcmpeqw        %%mm7, %%mm7        \n\t"
2261cabdff1aSopenharmony_ci            "psrlw             $8, %%mm7        \n\t"
2262cabdff1aSopenharmony_ci            "1:                                \n\t"
2263cabdff1aSopenharmony_ci            "movq  -28(%1, %0, 4), %%mm0        \n\t"
2264cabdff1aSopenharmony_ci            "movq  -20(%1, %0, 4), %%mm1        \n\t"
2265cabdff1aSopenharmony_ci            "movq  -12(%1, %0, 4), %%mm2        \n\t"
2266cabdff1aSopenharmony_ci            "movq   -4(%1, %0, 4), %%mm3        \n\t"
2267cabdff1aSopenharmony_ci            PAVGB" -28(%2, %0, 4), %%mm0        \n\t"
2268cabdff1aSopenharmony_ci            PAVGB" -20(%2, %0, 4), %%mm1        \n\t"
2269cabdff1aSopenharmony_ci            PAVGB" -12(%2, %0, 4), %%mm2        \n\t"
2270cabdff1aSopenharmony_ci            PAVGB" - 4(%2, %0, 4), %%mm3        \n\t"
2271cabdff1aSopenharmony_ci            "psrlw             $8, %%mm0        \n\t"
2272cabdff1aSopenharmony_ci            "psrlw             $8, %%mm1        \n\t"
2273cabdff1aSopenharmony_ci            "psrlw             $8, %%mm2        \n\t"
2274cabdff1aSopenharmony_ci            "psrlw             $8, %%mm3        \n\t"
2275cabdff1aSopenharmony_ci            "packuswb       %%mm1, %%mm0        \n\t"
2276cabdff1aSopenharmony_ci            "packuswb       %%mm3, %%mm2        \n\t"
2277cabdff1aSopenharmony_ci            "movq           %%mm0, %%mm1        \n\t"
2278cabdff1aSopenharmony_ci            "movq           %%mm2, %%mm3        \n\t"
2279cabdff1aSopenharmony_ci            "psrlw             $8, %%mm0        \n\t"
2280cabdff1aSopenharmony_ci            "psrlw             $8, %%mm2        \n\t"
2281cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm1        \n\t"
2282cabdff1aSopenharmony_ci            "pand           %%mm7, %%mm3        \n\t"
2283cabdff1aSopenharmony_ci            "packuswb       %%mm2, %%mm0        \n\t"
2284cabdff1aSopenharmony_ci            "packuswb       %%mm3, %%mm1        \n\t"
2285cabdff1aSopenharmony_ci            MOVNTQ"         %%mm0,- 7(%4, %0)   \n\t"
2286cabdff1aSopenharmony_ci            MOVNTQ"         %%mm1,- 7(%3, %0)   \n\t"
2287cabdff1aSopenharmony_ci            "add               $8, %0           \n\t"
2288cabdff1aSopenharmony_ci            " js 1b                            \n\t"
2289cabdff1aSopenharmony_ci            : "+r"(count)
2290cabdff1aSopenharmony_ci            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2291cabdff1aSopenharmony_ci        );
2292cabdff1aSopenharmony_ci        count -= 7;
2293cabdff1aSopenharmony_ci    }
2294cabdff1aSopenharmony_ci#endif
2295cabdff1aSopenharmony_ci    src0++;
2296cabdff1aSopenharmony_ci    src1++;
2297cabdff1aSopenharmony_ci    while(count<0) {
2298cabdff1aSopenharmony_ci        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2299cabdff1aSopenharmony_ci        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2300cabdff1aSopenharmony_ci        count++;
2301cabdff1aSopenharmony_ci    }
2302cabdff1aSopenharmony_ci}
2303cabdff1aSopenharmony_ci
2304cabdff1aSopenharmony_cistatic void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2305cabdff1aSopenharmony_ci                                 int width, int height,
2306cabdff1aSopenharmony_ci                                 int lumStride, int chromStride, int srcStride)
2307cabdff1aSopenharmony_ci{
2308cabdff1aSopenharmony_ci    int y;
2309cabdff1aSopenharmony_ci    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2310cabdff1aSopenharmony_ci
2311cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
2312cabdff1aSopenharmony_ci        RENAME(extract_even)(src, ydst, width);
2313cabdff1aSopenharmony_ci        if(y&1) {
2314cabdff1aSopenharmony_ci            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2315cabdff1aSopenharmony_ci            udst+= chromStride;
2316cabdff1aSopenharmony_ci            vdst+= chromStride;
2317cabdff1aSopenharmony_ci        }
2318cabdff1aSopenharmony_ci
2319cabdff1aSopenharmony_ci        src += srcStride;
2320cabdff1aSopenharmony_ci        ydst+= lumStride;
2321cabdff1aSopenharmony_ci    }
2322cabdff1aSopenharmony_ci    __asm__(
2323cabdff1aSopenharmony_ci            EMMS"       \n\t"
2324cabdff1aSopenharmony_ci            SFENCE"     \n\t"
2325cabdff1aSopenharmony_ci            ::: "memory"
2326cabdff1aSopenharmony_ci        );
2327cabdff1aSopenharmony_ci}
2328cabdff1aSopenharmony_ci
2329cabdff1aSopenharmony_cistatic void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2330cabdff1aSopenharmony_ci                                 int width, int height,
2331cabdff1aSopenharmony_ci                                 int lumStride, int chromStride, int srcStride)
2332cabdff1aSopenharmony_ci{
2333cabdff1aSopenharmony_ci    int y;
2334cabdff1aSopenharmony_ci    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2335cabdff1aSopenharmony_ci
2336cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
2337cabdff1aSopenharmony_ci        RENAME(extract_even)(src, ydst, width);
2338cabdff1aSopenharmony_ci        RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2339cabdff1aSopenharmony_ci
2340cabdff1aSopenharmony_ci        src += srcStride;
2341cabdff1aSopenharmony_ci        ydst+= lumStride;
2342cabdff1aSopenharmony_ci        udst+= chromStride;
2343cabdff1aSopenharmony_ci        vdst+= chromStride;
2344cabdff1aSopenharmony_ci    }
2345cabdff1aSopenharmony_ci    __asm__(
2346cabdff1aSopenharmony_ci            EMMS"       \n\t"
2347cabdff1aSopenharmony_ci            SFENCE"     \n\t"
2348cabdff1aSopenharmony_ci            ::: "memory"
2349cabdff1aSopenharmony_ci        );
2350cabdff1aSopenharmony_ci}
2351cabdff1aSopenharmony_ci
2352cabdff1aSopenharmony_cistatic void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2353cabdff1aSopenharmony_ci                                 int width, int height,
2354cabdff1aSopenharmony_ci                                 int lumStride, int chromStride, int srcStride)
2355cabdff1aSopenharmony_ci{
2356cabdff1aSopenharmony_ci    int y;
2357cabdff1aSopenharmony_ci    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2358cabdff1aSopenharmony_ci
2359cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
2360cabdff1aSopenharmony_ci        RENAME(extract_odd)(src, ydst, width);
2361cabdff1aSopenharmony_ci        if(y&1) {
2362cabdff1aSopenharmony_ci            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2363cabdff1aSopenharmony_ci            udst+= chromStride;
2364cabdff1aSopenharmony_ci            vdst+= chromStride;
2365cabdff1aSopenharmony_ci        }
2366cabdff1aSopenharmony_ci
2367cabdff1aSopenharmony_ci        src += srcStride;
2368cabdff1aSopenharmony_ci        ydst+= lumStride;
2369cabdff1aSopenharmony_ci    }
2370cabdff1aSopenharmony_ci    __asm__(
2371cabdff1aSopenharmony_ci            EMMS"       \n\t"
2372cabdff1aSopenharmony_ci            SFENCE"     \n\t"
2373cabdff1aSopenharmony_ci            ::: "memory"
2374cabdff1aSopenharmony_ci        );
2375cabdff1aSopenharmony_ci}
2376cabdff1aSopenharmony_ci
2377cabdff1aSopenharmony_ci#if ARCH_X86_32
2378cabdff1aSopenharmony_cistatic void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2379cabdff1aSopenharmony_ci                                 int width, int height,
2380cabdff1aSopenharmony_ci                                 int lumStride, int chromStride, int srcStride)
2381cabdff1aSopenharmony_ci{
2382cabdff1aSopenharmony_ci    int y;
2383cabdff1aSopenharmony_ci    const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2384cabdff1aSopenharmony_ci
2385cabdff1aSopenharmony_ci    for (y=0; y<height; y++) {
2386cabdff1aSopenharmony_ci        RENAME(extract_odd)(src, ydst, width);
2387cabdff1aSopenharmony_ci        RENAME(extract_even2)(src, udst, vdst, chromWidth);
2388cabdff1aSopenharmony_ci
2389cabdff1aSopenharmony_ci        src += srcStride;
2390cabdff1aSopenharmony_ci        ydst+= lumStride;
2391cabdff1aSopenharmony_ci        udst+= chromStride;
2392cabdff1aSopenharmony_ci        vdst+= chromStride;
2393cabdff1aSopenharmony_ci    }
2394cabdff1aSopenharmony_ci    __asm__(
2395cabdff1aSopenharmony_ci            EMMS"       \n\t"
2396cabdff1aSopenharmony_ci            SFENCE"     \n\t"
2397cabdff1aSopenharmony_ci            ::: "memory"
2398cabdff1aSopenharmony_ci        );
2399cabdff1aSopenharmony_ci}
2400cabdff1aSopenharmony_ci#endif /* ARCH_X86_32 */
2401cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_SSE2 */
2402cabdff1aSopenharmony_ci
2403cabdff1aSopenharmony_cistatic av_cold void RENAME(rgb2rgb_init)(void)
2404cabdff1aSopenharmony_ci{
2405cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_SSE2
2406cabdff1aSopenharmony_ci    rgb15to16          = RENAME(rgb15to16);
2407cabdff1aSopenharmony_ci    rgb15tobgr24       = RENAME(rgb15tobgr24);
2408cabdff1aSopenharmony_ci    rgb15to32          = RENAME(rgb15to32);
2409cabdff1aSopenharmony_ci    rgb16tobgr24       = RENAME(rgb16tobgr24);
2410cabdff1aSopenharmony_ci    rgb16to32          = RENAME(rgb16to32);
2411cabdff1aSopenharmony_ci    rgb16to15          = RENAME(rgb16to15);
2412cabdff1aSopenharmony_ci    rgb24tobgr16       = RENAME(rgb24tobgr16);
2413cabdff1aSopenharmony_ci    rgb24tobgr15       = RENAME(rgb24tobgr15);
2414cabdff1aSopenharmony_ci    rgb24tobgr32       = RENAME(rgb24tobgr32);
2415cabdff1aSopenharmony_ci    rgb32to16          = RENAME(rgb32to16);
2416cabdff1aSopenharmony_ci    rgb32to15          = RENAME(rgb32to15);
2417cabdff1aSopenharmony_ci    rgb32tobgr24       = RENAME(rgb32tobgr24);
2418cabdff1aSopenharmony_ci    rgb24to15          = RENAME(rgb24to15);
2419cabdff1aSopenharmony_ci    rgb24to16          = RENAME(rgb24to16);
2420cabdff1aSopenharmony_ci    rgb24tobgr24       = RENAME(rgb24tobgr24);
2421cabdff1aSopenharmony_ci    rgb32tobgr16       = RENAME(rgb32tobgr16);
2422cabdff1aSopenharmony_ci    rgb32tobgr15       = RENAME(rgb32tobgr15);
2423cabdff1aSopenharmony_ci    yv12toyuy2         = RENAME(yv12toyuy2);
2424cabdff1aSopenharmony_ci    yv12touyvy         = RENAME(yv12touyvy);
2425cabdff1aSopenharmony_ci    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
2426cabdff1aSopenharmony_ci    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
2427cabdff1aSopenharmony_ci    yuy2toyv12         = RENAME(yuy2toyv12);
2428cabdff1aSopenharmony_ci    vu9_to_vu12        = RENAME(vu9_to_vu12);
2429cabdff1aSopenharmony_ci    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
2430cabdff1aSopenharmony_ci#if ARCH_X86_32
2431cabdff1aSopenharmony_ci    uyvytoyuv422       = RENAME(uyvytoyuv422);
2432cabdff1aSopenharmony_ci#endif
2433cabdff1aSopenharmony_ci    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
2434cabdff1aSopenharmony_ci
2435cabdff1aSopenharmony_ci    planar2x           = RENAME(planar2x);
2436cabdff1aSopenharmony_ci#if HAVE_7REGS
2437cabdff1aSopenharmony_ci    ff_rgb24toyv12     = RENAME(rgb24toyv12);
2438cabdff1aSopenharmony_ci#endif /* HAVE_7REGS */
2439cabdff1aSopenharmony_ci
2440cabdff1aSopenharmony_ci    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
2441cabdff1aSopenharmony_ci    uyvytoyuv420       = RENAME(uyvytoyuv420);
2442cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_SSE2 */
2443cabdff1aSopenharmony_ci
2444cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2
2445cabdff1aSopenharmony_ci    interleaveBytes    = RENAME(interleaveBytes);
2446cabdff1aSopenharmony_ci#endif /* !COMPILE_TEMPLATE_AVX && COMPILE_TEMPLATE_SSE2 */
2447cabdff1aSopenharmony_ci#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
2448cabdff1aSopenharmony_ci#if COMPILE_TEMPLATE_SSE2 && HAVE_X86ASM
2449cabdff1aSopenharmony_ci    deinterleaveBytes  = RENAME(deinterleaveBytes);
2450cabdff1aSopenharmony_ci#endif
2451cabdff1aSopenharmony_ci#endif
2452cabdff1aSopenharmony_ci}
2453