;******************************************************************************
;* SIMD-optimized clear block functions
;* Copyright (c) 2002 Michael Niedermayer
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2009 Fiona Glaser
;*
;* AVX version by Jokyo Images
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NOTE(review): cglobal, INIT_XMM/INIT_YMM, mova, RET, mmsize and the m0 /
; blocksq / lenq register aliases are not defined in this file; they are
; presumably provided by this include -- confirm.
%include "libavutil/x86/x86util.asm"

SECTION .text

;----------------------------------------
; void ff_clear_block(int16_t *blocks);
;----------------------------------------
; Writes zeros over the first 128 bytes at blocks (64 int16_t values) using
; 4 * %2 full-register stores of mmsize bytes each.
;
; %1 = number of xmm registers used
; %2 = number of inline store loops (each loop accounts for four stores)
;
; ZERO has to be %defined as a three-operand xor mnemonic before the macro
; is instantiated.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, in
; which case blocks must be mmsize-aligned -- confirm against the callers.
%macro CLEAR_BLOCK 2
%assign %%block_bytes 128                   ; bytes cleared per call
; Refuse to assemble an instantiation whose stores would fall short of, or
; run past, the block.
%if mmsize*4*(%2) != %%block_bytes
    %error "CLEAR_BLOCK: register size and loop count do not cover exactly 128 bytes"
%endif
cglobal clear_block, 1, 1, %1, blocks
    ZERO  m0, m0, m0                        ; m0 = all zero bits
%assign %%off 0
%rep 4*(%2)
    mova  [blocksq+%%off], m0
%assign %%off %%off+mmsize
%endrep
    RET
%endmacro

; Both instantiations share the ZERO definition below.
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCK 1, 2                            ; 2 loops * 4 stores * 16 bytes
INIT_YMM avx
CLEAR_BLOCK 1, 1                            ; 1 loop  * 4 stores * 32 bytes

;-----------------------------------------
; void ff_clear_blocks(int16_t *blocks);
;-----------------------------------------
; Writes zeros over 768 contiguous bytes starting at blocks, i.e. six
; consecutive 64-element int16_t blocks.
;
; blocksq is first advanced to the end of the area and lenq then runs from
; -768 up to 0, so one add both steps the offset and sets the sign flag
; that terminates the loop.  Each iteration issues eight mmsize-byte stores.
;
; %1 = number of xmm registers used
;
; ZERO has to be %defined as a three-operand xor mnemonic before the macro
; is instantiated.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, in
; which case blocks must be mmsize-aligned -- confirm against the callers.
%macro CLEAR_BLOCKS 1
%assign %%total_bytes 768                   ; 6 blocks * 64 coeffs * 2 bytes
%assign %%unroll      8                     ; stores per loop iteration
%assign %%step        mmsize*%%unroll       ; bytes cleared per iteration
; The loop only stops on a non-negative offset, so a stride that does not
; divide the total would make the last iteration write past the end.
%if (%%total_bytes / %%step) * %%step != %%total_bytes
    %error "CLEAR_BLOCKS: 768 bytes is not a whole number of loop iterations"
%endif
cglobal clear_blocks, 1, 2, %1, blocks, len
    add   blocksq, %%total_bytes            ; blocksq = one past the end
    mov      lenq, -%%total_bytes           ; negative offset from the end
    ZERO       m0, m0, m0                   ; m0 = all zero bits
.loop:
%assign %%n 0
%rep %%unroll
    mova  [blocksq+lenq+mmsize*%%n], m0
%assign %%n %%n+1
%endrep
    add   lenq, %%step
    js .loop                                ; offset still negative: go on
    RET
%endmacro

; Both instantiations share the ZERO definition below.
INIT_XMM sse
%define ZERO xorps
CLEAR_BLOCKS 1                              ; 6 iterations of 128 bytes
INIT_YMM avx
CLEAR_BLOCKS 1                              ; 3 iterations of 256 bytes
