/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/util_altivec.h"

#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F))

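/*
 * load_alignment() fills srcM2..srcP3 with the six byte-shifted copies of a
 * source row that the 6-tap filter needs.  On big-endian AltiVec this is
 * built from aligned vec_ld() loads combined with vec_perm(); the switch on
 * the source alignment covers the cases where one of the shifted vectors
 * coincides exactly with the second aligned load (srcR2) or spills over into
 * a third aligned load (srcR3).  On little-endian VSX the unaligned
 * vec_vsx_ld() loads provide the shifted vectors directly.
 */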
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
 }
#endif /* HAVE_BIGENDIAN */

/* this code assumes stride % 16 == 0 */
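/*
 * Horizontal half-pel filter for one 16x16 luma block.  Each output pixel is
 * (src[-2] - 5*src[-1] + 20*src[0] + 20*src[1] - 5*src[2] + src[3] + 16) >> 5,
 * clipped to 8 bits and combined with dst through OP_U8_ALTIVEC (put or avg,
 * depending on the including template).
 */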
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

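    /* For each of the 16 rows: load the six shifted source vectors,
     * zero-extend them into two vec_s16 halves (A and B), run the filter on
     * both halves, then pack and store 16 output pixels at once. */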
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

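        /* Pair the samples that share a filter weight: (P0+P1) gets 20,
         * (M1+P2) gets -5 and (M2+P3) gets 1; the +16 rounding bias is
         * folded into the multiply-add through v16ss. */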
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

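        /* Shift right by 5 and clip to [0,255] with an unsigned-saturating
         * pack. */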
        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */

/* this code assumes stride % 16 == 0 */
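/*
 * Vertical half-pel filter for one 16x16 luma block: the same 6-tap kernel
 * as above, applied down each column.  The six input rows live in a sliding
 * window of registers, so each iteration only loads one new row.
 */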
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
              psumA, psumB, sumA, sumB,
              srcP3ssA, srcP3ssB,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

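    /* 16 output rows: each iteration loads one new source row while the
     * other five rows of the vertical filter window stay in registers. */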
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

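        /* Slide the six-row filter window down by one row. */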
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */

/* this code assumes stride % 16 == 0 *and* that tmp is properly aligned */
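/*
 * Centre (horizontal + vertical) half-pel filter.  The first pass runs the
 * horizontal 6-tap kernel over 16 + 5 rows and stores the unrounded 16-bit
 * intermediates in tmp; the second pass applies the kernel vertically on tmp
 * in 32-bit precision and produces (sum + 512) >> 10, clipped to 8 bits.
 */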
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
              srcP2A, srcP2B, srcP3A, srcP3B,
              srcM1A, srcM1B, srcM2A, srcM2B,
              sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
              pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
              tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
              tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
              pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
              pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
              ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

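    /* First (horizontal) pass: start two rows above the block and write
     * 16 + 5 rows of unrounded 16-bit intermediates into tmp. */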
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* tmp is an int16_t pointer and tmpStride is 16, so this advances one 32-byte row */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

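    /* Second (vertical) pass over the 16-bit intermediates.  The products no
     * longer fit in 16 bits, so the even and odd lanes are multiplied into
     * separate 32-bit vectors with vec_mule()/vec_mulo() and re-interleaved
     * only after the final shift. */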
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

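        /* Slide the six-row window of intermediates down by one row. */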
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
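        /* sum3 has weight 1: an arithmetic shift of the 32-bit view right by
         * 16 sign-extends one 16-bit lane per word, giving the lanes that
         * pair with vec_mule(); the permute above fixes up the lane order on
         * little-endian. */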
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */