/*
 * Copyright (c) 2009 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/*
 * NEON implementations of H.264-style intra prediction (16x16 luma and
 * 8x8 chroma modes: DC variants, horizontal, vertical, plane).
 *
 * Register convention used throughout (evident from the code; the C
 * prototype is presumably uint8_t *src, ptrdiff_t stride — TODO confirm
 * against the h264pred dispatch table):
 *   r0 = block base pointer (predictions are stored here)
 *   r1 = line stride in bytes
 *   r2, r3 = scratch
 * Neighbouring pixels are read relative to r0: the row above the block
 * at src - stride, the column to the left at src - 1.
 */

#include "libavutil/arm/asm.S"

@ Load a column of \n bytes starting at [\rs] with row stride \rt into
@ the lanes of d-register \rd, post-incrementing \rs past each row.
@ \n == 8       -> fill all eight lanes
@ \n == 4, hi=0 -> fill lanes 0-3 only
@ \n == 4, hi=1 -> fill lanes 4-7 only (lanes 0-3 left untouched)
.macro  ldcol.8         rd,  rs,  rt,  n=8,  hi=0
.if \n == 8 || \hi == 0
        vld1.8          {\rd[0]}, [\rs], \rt
        vld1.8          {\rd[1]}, [\rs], \rt
        vld1.8          {\rd[2]}, [\rs], \rt
        vld1.8          {\rd[3]}, [\rs], \rt
.endif
.if \n == 8 || \hi == 1
        vld1.8          {\rd[4]}, [\rs], \rt
        vld1.8          {\rd[5]}, [\rs], \rt
        vld1.8          {\rd[6]}, [\rs], \rt
        vld1.8          {\rd[7]}, [\rs], \rt
.endif
.endm

@ Sum the sixteen u8 values in \rl:\rh.  Widens to u16 (\dq = \dl:\dh),
@ folds the high half into the low, then two pairwise adds leave the
@ total replicated across every lane of \dl.
.macro  add16x8         dq,  dl,  dh,  rl,  rh
        vaddl.u8        \dq, \rl, \rh
        vadd.u16        \dl, \dl, \dh
        vpadd.u16       \dl, \dl, \dl
        vpadd.u16       \dl, \dl, \dl
.endm

@ 16x16 DC prediction with no neighbours available: fill with 128.
function ff_pred16x16_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred16x16_dc_end
endfunc

@ 16x16 DC from the top row only: dc = (sum of 16 top pixels + 8) >> 4.
function ff_pred16x16_top_dc_neon, export=1
        sub             r2,  r0,  r1            @ r2 = row above the block
        vld1.8          {q0},     [r2,:128]
        add16x8         q0,  d0,  d1,  d0,  d1  @ d0 lanes = sum of 16 bytes
        vrshrn.u16      d0,  q0,  #4            @ rounded >> 4
        vdup.8          q0,  d0[0]              @ broadcast dc to all 16 bytes
        b               .L_pred16x16_dc_end
endfunc

@ 16x16 DC from the left column only: dc = (sum of 16 left pixels + 8) >> 4.
function ff_pred16x16_left_dc_neon, export=1
        sub             r2,  r0,  #1            @ r2 = column left of the block
        ldcol.8         d0,  r2,  r1            @ rows 0-7  (r2 advances)
        ldcol.8         d1,  r2,  r1            @ rows 8-15
        add16x8         q0,  d0,  d1,  d0,  d1
        vrshrn.u16      d0,  q0,  #4
        vdup.8          q0,  d0[0]
        b               .L_pred16x16_dc_end
endfunc

@ 16x16 DC from both top row and left column:
@ dc = (sum of 32 neighbour pixels + 16) >> 5.
@ Falls through into the shared store loop used by all 16x16 DC modes.
function ff_pred16x16_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {q0},     [r2,:128]     @ q0 = 16 top pixels
        sub             r2,  r0,  #1
        ldcol.8         d2,  r2,  r1            @ d2 = left rows 0-7
        ldcol.8         d3,  r2,  r1            @ d3 = left rows 8-15
        vaddl.u8        q0,  d0,  d1            @ widen + add top halves
        vaddl.u8        q1,  d2,  d3            @ widen + add left halves
        vadd.u16        q0,  q0,  q1            @ combine top and left
        vadd.u16        d0,  d0,  d1            @ fold to 4 partial sums
        vpadd.u16       d0,  d0,  d0            @ ... to 2
        vpadd.u16       d0,  d0,  d0            @ ... total in every lane
        vrshrn.u16      d0,  q0,  #5            @ rounded >> 5
        vdup.8          q0,  d0[0]
.L_pred16x16_dc_end:
        mov             r3,  #8                 @ 8 iterations x 2 rows = 16 rows
6:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ 16x16 horizontal prediction: each row is filled with its left neighbour.
function ff_pred16x16_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #16
1:      vld1.8          {d0[],d1[]},[r2], r1    @ splat left pixel across q0
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 16x16 vertical prediction: every row is a copy of the row above the block.
function ff_pred16x16_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {q0},     [r0,:128], r1 @ load top row, r0 back to block
        mov             r3,  #8
1:      vst1.8          {q0},     [r0,:128], r1
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 16x16 plane prediction.  Computes the H.264-style gradients from the
@ top row and left column weighted by p16weight (1..8), derives the
@ per-pixel linear ramp, and emits rows with saturating (0..255) clipping
@ via vqshrun #5.  Presumably implements pred[x,y] =
@ clip1((a + b*(x-7) + c*(y-7) + 16) >> 5) per the H.264 spec — the lane
@ bookkeeping below is dense; verify against the C reference if modifying.
function ff_pred16x16_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 = top row
        add             r2,  r3,  #8            @ r2 = &top[8]
        sub             r3,  r3,  #1            @ r3 = top-left corner
        vld1.8          {d0},     [r3]          @ d0 = top[-1..6]
        vld1.8          {d2},     [r2,:64], r1  @ d2 = top[8..15]
        ldcol.8         d1,  r3,  r1            @ d1 = left col rows -1..6
        add             r3,  r3,  r1            @ skip row 7 (unused by gradient)
        ldcol.8         d3,  r3,  r1            @ d3 = left col rows 8..15
        vrev64.8        q0,  q0                 @ reverse so lanes pair p[7+i]/p[7-i]
        vaddl.u8        q8,  d2,  d3            @ keep top[8..]+left[8..] for the "a" term
        vsubl.u8        q2,  d2,  d0            @ horizontal differences
        vsubl.u8        q3,  d3,  d1            @ vertical differences
        movrel          r3,  p16weight
        vld1.8          {q0},     [r3,:128]     @ q0 = weights 1..8
        vmul.s16        q2,  q2,  q0            @ weighted H terms
        vmul.s16        q3,  q3,  q0            @ weighted V terms
        vadd.i16        d4,  d4,  d5            @ reduce H to 4 lanes
        vadd.i16        d5,  d6,  d7            @ reduce V to 4 lanes
        vpadd.i16       d4,  d4,  d5            @ interleaved partial sums
        vpadd.i16       d4,  d4,  d4            @ d4 = [H, V, H, V]
        vshll.s16       q3,  d4,  #2            @ 4*{H,V}
        vaddw.s16       q2,  q3,  d4            @ 5*{H,V}
        vrshrn.s32      d4,  q2,  #6            @ b = (5H+32)>>6, c = (5V+32)>>6
        mov             r3,  #0
        vtrn.16         d4,  d5                 @ split b and c into separate regs
        vadd.i16        d2,  d4,  d5            @ b + c
        vshl.i16        d3,  d2,  #3            @ 8*(b+c)
        vrev64.16       d16, d17                @ bring top[15]+left[15] term around
        vsub.i16        d3,  d3,  d2            @ 7*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4            @ a = 16*(top[15]+left[15])
        vsub.i16        d2,  d2,  d3            @ a - 7*(b+c): value at pixel (0,0)
        vshl.i16        d3,  d4,  #4            @ 16*b
        vext.16         q0,  q0,  q0,  #7       @ rotate weights for x ramp
        vsub.i16        d6,  d5,  d3            @ c - 16*b: row-to-row step (see loop)
        vmov.16         d0[0], r3               @ weight for x=0 is 0
        vmul.i16        q0,  q0,  d4[0]         @ q0 = b*x ramp, x = 0..7
        vdup.16         q1,  d2[0]              @ q1 = starting value, all lanes
        vdup.16         q2,  d4[0]              @ q2 = b
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3            @ q2 = 8*b: step between row halves
        vadd.i16        q1,  q1,  q0            @ q1 = first 8 pixels of row 0
        vadd.i16        q3,  q3,  q2            @ q3 = step from right half to next row
        mov             r3,  #16
1:
        vqshrun.s16     d0,  q1,  #5            @ left 8 pixels, clipped to 0..255
        vadd.i16        q1,  q1,  q2            @ advance by 8*b to right half
        vqshrun.s16     d1,  q1,  #5            @ right 8 pixels
        vadd.i16        q1,  q1,  q3            @ advance to next row
        vst1.8          {q0},     [r0,:128], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ Weights 1..8 used by both plane-prediction gradient computations.
const   p16weight, align=4
        .short          1,2,3,4,5,6,7,8
endconst

@ 8x8 horizontal prediction: each row filled with its left neighbour.
function ff_pred8x8_hor_neon, export=1
        sub             r2,  r0,  #1
        mov             r3,  #8
1:      vld1.8          {d0[]},   [r2], r1      @ splat left pixel across d0
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 8x8 vertical prediction: every row is a copy of the row above the block.
function ff_pred8x8_vert_neon, export=1
        sub             r0,  r0,  r1
        vld1.8          {d0},     [r0,:64], r1
        mov             r3,  #4
1:      vst1.8          {d0},     [r0,:64], r1
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 8x8 plane prediction.  Same scheme as the 16x16 version but with
@ half-width neighbour reads (4+4 bytes per edge) and scale factors
@ adjusted for the 8x8 geometry (17*H instead of 5*H, >>5, x ramp of 8).
function ff_pred8x8_plane_neon, export=1
        sub             r3,  r0,  r1            @ r3 = top row
        add             r2,  r3,  #4            @ r2 = &top[4]
        sub             r3,  r3,  #1            @ r3 = top-left corner
        vld1.32         {d0[0]},  [r3]          @ d0 lo = top[-1..2]
        vld1.32         {d2[0]},  [r2,:32], r1  @ d2 lo = top[4..7]
        ldcol.8         d0,  r3,  r1,  4,  hi=1 @ d0 hi = left rows -1..2
        add             r3,  r3,  r1            @ skip row 3
        ldcol.8         d3,  r3,  r1,  4        @ d3 lo = left rows 4..7
        vaddl.u8        q8,  d2,  d3            @ keep for the "a" term
        vrev32.8        d0,  d0                 @ reverse each 4-pixel group
        vtrn.32         d2,  d3                 @ gather the two "far" edges in d2
        vsubl.u8        q2,  d2,  d0            @ H and V differences
        movrel          r3,  p16weight
        vld1.16         {q0},     [r3,:128]     @ d0 = weights 1..4 used below
        vmul.s16        d4,  d4,  d0            @ weighted H terms
        vmul.s16        d5,  d5,  d0            @ weighted V terms
        vpadd.i16       d4,  d4,  d5
        vpaddl.s16      d4,  d4                 @ d4 = {H, V} as s32
        vshl.i32        d5,  d4,  #4            @ 16*{H,V}
        vadd.s32        d4,  d4,  d5            @ 17*{H,V}
        vrshrn.s32      d4,  q2,  #5            @ b = (17H+16)>>5, c = (17V+16)>>5
        mov             r3,  #0
        vtrn.16         d4,  d5
        vadd.i16        d2,  d4,  d5            @ b + c
        vshl.i16        d3,  d2,  #2            @ 4*(b+c)
        vrev64.16       d16, d16
        vsub.i16        d3,  d3,  d2            @ 3*(b+c)
        vadd.i16        d16, d16, d0
        vshl.i16        d2,  d16, #4            @ a = 16*(top[7]+left[7])
        vsub.i16        d2,  d2,  d3            @ value at pixel (0,0)
        vshl.i16        d3,  d4,  #3            @ 8*b
        vext.16         q0,  q0,  q0,  #7       @ rotate weights for x ramp
        vsub.i16        d6,  d5,  d3            @ c - 8*b: row-to-row step
        vmov.16         d0[0], r3               @ weight for x=0 is 0
        vmul.i16        q0,  q0,  d4[0]         @ q0 = b*x ramp, x = 0..7
        vdup.16         q1,  d2[0]
        vdup.16         q2,  d4[0]
        vdup.16         q3,  d6[0]
        vshl.i16        q2,  q2,  #3            @ 8*b
        vadd.i16        q1,  q1,  q0            @ q1 = row 0 accumulator
        vadd.i16        q3,  q3,  q2            @ q3 = per-row increment
        mov             r3,  #8
1:
        vqshrun.s16     d0,  q1,  #5            @ one row, clipped to 0..255
        vadd.i16        q1,  q1,  q3            @ advance to next row
        vst1.8          {d0},     [r0,:64], r1
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc

@ 8x8 DC with no neighbours: fill with 128.
function ff_pred8x8_128_dc_neon, export=1
        vmov.i8         q0,  #128
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC from the top row only.  Each 4x4 quadrant gets the DC of the
@ 4 top pixels above it; top and bottom halves are identical.
function ff_pred8x8_top_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]      @ 8 top pixels
        vpaddl.u8       d0,  d0                 @ 4 pair sums
        vpadd.u16       d0,  d0,  d0            @ sums of left-4 / right-4
        vrshrn.u16      d0,  q0,  #2            @ two per-half DCs, rounded >> 2
        vdup.8          d1,  d0[1]              @ d1 = right-half DC everywhere
        vdup.8          d0,  d0[0]              @ d0 = left-half DC everywhere
        vtrn.32         d0,  d1                 @ d0 = d1 = leftDC x4 | rightDC x4
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC from the left column only.  Rows 0-3 use the DC of the upper
@ 4 left pixels (d0), rows 4-7 the DC of the lower 4 (d1).
function ff_pred8x8_left_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1            @ 8 left pixels
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0            @ sums of top-4 / bottom-4
        vrshrn.u16      d0,  q0,  #2
        vdup.8          d1,  d0[1]              @ d1 = lower-half DC (rows 4-7)
        vdup.8          d0,  d0[0]              @ d0 = upper-half DC (rows 0-3)
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC from both edges, per 4x4 quadrant (H.264 chroma style):
@ top-left uses (top-left4 + left-top4) >> 3, top-right uses top-right4,
@ bottom-left uses left-bottom4, bottom-right uses
@ (top-right4 + left-bottom4) >> 3.  Falls through into the shared
@ store loop used by all 8x8 DC modes.
function ff_pred8x8_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]      @ d0 = 8 top pixels
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1            @ d1 = 8 left pixels
        vtrn.32         d0,  d1                 @ group the four 4-pixel edges
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1            @ d0 = four 4-pixel sums
        vpadd.u16       d1,  d0,  d0            @ d1 = combined 8-pixel sums
        vrshrn.u16      d2,  q0,  #3            @ 8-pixel DCs (lanes 4,5 of d2)
        vrshrn.u16      d3,  q0,  #2            @ 4-pixel DCs (lanes 0-3 of d3)
        vdup.8          d0,  d2[4]              @ top-left quadrant DC
        vdup.8          d1,  d3[3]              @ bottom-left quadrant DC
        vdup.8          d4,  d3[2]              @ top-right quadrant DC
        vdup.8          d5,  d2[5]              @ bottom-right quadrant DC
        vtrn.32         q0,  q2                 @ d0 = top row pair, d1 = bottom pair
.L_pred8x8_dc_end:
        mov             r3,  #4
        add             r2,  r0,  r1,  lsl #2   @ r2 = start of rows 4-7
6:      vst1.8          {d0},     [r0,:64], r1  @ rows 0-3 from d0
        vst1.8          {d1},     [r2,:64], r1  @ rows 4-7 from d1
        subs            r3,  r3,  #1
        bne             6b
        bx              lr
endfunc

@ 8x8 DC variant: top edge and upper-left 4 left pixels available
@ (naming: l0t = left upper half + top).  Quadrant DCs built from the
@ available edges; lower quadrants reuse the top-edge averages.
function ff_pred8x8_l0t_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]      @ 8 top pixels
        sub             r2,  r0,  #1
        ldcol.8         d1,  r2,  r1,  4        @ upper 4 left pixels only
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d2,  q0,  #3            @ 8-pixel DCs
        vrshrn.u16      d3,  q0,  #2            @ 4-pixel DCs
        vdup.8          d0,  d2[4]              @ top-left: (top-left4 + left4) >> 3
        vdup.8          d1,  d3[0]              @ bottom-left: top-left4 DC
        vdup.8          q2,  d3[2]              @ right quadrants: top-right4 DC
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC variant: only the upper 4 left pixels available (l00).
@ Rows 0-3 get their DC; rows 4-7 fall back to 128.
function ff_pred8x8_l00_dc_neon, export=1
        sub             r2,  r0,  #1
        ldcol.8         d0,  r2,  r1,  4        @ upper 4 left pixels
        vpaddl.u8       d0,  d0
        vpadd.u16       d0,  d0,  d0
        vrshrn.u16      d0,  q0,  #2            @ DC of the 4 pixels
        vmov.i8         d1,  #128               @ bottom half: no data -> 128
        vdup.8          d0,  d0[0]
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC variant: top edge and lower-left 4 left pixels available (0lt).
@ Note the left column is read starting at row 4 (r0 + 4*stride - 1).
function ff_pred8x8_0lt_dc_neon, export=1
        sub             r2,  r0,  r1
        vld1.8          {d0},     [r2,:64]      @ 8 top pixels
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1            @ left column, rows 4-7
        ldcol.8         d1,  r2,  r1,  4,  hi=1 @ into lanes 4-7 of d1
        vtrn.32         d0,  d1
        vpaddl.u8       q0,  q0
        vpadd.u16       d0,  d0,  d1
        vpadd.u16       d1,  d0,  d0
        vrshrn.u16      d3,  q0,  #2            @ 4-pixel DCs
        vrshrn.u16      d2,  q0,  #3            @ 8-pixel DCs
        vdup.8          d0,  d3[0]              @ top-left: top-left4 DC
        vdup.8          d1,  d3[3]              @ bottom-left: left-bottom4 DC
        vdup.8          d4,  d3[2]              @ top-right: top-right4 DC
        vdup.8          d5,  d2[5]              @ bottom-right: (tr4 + lb4) >> 3
        vtrn.32         q0,  q2
        b               .L_pred8x8_dc_end
endfunc

@ 8x8 DC variant: only the lower 4 left pixels available (0l0).
@ Rows 0-3 fall back to 128; rows 4-7 get the DC of those 4 pixels.
function ff_pred8x8_0l0_dc_neon, export=1
        add             r2,  r0,  r1,  lsl #2
        sub             r2,  r2,  #1            @ left column, rows 4-7
        ldcol.8         d1,  r2,  r1,  4
        vpaddl.u8       d2,  d1
        vpadd.u16       d2,  d2,  d2
        vrshrn.u16      d1,  q1,  #2            @ DC of the 4 pixels
        vmov.i8         d0,  #128               @ top half: no data -> 128
        vdup.8          d1,  d1[0]
        b               .L_pred8x8_dc_end
endfunc