Mercurial > libavcodec.hg
annotate arm/simple_idct_neon.S @ 9976:e52cd349e708 libavcodec
Only compile in NEON optimizations for H.264 when the H.264 decoder is enabled.
| author | diego |
|---|---|
| date | Wed, 22 Jul 2009 22:33:33 +0000 |
| parents | c65cfd4ad000 |
| children | be725249ea67 |
| rev | line source |
|---|---|
| 8335 | 1 /* |
| 2 * ARM NEON IDCT | |
| 3 * | |
| 4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
| 5 * | |
| 6 * Based on Simple IDCT | |
| 7 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
| 8 * | |
| 9 * This file is part of FFmpeg. | |
| 10 * | |
| 11 * FFmpeg is free software; you can redistribute it and/or | |
| 12 * modify it under the terms of the GNU Lesser General Public | |
| 13 * License as published by the Free Software Foundation; either | |
| 14 * version 2.1 of the License, or (at your option) any later version. | |
| 15 * | |
| 16 * FFmpeg is distributed in the hope that it will be useful, | |
| 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 19 * Lesser General Public License for more details. | |
| 20 * | |
| 21 * You should have received a copy of the GNU Lesser General Public | |
| 22 * License along with FFmpeg; if not, write to the Free Software | |
| 23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 24 */ | |
| 25 | |
| 26 #include "asm.S" | |
| 27 | |
| 28 #define W1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 29 #define W2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 30 #define W3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 31 #define W4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 -- NOTE(review): the formula yields 16384 for i=4; 16383 looks deliberate, confirm against the C simple IDCT | |
| 32 #define W5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 33 #define W6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 34 #define W7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 35 #define W4c ((1<<(COL_SHIFT-1))/W4) /* column rounding bias pre-divided by W4; idct_col4_neon adds it to col[0] before multiplying by W4 */ | |
| 36 #define ROW_SHIFT 11 /* right shift applied when idct_row4_neon narrows its results */ | |
| 37 #define COL_SHIFT 20 /* total column-pass shift: 16 bits via vaddhn/vsubhn, the remaining COL_SHIFT-16 in the store helpers */ | |
| 38 | |
| 39 #define w1 d0[0] /* w1..w4c name the 16-bit lanes of d0/d1, which idct_start loads from idct_coeff_neon */ | |
| 40 #define w2 d0[1] | |
| 41 #define w3 d0[2] | |
| 42 #define w4 d0[3] | |
| 43 #define w5 d1[0] | |
| 44 #define w6 d1[1] | |
| 45 #define w7 d1[2] | |
| 46 #define w4c d1[3] | |
| 47 | |
| 48 .fpu neon | |
| 49 | |
| 50 .macro idct_col4_top /* in: q15 = W4*col[0] + rounding, d4 = col[1], d6 = col[2], d8 = col[3]; out: q11-q14 even part, q9/q10/q5/q6 odd part; clobbers q7, q8 */ | |
| 51 vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ | |
| 52 vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ | |
| 53 vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ | |
| 54 vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */ | |
| 55 vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ | |
| 56 vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */ | |
| 57 vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ | |
| 58 vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */ | |
| 59 vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ | |
| 60 vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */ | |
| 61 | |
| 62 vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ | |
| 63 vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ | |
| 64 vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ | |
| 65 vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ | |
| 66 .endm | |
| 67 | |
| 68 .text | |
| 69 .align 6 | |
| 70 | |
| 9724 | 71 function idct_row4_pld_neon /* prefetch r0 + k*r1 for k = 0..7 (r0 = dst, r1 = line_size in the public callers); clobbers r3 */ |
| 72 pld [r0] /* line 0 */ | |
| 73 add r3, r0, r1, lsl #2 /* r3 = r0 + 4*r1 */ | |
| 74 pld [r0, r1] /* line 1 */ | |
| 75 pld [r0, r1, lsl #1] /* line 2 */ | |
| 76 pld [r3, -r1] /* line 3 */ | |
| 77 pld [r3] /* line 4 */ | |
| 78 pld [r3, r1] /* line 5 */ | |
| 79 add r3, r3, r1, lsl #1 /* r3 = r0 + 6*r1 */ | |
| 80 pld [r3] /* line 6 */ | |
| 81 pld [r3, r1] /* line 7 */ | |
| 82 .endfunc /* no return instruction: execution continues into the code that follows (idct_row4_neon), which returns via bx lr -- assumes .endfunc emits no instructions, TODO confirm in asm.S */ | |
| 83 | |
| 8335 | 84 function idct_row4_neon /* row pass over the 64 bytes at r2, in place; r2 ends 64 bytes further on; needs d0/d1 = coefficients; clobbers r3, r4, q1-q15 */ |
| 85 vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding constant in every 32-bit lane */ | |
| 86 vld1.64 {d2-d5}, [r2,:128]! /* load 32 bytes, r2 += 32 */ | |
| 87 vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ | |
| 88 vld1.64 {d6,d7}, [r2,:128]! /* load 16 bytes, r2 += 16 */ | |
| 89 vorr d10, d3, d5 /* start OR-ing together inputs 4..7 (d3, d5, d7, d9) */ | |
| 90 vld1.64 {d8,d9}, [r2,:128]! /* load 16 bytes, r2 += 16 */ | |
| 91 add r2, r2, #-64 /* rewind r2 to the start of the 64 bytes just loaded */ | |
| 92 | |
| 93 vorr d11, d7, d9 | |
| 94 vorr d10, d10, d11 /* d10 = d3 | d5 | d7 | d9 */ | |
| 95 vmov r3, r4, d10 /* move the 64-bit OR to r3:r4 so it can be tested with ARM flags */ | |
| 96 | |
| 97 idct_col4_top /* terms from inputs 0..3 (d2 via q15, d4, d6, d8) */ | |
| 98 | |
| 99 orrs r3, r3, r4 /* Z set when inputs 4..7 are all zero */ | |
| 100 beq 1f /* then skip their contribution entirely */ | |
| 101 | |
| 102 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
| 103 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
| 104 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
| 105 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
| 106 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
| 107 vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */ | |
| 108 vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */ | |
| 109 vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */ | |
| 110 vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */ | |
| 111 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
| 112 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
| 113 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
| 114 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
| 115 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
| 116 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
| 117 vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */ | |
| 118 vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */ | |
| 119 vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */ | |
| 120 vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */ | |
| 121 | |
| 122 1: vadd.i32 q3, q11, q9 /* final butterflies: even part +/- odd part */ | |
| 123 vadd.i32 q4, q12, q10 | |
| 124 vshrn.i32 d2, q3, #ROW_SHIFT /* d2 = (q11 + q9) >> ROW_SHIFT, narrowed to 16 bits */ | |
| 125 vshrn.i32 d4, q4, #ROW_SHIFT /* d4 = (q12 + q10) >> ROW_SHIFT */ | |
| 126 vadd.i32 q7, q13, q5 | |
| 127 vadd.i32 q8, q14, q6 | |
| 128 vtrn.16 d2, d4 /* vtrn.16 + vtrn.32 pairs transpose the 4x4 block d2, d4, d6, d8 */ | |
| 129 vshrn.i32 d6, q7, #ROW_SHIFT /* d6 = (q13 + q5) >> ROW_SHIFT */ | |
| 130 vshrn.i32 d8, q8, #ROW_SHIFT /* d8 = (q14 + q6) >> ROW_SHIFT */ | |
| 131 vsub.i32 q14, q14, q6 | |
| 132 vsub.i32 q11, q11, q9 | |
| 133 vtrn.16 d6, d8 | |
| 134 vsub.i32 q13, q13, q5 | |
| 135 vshrn.i32 d3, q14, #ROW_SHIFT /* d3 = (q14 - q6) >> ROW_SHIFT */ | |
| 136 vtrn.32 d2, d6 | |
| 137 vsub.i32 q12, q12, q10 | |
| 138 vtrn.32 d4, d8 | |
| 139 vshrn.i32 d5, q13, #ROW_SHIFT /* d5 = (q13 - q5) >> ROW_SHIFT */ | |
| 140 vshrn.i32 d7, q12, #ROW_SHIFT /* d7 = (q12 - q10) >> ROW_SHIFT */ | |
| 141 vshrn.i32 d9, q11, #ROW_SHIFT /* d9 = (q11 - q9) >> ROW_SHIFT */ | |
| 142 | |
| 143 vtrn.16 d3, d5 /* transpose the second 4x4 block d3, d5, d7, d9 */ | |
| 144 vtrn.16 d7, d9 | |
| 145 vtrn.32 d3, d7 | |
| 146 vtrn.32 d5, d9 | |
| 147 | |
| 148 vst1.64 {d2-d5}, [r2,:128]! /* write the results back over the input, r2 += 32 */ | |
| 149 vst1.64 {d6-d9}, [r2,:128]! /* r2 += 32: r2 now addresses the next 64 bytes */ | |
| 150 | |
| 151 bx lr | |
| 152 .endfunc | |
| 153 | |
| 154 function idct_col4_neon /* column pass for four 16-bit columns starting at r2 (16-byte stride); out: d2..d9 = outputs 0..7, still needing >> (COL_SHIFT-16); r2 ends 128 bytes further on; clobbers r4-r7, ip */ | |
| 155 mov ip, #16 /* ip = byte stride between consecutive inputs */ | |
| 156 vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ | |
| 157 vdup.16 d30, w4c /* d30 = W4c in all four lanes */ | |
| 158 vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ | |
| 159 vadd.i16 d30, d30, d2 /* d30 = col[0] + W4c */ | |
| 160 vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ | |
| 161 vmull.s16 q15, d30, w4 /* q15 = W4*(col[0]+(1<<COL_SHIFT-1)/W4)*/ | |
| 162 vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ | |
| 163 | |
| 164 ldrd r4, [r2] /* r4:r5 = the 8 bytes of col[4] (r2 now points at it) */ | |
| 165 ldrd r6, [r2, #16] /* r6:r7 = the 8 bytes of col[5] */ | |
| 166 orrs r4, r4, r5 /* Z set when col[4] is all zero; flags are consumed after the macro (NEON ops leave them alone) */ | |
| 167 | |
| 168 idct_col4_top | |
| 169 addeq r2, r2, #16 /* col[4] == 0: step over it without loading */ | |
| 170 beq 1f | |
| 171 | |
| 172 vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ | |
| 173 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
| 174 vadd.i32 q11, q11, q7 | |
| 175 vsub.i32 q12, q12, q7 | |
| 176 vsub.i32 q13, q13, q7 | |
| 177 vadd.i32 q14, q14, q7 | |
| 178 | |
| 179 1: orrs r6, r6, r7 /* Z set when col[5] is all zero */ | |
| 180 ldrd r4, [r2, #16] /* r4:r5 = col[6] (r2 points at col[5]); ldrd does not change the flags */ | |
| 181 addeq r2, r2, #16 /* col[5] == 0: step over it */ | |
| 182 beq 2f | |
| 183 | |
| 184 vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ | |
| 185 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
| 186 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
| 187 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
| 188 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
| 189 | |
| 190 2: orrs r4, r4, r5 /* Z set when col[6] is all zero */ | |
| 191 ldrd r4, [r2, #16] /* r4:r5 = col[7] (r2 points at col[6]) */ | |
| 192 addeq r2, r2, #16 /* col[6] == 0: step over it */ | |
| 193 beq 3f | |
| 194 | |
| 195 vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ | |
| 196 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
| 197 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
| 198 vadd.i32 q11, q11, q7 | |
| 199 vsub.i32 q14, q14, q7 | |
| 200 vsub.i32 q12, q12, q8 | |
| 201 vadd.i32 q13, q13, q8 | |
| 202 | |
| 203 3: orrs r4, r4, r5 /* Z set when col[7] is all zero */ | |
| 204 addeq r2, r2, #16 /* col[7] == 0: step over it */ | |
| 205 beq 4f | |
| 206 | |
| 207 vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ | |
| 208 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
| 209 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
| 210 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
| 211 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
| 212 | |
| 213 4: vaddhn.i32 d2, q11, q9 /* output 0 = high 16 bits of (q11 + q9), i.e. >> 16 */ | |
| 214 vaddhn.i32 d3, q12, q10 /* output 1 */ | |
| 215 vaddhn.i32 d4, q13, q5 /* output 2 */ | |
| 216 vaddhn.i32 d5, q14, q6 /* output 3 */ | |
| 217 vsubhn.i32 d9, q11, q9 /* output 7 = high 16 bits of (q11 - q9) */ | |
| 218 vsubhn.i32 d8, q12, q10 /* output 6 */ | |
| 219 vsubhn.i32 d7, q13, q5 /* output 5 */ | |
| 220 vsubhn.i32 d6, q14, q6 /* output 4 */ | |
| 221 | |
| 222 bx lr | |
| 223 .endfunc | |
| 224 | |
| 225 .align 6 | |
| 226 | |
| 227 function idct_col4_st8_neon /* in: q1-q4 (d2..d9) = eight outputs for 4 columns, r0 = dst, r1 = line stride; stores 4 bytes on each of 8 lines; r0 ends 8*r1 further on */ | |
| 228 vqshrun.s16 d2, q1, #COL_SHIFT-16 /* finish the column shift and saturate to unsigned 8 bit: d2 = lines 0,1 */ | |
| 229 vqshrun.s16 d3, q2, #COL_SHIFT-16 /* d3 = lines 2,3 */ | |
| 230 vqshrun.s16 d4, q3, #COL_SHIFT-16 /* d4 = lines 4,5 */ | |
| 231 vqshrun.s16 d5, q4, #COL_SHIFT-16 /* d5 = lines 6,7 */ | |
| 232 vst1.32 {d2[0]}, [r0,:32], r1 /* each store writes 4 bytes, then r0 += r1 */ | |
| 233 vst1.32 {d2[1]}, [r0,:32], r1 | |
| 234 vst1.32 {d3[0]}, [r0,:32], r1 | |
| 235 vst1.32 {d3[1]}, [r0,:32], r1 | |
| 236 vst1.32 {d4[0]}, [r0,:32], r1 | |
| 237 vst1.32 {d4[1]}, [r0,:32], r1 | |
| 238 vst1.32 {d5[0]}, [r0,:32], r1 | |
| 239 vst1.32 {d5[1]}, [r0,:32], r1 | |
| 240 | |
| 241 bx lr | |
| 242 .endfunc | |
| 243 | |
| 244 .section .rodata /* read-only coefficient table, loaded into d0/d1 by idct_start */ | |
| 245 .align 4 /* on ARM, GAS .align takes a power of two: 16-byte alignment, as the :128 hint on the vld1 in idct_start requires */ | |
| 8506 | 246 idct_coeff_neon: |
| 247 .short W1, W2, W3, W4, W5, W6, W7, W4c /* same order as the w1..w4c lane defines (d0[0]..d1[3]) */ | |
| 8335 | 248 .previous |
| 249 | |
| 250 .macro idct_start data /* common prologue; \data = register holding the coefficient block pointer; clobbers r3 */ | |
| 251 push {r4-r7, lr} /* r4-r7 are used by the row/column helpers; lr because the body uses bl */ | |
| 252 pld [\data] /* prefetch the first 64 bytes of the block */ | |
| 253 pld [\data, #64] /* and the 64 bytes after them */ | |
| 254 vpush {d8-d15} /* save d8-d15 (q4-q7), which the helpers overwrite; callee-saved under the AAPCS */ | |

8507
779a9c93bf61
ARM: work around linker bug with movw/movt relocations in shared libs
mru
parents:
8506
diff
changeset
|
255 movrel r3, idct_coeff_neon /* r3 = address of the table; movrel is defined outside this file (presumably asm.S) */ |
| 8335 | 256 vld1.64 {d0,d1}, [r3,:128] /* d0/d1 = W1..W7, W4c, addressed through the w1..w4c defines */ |
| 257 .endm | |
| 258 | |
| 259 .macro idct_end /* common epilogue: undoes idct_start and returns */ | |
| 260 vpop {d8-d15} /* restore the NEON registers saved by idct_start */ | |
| 261 pop {r4-r7, pc} /* restore r4-r7 and return by popping the saved lr into pc */ | |
| 262 .endm | |
| 263 | |
| 264 /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
| 265 function ff_simple_idct_put_neon, export=1 /* IDCT of data (r2), result saturated to 8 bit and stored to dst (r0) with stride line_size (r1) */ | |
| 266 idct_start r2 | |
| 267 | |
| 9724 | 268 bl idct_row4_pld_neon /* prefetch the 8 dst lines; falls through into idct_row4_neon = first row pass (bytes 0..63) */ |
| 8335 | 269 bl idct_row4_neon /* second row pass (bytes 64..127) */ |
| 270 add r2, r2, #-128 /* each row pass advanced r2 by 64: rewind to the start of data */ | |
| 271 bl idct_col4_neon /* column pass, first 4 columns; advances r2 by 128 */ | |
| 272 bl idct_col4_st8_neon /* store them; advances r0 by 8 lines */ | |
| 273 sub r0, r0, r1, lsl #3 /* r0 back to line 0 ... */ | |
| 274 add r0, r0, #4 /* ... 4 pixels to the right */ | |
| 275 add r2, r2, #-120 /* -128 + 8: start of data plus 4 coefficients */ | |
| 276 bl idct_col4_neon /* column pass, last 4 columns */ | |
| 277 bl idct_col4_st8_neon | |
| 278 | |
| 279 idct_end | |
| 280 .endfunc | |
| 281 | |
| 282 .align 6 | |
| 283 | |
| 284 function idct_col4_add8_neon /* in: q1-q4 = eight outputs for 4 columns, r0 = dst, r1 = line stride; adds them to 4 bytes on each of 8 lines with unsigned saturation; r0 ends 8*r1 further on; clobbers ip, d10-d13 */ | |
| 285 mov ip, r0 /* ip = store pointer; r0 runs ahead as the load pointer */ | |
| 286 | |
| 287 vld1.32 {d10[0]}, [r0,:32], r1 /* existing pixels, line 0 */ | |
| 288 vshr.s16 q1, q1, #COL_SHIFT-16 /* finish the column shift for lines 0,1 */ | |
| 289 vld1.32 {d10[1]}, [r0,:32], r1 /* line 1 */ | |
| 290 vshr.s16 q2, q2, #COL_SHIFT-16 /* lines 2,3 */ | |
| 291 vld1.32 {d11[0]}, [r0,:32], r1 /* line 2 */ | |
| 292 vshr.s16 q3, q3, #COL_SHIFT-16 /* lines 4,5 */ | |
| 293 vld1.32 {d11[1]}, [r0,:32], r1 /* line 3 */ | |
| 294 vshr.s16 q4, q4, #COL_SHIFT-16 /* lines 6,7 */ | |
| 295 vld1.32 {d12[0]}, [r0,:32], r1 /* line 4 */ | |
| 296 vaddw.u8 q1, q1, d10 /* widen the pixels to 16 bit and add the residual */ | |
| 297 vld1.32 {d12[1]}, [r0,:32], r1 /* line 5 */ | |
| 298 vaddw.u8 q2, q2, d11 | |
| 299 vld1.32 {d13[0]}, [r0,:32], r1 /* line 6 */ | |
| 300 vqmovun.s16 d2, q1 /* saturate the sums to unsigned 8 bit */ | |
| 301 vld1.32 {d13[1]}, [r0,:32], r1 /* line 7 */ | |
| 302 vaddw.u8 q3, q3, d12 | |
| 303 vst1.32 {d2[0]}, [ip,:32], r1 /* write back line 0 */ | |
| 304 vqmovun.s16 d3, q2 | |
| 305 vst1.32 {d2[1]}, [ip,:32], r1 | |
| 306 vaddw.u8 q4, q4, d13 | |
| 307 vst1.32 {d3[0]}, [ip,:32], r1 | |
| 308 vqmovun.s16 d4, q3 | |
| 309 vst1.32 {d3[1]}, [ip,:32], r1 | |
| 310 vqmovun.s16 d5, q4 | |
| 311 vst1.32 {d4[0]}, [ip,:32], r1 | |
| 312 vst1.32 {d4[1]}, [ip,:32], r1 | |
| 313 vst1.32 {d5[0]}, [ip,:32], r1 | |
| 314 vst1.32 {d5[1]}, [ip,:32], r1 | |
| 315 | |
| 316 bx lr | |
| 317 .endfunc | |
| 318 | |
| 319 /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); */ | |
| 320 function ff_simple_idct_add_neon, export=1 /* IDCT of data (r2), result added to the pixels at dst (r0) with stride line_size (r1), saturated to 8 bit */ | |
| 321 idct_start r2 | |
| 322 | |
| 9724 | 323 bl idct_row4_pld_neon /* prefetch the 8 dst lines; falls through into idct_row4_neon = first row pass (bytes 0..63) */ |
| 8335 | 324 bl idct_row4_neon /* second row pass (bytes 64..127) */ |
| 325 add r2, r2, #-128 /* each row pass advanced r2 by 64: rewind to the start of data */ | |
| 326 bl idct_col4_neon /* column pass, first 4 columns; advances r2 by 128 */ | |
| 327 bl idct_col4_add8_neon /* add to dst; advances r0 by 8 lines */ | |
| 328 sub r0, r0, r1, lsl #3 /* r0 back to line 0 ... */ | |
| 329 add r0, r0, #4 /* ... 4 pixels to the right */ | |
| 330 add r2, r2, #-120 /* -128 + 8: start of data plus 4 coefficients */ | |
| 331 bl idct_col4_neon /* column pass, last 4 columns */ | |
| 332 bl idct_col4_add8_neon | |
| 333 | |
| 334 idct_end | |
| 335 .endfunc | |
| 336 | |
| 337 .align 6 | |
| 338 | |
| 339 function idct_col4_st16_neon /* in: q1-q4 (d2..d9) = eight outputs for 4 columns, r2 = destination; stores them as 16-bit values, 8 bytes every 16 bytes; r2 ends 128 bytes further on; clobbers ip */ | |
| 340 mov ip, #16 /* ip = byte stride between consecutive stores */ | |
| 341 | |
| 342 vshr.s16 q1, q1, #COL_SHIFT-16 /* finish the column shift (arithmetic, no saturation) */ | |
| 343 vshr.s16 q2, q2, #COL_SHIFT-16 | |
| 344 vst1.64 {d2}, [r2,:64], ip /* output 0 */ | |
| 345 vshr.s16 q3, q3, #COL_SHIFT-16 | |
| 346 vst1.64 {d3}, [r2,:64], ip /* output 1 */ | |
| 347 vshr.s16 q4, q4, #COL_SHIFT-16 | |
| 348 vst1.64 {d4}, [r2,:64], ip /* output 2 */ | |
| 349 vst1.64 {d5}, [r2,:64], ip /* output 3 */ | |
| 350 vst1.64 {d6}, [r2,:64], ip /* output 4 */ | |
| 351 vst1.64 {d7}, [r2,:64], ip /* output 5 */ | |
| 352 vst1.64 {d8}, [r2,:64], ip /* output 6 */ | |
| 353 vst1.64 {d9}, [r2,:64], ip /* output 7 */ | |
| 354 | |
| 355 bx lr | |
| 356 .endfunc | |
| 357 | |
| 358 /* void ff_simple_idct_neon(DCTELEM *data); */ | |
| 359 function ff_simple_idct_neon, export=1 /* in-place IDCT of the 128-byte block at data (r0) */ | |
| 360 idct_start r0 | |
| 361 | |
| 362 mov r2, r0 /* the helpers take the block pointer in r2 */ | |
| 363 bl idct_row4_neon /* first row pass (bytes 0..63) */ | |
| 364 bl idct_row4_neon /* second row pass (bytes 64..127) */ | |
| 365 add r2, r2, #-128 /* rewind to the start of data */ | |
| 366 bl idct_col4_neon /* column pass, first 4 columns; advances r2 by 128 */ | |
| 367 add r2, r2, #-128 /* rewind so the results overwrite the same columns */ | |
| 368 bl idct_col4_st16_neon /* store as 16 bit; advances r2 by 128 */ | |
| 369 add r2, r2, #-120 /* -128 + 8: start of data plus 4 coefficients */ | |
| 370 bl idct_col4_neon /* column pass, last 4 columns */ | |
| 371 add r2, r2, #-128 | |
| 372 bl idct_col4_st16_neon | |
| 373 | |
| 374 idct_end | |
| 375 .endfunc | |
