Mercurial > libavcodec.hg
annotate arm/simple_idct_neon.S @ 12530:63edd10ad4bc libavcodec tip
Try to fix crashes introduced by r25218
r25218 made assumptions about the existence of past reference frames that
weren't necessarily true.
| author | darkshikari |
|---|---|
| date | Tue, 28 Sep 2010 09:06:22 +0000 |
| parents | 17a110bfdeb6 |
| children |
| rev | line source |
|---|---|
| 8335 | 1 /* |
| 2 * ARM NEON IDCT | |
| 3 * | |
| 4 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com> | |
| 5 * | |
| 6 * Based on Simple IDCT | |
| 7 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at> | |
| 8 * | |
| 9 * This file is part of FFmpeg. | |
| 10 * | |
| 11 * FFmpeg is free software; you can redistribute it and/or | |
| 12 * modify it under the terms of the GNU Lesser General Public | |
| 13 * License as published by the Free Software Foundation; either | |
| 14 * version 2.1 of the License, or (at your option) any later version. | |
| 15 * | |
| 16 * FFmpeg is distributed in the hope that it will be useful, | |
| 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 19 * Lesser General Public License for more details. | |
| 20 * | |
| 21 * You should have received a copy of the GNU Lesser General Public | |
| 22 * License along with FFmpeg; if not, write to the Free Software | |
| 23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 24 */ | |
| 25 | |
| 26 #include "asm.S" | |
| 27 | |
| 28 #define W1 22725 // cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 29 #define W2 21407 // cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 30 #define W3 19266 // cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 31 #define W4 16383 // cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 would give 16384; the code uses 16383 (presumably deliberate, confirm against the C simple IDCT) | |
| 32 #define W5 12873 // cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 33 #define W6 8867 // cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 34 #define W7 4520 // cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 | |
| 35 #define W4c ((1<<(COL_SHIFT-1))/W4) /* column rounding term 1<<(COL_SHIFT-1) pre-divided by W4; idct_col4_neon adds it to col[0] before multiplying by W4 */ | |
| 36 #define ROW_SHIFT 11 /* right shift applied by vshrn in idct_row4_neon */ | |
| 37 #define COL_SHIFT 20 /* total column shift: 16 bits via vaddhn/vsubhn in idct_col4_neon, the remaining COL_SHIFT-16 in the store helpers */ | |
| 38 | |
| 39 #define w1 d0[0] /* w1..w7, w4c: 16-bit lanes of d0/d1, loaded from idct_coeff_neon by idct_start */ | |
| 40 #define w2 d0[1] | |
| 41 #define w3 d0[2] | |
| 42 #define w4 d0[3] | |
| 43 #define w5 d1[0] | |
| 44 #define w6 d1[1] | |
| 45 #define w7 d1[2] | |
| 46 #define w4c d1[3] /* lane order matches the .short list in idct_coeff_neon */ | |
| 47 | |
| 48 .macro idct_col4_top /* contribution of inputs 1-3: expects q15 = W4*col[0] + rounding, d4/d6/d8 = col[1]/col[2]/col[3]; writes even terms q11-q14 and odd terms q9, q10, q5, q6; clobbers q7, q8 */ | |
| 49 vmull.s16 q7, d6, w2 /* q7 = W2 * col[2] */ | |
| 50 vmull.s16 q8, d6, w6 /* q8 = W6 * col[2] */ | |
| 51 vmull.s16 q9, d4, w1 /* q9 = W1 * col[1] */ | |
| 52 vadd.i32 q11, q15, q7 /* q11 = q15 + W2 * col[2] */ | |
| 53 vmull.s16 q10, d4, w3 /* q10 = W3 * col[1] */ | |
| 54 vadd.i32 q12, q15, q8 /* q12 = q15 + W6 * col[2] */ | |
| 55 vmull.s16 q5, d4, w5 /* q5 = W5 * col[1] */ | |
| 56 vsub.i32 q13, q15, q8 /* q13 = q15 - W6 * col[2] */ | |
| 57 vmull.s16 q6, d4, w7 /* q6 = W7 * col[1] */ | |
| 58 vsub.i32 q14, q15, q7 /* q14 = q15 - W2 * col[2] */ | |
| 59 | |
| 60 vmlal.s16 q9, d8, w3 /* q9 += W3 * col[3] */ | |
| 61 vmlsl.s16 q10, d8, w7 /* q10 -= W7 * col[3] */ | |
| 62 vmlsl.s16 q5, d8, w1 /* q5 -= W1 * col[3] */ | |
| 63 vmlsl.s16 q6, d8, w5 /* q6 -= W5 * col[3] */ | |
| 64 .endm | |
| 65 | |
| 66 .text | |
| 67 .align 6 /* on ARM gas the operand is a power of two, i.e. 64-byte alignment */ | |
| 68 | |
| 9724 | 69 function idct_row4_pld_neon /* issues prefetches for 8 lines starting at r0 with stride r1; clobbers r3 */ |
| 70 pld [r0] /* line 0 */ | |
| 71 add r3, r0, r1, lsl #2 /* r3 = r0 + 4*r1 */ | |
| 72 pld [r0, r1] /* line 1 */ | |
| 73 pld [r0, r1, lsl #1] /* line 2 */ | |
| 74 pld [r3, -r1] /* line 3 */ | |
| 75 pld [r3] /* line 4 */ | |
| 76 pld [r3, r1] /* line 5 */ | |
| 77 add r3, r3, r1, lsl #1 /* r3 = r0 + 6*r1 */ | |
| 78 pld [r3] /* line 6 */ | |
| 79 pld [r3, r1] /* line 7 */ | |
| 11443 | 80 endfunc /* NOTE(review): no return instruction above; execution presumably falls through into idct_row4_neon, which follows directly - confirm the function macro emits no padding or section switch */ |
| 9724 | 81 |
| 8335 | 82 function idct_row4_neon /* in-place pass over the 64 bytes (four 16-byte rows) at r2; expects d0/d1 = coefficient lanes; returns with r2 advanced by 64; clobbers r3, r4, flags, q1-q15 */ |
| 83 vmov.i32 q15, #(1<<(ROW_SHIFT-1)) /* rounding term for the final >> ROW_SHIFT */ | |
| 84 vld1.64 {d2-d5}, [r2,:128]! /* first 32 bytes: d2/d3 and d4/d5 */ | |
| 85 vmlal.s16 q15, d2, w4 /* q15 += W4 * col[0] */ | |
| 86 vld1.64 {d6,d7}, [r2,:128]! /* next 16 bytes */ | |
| 87 vorr d10, d3, d5 /* start OR-ing the upper halves (d3, d5, d7, d9) for the all-zero test */ | |
| 88 vld1.64 {d8,d9}, [r2,:128]! /* last 16 bytes */ | |
| 89 add r2, r2, #-64 /* rewind r2 to the start of the 64 bytes just loaded */ | |
| 90 | |
| 91 vorr d11, d7, d9 | |
| 92 vorr d10, d10, d11 /* d10 = d3 OR d5 OR d7 OR d9 */ | |
| 93 vmov r3, r4, d10 /* move the 64-bit OR result to r3/r4 so it can be tested with orrs */ | |
| 94 | |
| 95 idct_col4_top | |
| 96 | |
| 97 orrs r3, r3, r4 /* Z set when d3, d5, d7 and d9 are all zero */ | |
| 98 beq 1f /* skip the col[4]..col[7] terms in that case */ | |
| 99 | |
| 100 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
| 101 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
| 102 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
| 103 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
| 104 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
| 105 vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */ | |
| 106 vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */ | |
| 107 vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */ | |
| 108 vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */ | |
| 109 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
| 110 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
| 111 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
| 112 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
| 113 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
| 114 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
| 115 vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */ | |
| 116 vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */ | |
| 117 vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */ | |
| 118 vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */ | |
| 119 | |
| 120 1: vadd.i32 q3, q11, q9 /* butterfly: even + odd terms give the first four outputs */ | |
| 121 vadd.i32 q4, q12, q10 | |
| 122 vshrn.i32 d2, q3, #ROW_SHIFT /* d2 = (q11 + q9) >> ROW_SHIFT, narrowed to 16 bits */ | |
| 123 vshrn.i32 d4, q4, #ROW_SHIFT /* d4 = (q12 + q10) >> ROW_SHIFT */ | |
| 124 vadd.i32 q7, q13, q5 | |
| 125 vadd.i32 q8, q14, q6 | |
| 126 vtrn.16 d2, d4 /* vtrn.16 + vtrn.32 on d2/d4/d6/d8 transpose a 4x4 block of 16-bit values */ | |
| 127 vshrn.i32 d6, q7, #ROW_SHIFT /* d6 = (q13 + q5) >> ROW_SHIFT */ | |
| 128 vshrn.i32 d8, q8, #ROW_SHIFT /* d8 = (q14 + q6) >> ROW_SHIFT */ | |
| 129 vsub.i32 q14, q14, q6 /* even - odd terms give the last four outputs */ | |
| 130 vsub.i32 q11, q11, q9 | |
| 131 vtrn.16 d6, d8 | |
| 132 vsub.i32 q13, q13, q5 | |
| 133 vshrn.i32 d3, q14, #ROW_SHIFT /* d3 = (q14 - q6) >> ROW_SHIFT */ | |
| 134 vtrn.32 d2, d6 | |
| 135 vsub.i32 q12, q12, q10 | |
| 136 vtrn.32 d4, d8 | |
| 137 vshrn.i32 d5, q13, #ROW_SHIFT /* d5 = (q13 - q5) >> ROW_SHIFT */ | |
| 138 vshrn.i32 d7, q12, #ROW_SHIFT /* d7 = (q12 - q10) >> ROW_SHIFT */ | |
| 139 vshrn.i32 d9, q11, #ROW_SHIFT /* d9 = (q11 - q9) >> ROW_SHIFT */ | |
| 140 | |
| 141 vtrn.16 d3, d5 /* same 4x4 transpose for the second group d3/d5/d7/d9 */ | |
| 142 vtrn.16 d7, d9 | |
| 143 vtrn.32 d3, d7 | |
| 144 vtrn.32 d5, d9 | |
| 145 | |
| 146 vst1.64 {d2-d5}, [r2,:128]! /* write the results back over the 64 input bytes */ | |
| 147 vst1.64 {d6-d9}, [r2,:128]! /* r2 now points 64 bytes past where it started */ | |
| 148 | |
| 149 bx lr | |
| 11443 | 150 endfunc |
| 8335 | 151 |
| 152 function idct_col4_neon /* pass over 8 rows x 4 values read from r2 with a 16-byte stride; expects d0/d1 = coefficient lanes; returns results (before the final COL_SHIFT-16 shift) in d2-d9 and r2 advanced by 128; clobbers r4-r7, ip, flags, q5-q15 */ | |
| 153 mov ip, #16 /* post-increment step: one 16-byte row */ | |
| 154 vld1.64 {d2}, [r2,:64], ip /* d2 = col[0] */ | |
| 155 vdup.16 d30, w4c /* d30 = W4c in every lane */ | |
| 156 vld1.64 {d4}, [r2,:64], ip /* d4 = col[1] */ | |
| 157 vadd.i16 d30, d30, d2 /* d30 = col[0] + W4c (16-bit add) */ | |
| 158 vld1.64 {d6}, [r2,:64], ip /* d6 = col[2] */ | |
| 159 vmull.s16 q15, d30, w4 /* q15 = W4 * (col[0] + W4c), i.e. about W4*col[0] + (1<<(COL_SHIFT-1)) */ | |
| 160 vld1.64 {d8}, [r2,:64], ip /* d8 = col[3] */ | |
| 161 | |
| 162 ldrd r4, [r2] /* r4/r5 = the 8 bytes of col[4], fetched only for the zero test */ | |
| 163 ldrd r6, [r2, #16] /* r6/r7 = the 8 bytes of col[5] */ | |
| 164 orrs r4, r4, r5 /* Z set when col[4] is all zero */ | |
| 165 | |
| 166 idct_col4_top | |
| 167 addeq r2, r2, #16 /* flags still come from the orrs above: the macro contains only NEON instructions; step over the zero row */ | |
| 168 beq 1f | |
| 169 | |
| 170 vld1.64 {d3}, [r2,:64], ip /* d3 = col[4] */ | |
| 171 vmull.s16 q7, d3, w4 /* q7 = W4 * col[4] */ | |
| 172 vadd.i32 q11, q11, q7 /* q11 += W4 * col[4] */ | |
| 173 vsub.i32 q12, q12, q7 /* q12 -= W4 * col[4] */ | |
| 174 vsub.i32 q13, q13, q7 /* q13 -= W4 * col[4] */ | |
| 175 vadd.i32 q14, q14, q7 /* q14 += W4 * col[4] */ | |
| 176 | |
| 177 1: orrs r6, r6, r7 /* Z set when col[5] is all zero */ | |
| 178 ldrd r4, [r2, #16] /* r2 points at col[5] here, so r4/r5 = col[6]; ldrd leaves the flags alone */ | |
| 179 addeq r2, r2, #16 /* step over a zero col[5] */ | |
| 180 beq 2f | |
| 181 | |
| 182 vld1.64 {d5}, [r2,:64], ip /* d5 = col[5] */ | |
| 183 vmlal.s16 q9, d5, w5 /* q9 += W5 * col[5] */ | |
| 184 vmlsl.s16 q10, d5, w1 /* q10 -= W1 * col[5] */ | |
| 185 vmlal.s16 q5, d5, w7 /* q5 += W7 * col[5] */ | |
| 186 vmlal.s16 q6, d5, w3 /* q6 += W3 * col[5] */ | |
| 187 | |
| 188 2: orrs r4, r4, r5 /* Z set when col[6] is all zero */ | |
| 189 ldrd r4, [r2, #16] /* r2 points at col[6] here, so r4/r5 = col[7] */ | |
| 190 addeq r2, r2, #16 /* step over a zero col[6] */ | |
| 191 beq 3f | |
| 192 | |
| 193 vld1.64 {d7}, [r2,:64], ip /* d7 = col[6] */ | |
| 194 vmull.s16 q7, d7, w6 /* q7 = W6 * col[6] */ | |
| 195 vmull.s16 q8, d7, w2 /* q8 = W2 * col[6] */ | |
| 196 vadd.i32 q11, q11, q7 /* q11 += W6 * col[6] */ | |
| 197 vsub.i32 q14, q14, q7 /* q14 -= W6 * col[6] */ | |
| 198 vsub.i32 q12, q12, q8 /* q12 -= W2 * col[6] */ | |
| 199 vadd.i32 q13, q13, q8 /* q13 += W2 * col[6] */ | |
| 200 | |
| 201 3: orrs r4, r4, r5 /* Z set when col[7] is all zero */ | |
| 202 addeq r2, r2, #16 /* step over a zero col[7]; either way r2 ends 128 bytes past its entry value */ | |
| 203 beq 4f | |
| 204 | |
| 205 vld1.64 {d9}, [r2,:64], ip /* d9 = col[7] */ | |
| 206 vmlal.s16 q9, d9, w7 /* q9 += W7 * col[7] */ | |
| 207 vmlsl.s16 q10, d9, w5 /* q10 -= W5 * col[7] */ | |
| 208 vmlal.s16 q5, d9, w3 /* q5 += W3 * col[7] */ | |
| 209 vmlsl.s16 q6, d9, w1 /* q6 -= W1 * col[7] */ | |
| 210 | |
| 211 4: vaddhn.i32 d2, q11, q9 /* d2 = high 16 bits of (q11 + q9): output row 0 */ | |
| 212 vaddhn.i32 d3, q12, q10 /* d3 = output row 1 */ | |
| 213 vaddhn.i32 d4, q13, q5 /* d4 = output row 2 */ | |
| 214 vaddhn.i32 d5, q14, q6 /* d5 = output row 3 */ | |
| 215 vsubhn.i32 d9, q11, q9 /* d9 = high 16 bits of (q11 - q9): output row 7 */ | |
| 216 vsubhn.i32 d8, q12, q10 /* d8 = output row 6 */ | |
| 217 vsubhn.i32 d7, q13, q5 /* d7 = output row 5 */ | |
| 218 vsubhn.i32 d6, q14, q6 /* d6 = output row 4 */ | |
| 219 | |
| 220 bx lr | |
| 11443 | 221 endfunc |
| 8335 | 222 |
| 223 .align 6 | |
| 224 | |
| 225 function idct_col4_st8_neon /* finishes the column shift on q1-q4, saturates to unsigned 8 bits and stores 4 bytes on each of 8 lines at r0 with stride r1; r0 ends advanced by 8*r1 */ | |
| 226 vqshrun.s16 d2, q1, #COL_SHIFT-16 /* d2 = lines 0-1; reads d2/d3 before d2 is overwritten */ | |
| 227 vqshrun.s16 d3, q2, #COL_SHIFT-16 /* d3 = lines 2-3 (q2 = d4/d5 is read before d4 is reused below) */ | |
| 228 vqshrun.s16 d4, q3, #COL_SHIFT-16 /* d4 = lines 4-5 */ | |
| 229 vqshrun.s16 d5, q4, #COL_SHIFT-16 /* d5 = lines 6-7 */ | |
| 230 vst1.32 {d2[0]}, [r0,:32], r1 /* line 0 */ | |
| 231 vst1.32 {d2[1]}, [r0,:32], r1 /* line 1 */ | |
| 232 vst1.32 {d3[0]}, [r0,:32], r1 /* line 2 */ | |
| 233 vst1.32 {d3[1]}, [r0,:32], r1 /* line 3 */ | |
| 234 vst1.32 {d4[0]}, [r0,:32], r1 /* line 4 */ | |
| 235 vst1.32 {d4[1]}, [r0,:32], r1 /* line 5 */ | |
| 236 vst1.32 {d5[0]}, [r0,:32], r1 /* line 6 */ | |
| 237 vst1.32 {d5[1]}, [r0,:32], r1 /* line 7 */ | |
| 238 | |
| 239 bx lr | |
| 11443 | 240 endfunc |
| 8335 | 241 |
| 242 .section .rodata /* NOTE(review): nothing visible in this file switches back to .text before the functions below; presumably the function macro from asm.S does - confirm */ | |
| 243 .align 4 /* 16-byte alignment on ARM gas, matching the :128 hint on the vld1.64 in idct_start */ | |
| 8506 | 244 idct_coeff_neon: /* eight 16-bit coefficients, loaded into d0/d1 by idct_start */ |
| 245 .short W1, W2, W3, W4, W5, W6, W7, W4c /* order must match the w1..w7, w4c lane aliases */ | |
| 8335 | 246 |
| 247 .macro idct_start data /* common prologue: \data names the register holding the coefficient block pointer */ | |
| 248 push {r4-r7, lr} /* r4-r7 are used by the helper routines, lr is overwritten by the bl calls */ | |
| 249 pld [\data] /* prefetch the coefficient block at \data ... */ | |
| 250 pld [\data, #64] /* ... and at \data + 64 */ | |
| 251 vpush {d8-d15} /* the helpers write q4-q7, i.e. d8-d15 */ |

8507
779a9c93bf61
ARM: work around linker bug with movw/movt relocations in shared libs
mru
parents:
8506
diff
changeset
|
252 movrel r3, idct_coeff_neon /* r3 = address of the coefficient table (movrel is defined outside this file) */ |
| 8335 | 253 vld1.64 {d0,d1}, [r3,:128] /* d0/d1 = W1..W7, W4c, kept live for all helper calls */ |
| 254 .endm | |
| 255 | |
| 256 .macro idct_end /* common epilogue matching idct_start */ | |
| 257 vpop {d8-d15} /* restore the NEON registers saved by idct_start */ | |
| 258 pop {r4-r7, pc} /* restore r4-r7 and return by popping the saved lr into pc */ | |
| 259 .endm | |
| 260 | |
| 261 /* void ff_simple_idct_put_neon(uint8_t *dst, int line_size, DCTELEM *data); r0 = dst, r1 = line_size, r2 = data; transforms the 128-byte block at data and stores the saturated 8-bit result to dst */ | |
| 262 function ff_simple_idct_put_neon, export=1 | |
| 263 idct_start r2 | |
| 264 | |
| 9724 | 265 bl idct_row4_pld_neon /* prefetch 8 dst lines; presumably falls through into idct_row4_neon for the first 64 bytes - confirm */ |
| 8335 | 266 bl idct_row4_neon /* row pass over the next 64 bytes */ |
| 267 add r2, r2, #-128 /* rewind r2 to the start of the block */ | |
| 268 bl idct_col4_neon /* column pass, first 4 values of every row; r2 ends at start + 128 */ | |
| 269 bl idct_col4_st8_neon /* store 4 pixels on each of 8 lines; r0 advances by 8*r1 */ | |
| 270 sub r0, r0, r1, lsl #3 /* r0 back to the first line */ | |
| 271 add r0, r0, #4 /* move 4 pixels to the right */ | |
| 272 add r2, r2, #-120 /* r2 = block start + 8 bytes: the remaining 4 values of every row */ | |
| 273 bl idct_col4_neon | |
| 274 bl idct_col4_st8_neon | |
| 275 | |
| 276 idct_end | |
| 11443 | 277 endfunc |
| 8335 | 278 |
| 279 .align 6 | |
| 280 | |
| 281 function idct_col4_add8_neon /* adds the column results in q1-q4 to 4 pixels on each of 8 lines at r0 (stride r1), saturates to unsigned 8 bits and writes them back; r0 ends advanced by 8*r1; clobbers ip, q5, q6 */ | |
| 282 mov ip, r0 /* ip = write pointer; r0 is used as the read pointer */ | |
| 283 | |
| 284 vld1.32 {d10[0]}, [r0,:32], r1 /* d10 = pixels of lines 0-1 */ | |
| 285 vshr.s16 q1, q1, #COL_SHIFT-16 /* finish the column shift, lines 0-1 */ | |
| 286 vld1.32 {d10[1]}, [r0,:32], r1 | |
| 287 vshr.s16 q2, q2, #COL_SHIFT-16 /* lines 2-3 */ | |
| 288 vld1.32 {d11[0]}, [r0,:32], r1 /* d11 = pixels of lines 2-3 */ | |
| 289 vshr.s16 q3, q3, #COL_SHIFT-16 /* lines 4-5 */ | |
| 290 vld1.32 {d11[1]}, [r0,:32], r1 | |
| 291 vshr.s16 q4, q4, #COL_SHIFT-16 /* lines 6-7 */ | |
| 292 vld1.32 {d12[0]}, [r0,:32], r1 /* d12 = pixels of lines 4-5 */ | |
| 293 vaddw.u8 q1, q1, d10 /* widen the pixels to 16 bits and add */ | |
| 294 vld1.32 {d12[1]}, [r0,:32], r1 | |
| 295 vaddw.u8 q2, q2, d11 | |
| 296 vld1.32 {d13[0]}, [r0,:32], r1 /* d13 = pixels of lines 6-7 */ | |
| 297 vqmovun.s16 d2, q1 /* saturate to unsigned 8 bits: d2 = lines 0-1 */ | |
| 298 vld1.32 {d13[1]}, [r0,:32], r1 | |
| 299 vaddw.u8 q3, q3, d12 | |
| 300 vst1.32 {d2[0]}, [ip,:32], r1 /* line 0 */ | |
| 301 vqmovun.s16 d3, q2 /* d3 = lines 2-3 */ | |
| 302 vst1.32 {d2[1]}, [ip,:32], r1 /* line 1 */ | |
| 303 vaddw.u8 q4, q4, d13 | |
| 304 vst1.32 {d3[0]}, [ip,:32], r1 /* line 2 */ | |
| 305 vqmovun.s16 d4, q3 /* d4 = lines 4-5 */ | |
| 306 vst1.32 {d3[1]}, [ip,:32], r1 /* line 3 */ | |
| 307 vqmovun.s16 d5, q4 /* d5 = lines 6-7 */ | |
| 308 vst1.32 {d4[0]}, [ip,:32], r1 /* line 4 */ | |
| 309 vst1.32 {d4[1]}, [ip,:32], r1 /* line 5 */ | |
| 310 vst1.32 {d5[0]}, [ip,:32], r1 /* line 6 */ | |
| 311 vst1.32 {d5[1]}, [ip,:32], r1 /* line 7 */ | |
| 312 | |
| 313 bx lr | |
| 11443 | 314 endfunc |
| 8335 | 315 |
| 316 /* void ff_simple_idct_add_neon(uint8_t *dst, int line_size, DCTELEM *data); r0 = dst, r1 = line_size, r2 = data; transforms the 128-byte block at data and adds the result to the pixels at dst with unsigned 8-bit saturation */ | |
| 317 function ff_simple_idct_add_neon, export=1 | |
| 318 idct_start r2 | |
| 319 | |
| 9724 | 320 bl idct_row4_pld_neon /* prefetch 8 dst lines; presumably falls through into idct_row4_neon for the first 64 bytes - confirm */ |
| 8335 | 321 bl idct_row4_neon /* row pass over the next 64 bytes */ |
| 322 add r2, r2, #-128 /* rewind r2 to the start of the block */ | |
| 323 bl idct_col4_neon /* column pass, first 4 values of every row; r2 ends at start + 128 */ | |
| 324 bl idct_col4_add8_neon /* add to 4 pixels on each of 8 lines; r0 advances by 8*r1 */ | |
| 325 sub r0, r0, r1, lsl #3 /* r0 back to the first line */ | |
| 326 add r0, r0, #4 /* move 4 pixels to the right */ | |
| 327 add r2, r2, #-120 /* r2 = block start + 8 bytes: the remaining 4 values of every row */ | |
| 328 bl idct_col4_neon | |
| 329 bl idct_col4_add8_neon | |
| 330 | |
| 331 idct_end | |
| 11443 | 332 endfunc |
| 8335 | 333 |
| 334 .align 6 | |
| 335 | |
| 336 function idct_col4_st16_neon /* finishes the column shift on q1-q4 and stores d2-d9 as 16-bit values, 8 bytes per row with a 16-byte stride, at r2; r2 ends advanced by 128; clobbers ip */ | |
| 337 mov ip, #16 /* post-increment step: one 16-byte row */ | |
| 338 | |
| 339 vshr.s16 q1, q1, #COL_SHIFT-16 /* rows 0-1 */ | |
| 340 vshr.s16 q2, q2, #COL_SHIFT-16 /* rows 2-3 */ | |
| 341 vst1.64 {d2}, [r2,:64], ip /* row 0 */ | |
| 342 vshr.s16 q3, q3, #COL_SHIFT-16 /* rows 4-5 */ | |
| 343 vst1.64 {d3}, [r2,:64], ip /* row 1 */ | |
| 344 vshr.s16 q4, q4, #COL_SHIFT-16 /* rows 6-7 */ | |
| 345 vst1.64 {d4}, [r2,:64], ip /* row 2 */ | |
| 346 vst1.64 {d5}, [r2,:64], ip /* row 3 */ | |
| 347 vst1.64 {d6}, [r2,:64], ip /* row 4 */ | |
| 348 vst1.64 {d7}, [r2,:64], ip /* row 5 */ | |
| 349 vst1.64 {d8}, [r2,:64], ip /* row 6 */ | |
| 350 vst1.64 {d9}, [r2,:64], ip /* row 7 */ | |
| 351 | |
| 352 bx lr | |
| 11443 | 353 endfunc |
| 8335 | 354 |
| 355 /* void ff_simple_idct_neon(DCTELEM *data); r0 = data; transforms the 128-byte block at data in place */ | |
| 356 function ff_simple_idct_neon, export=1 | |
| 357 idct_start r0 | |
| 358 | |
| 359 mov r2, r0 /* the helper routines take the block pointer in r2 */ | |
| 360 bl idct_row4_neon /* row pass, first 64 bytes */ | |
| 361 bl idct_row4_neon /* row pass, second 64 bytes; r2 ends at start + 128 */ | |
| 362 add r2, r2, #-128 /* rewind r2 to the start of the block */ | |
| 363 bl idct_col4_neon /* column pass, first 4 values of every row; r2 ends at start + 128 */ | |
| 364 add r2, r2, #-128 /* rewind so the results overwrite the values just read */ | |
| 365 bl idct_col4_st16_neon /* r2 ends at start + 128 again */ | |
| 366 add r2, r2, #-120 /* r2 = block start + 8 bytes: the remaining 4 values of every row */ | |
| 367 bl idct_col4_neon | |
| 368 add r2, r2, #-128 /* back to block start + 8 for the store */ | |
| 369 bl idct_col4_st16_neon | |
| 370 | |
| 371 idct_end | |
| 11443 | 372 endfunc |
