arm/dsputil_neon.S @ 12530:63edd10ad4bc (libavcodec tip)

Try to fix crashes introduced by r25218: r25218 made assumptions about the
existence of past reference frames that weren't necessarily true.

author:   darkshikari
date:     Tue, 28 Sep 2010 09:06:22 +0000
parents:  659f16d04776
/*
 * ARM NEON optimised DSP functions
 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "asm.S"

        preserve8
        .text

function ff_clear_block_neon, export=1
        vmov.i16        q0,  #0
        .rept           8
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

function ff_clear_blocks_neon, export=1
        vmov.i16        q0,  #0
        .rept           8*6
        vst1.16         {q0}, [r0,:128]!
        .endr
        bx              lr
endfunc

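/* For reference, a minimal C sketch (an illustrative assumption, not
 * FFmpeg's actual C version) of what ff_clear_block_neon computes: a DCT
 * block is 64 int16_t coefficients, i.e. 128 bytes, cleared above with
 * eight 16-byte vst1.16 stores; ff_clear_blocks_neon does six such blocks.
 *
 *     #include <stdint.h>
 *     #include <string.h>
 *
 *     static void clear_block_c(int16_t *block)
 *     {
 *         memset(block, 0, 64 * sizeof(*block));   // 8 stores x 16 bytes
 *     }
 */
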
.macro  pixels16 avg=0
.if \avg
        mov             ip,  r0
.endif
1:      vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
        vld1.64         {d4, d5},  [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d6, d7},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d16,d17}, [ip,:128], r2
        vrhadd.u8       q0,  q0,  q8
        vld1.64         {d18,d19}, [ip,:128], r2
        vrhadd.u8       q1,  q1,  q9
        vld1.64         {d20,d21}, [ip,:128], r2
        vrhadd.u8       q2,  q2,  q10
        vld1.64         {d22,d23}, [ip,:128], r2
        vrhadd.u8       q3,  q3,  q11
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d2, d3},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

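/* A hedged C sketch of the pixels16 semantics (illustrative only): copy a
 * 16 x h block, or with avg=1 combine it with the destination using a
 * rounding byte average; vrhadd.u8 computes (a + b + 1) >> 1 per lane.
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     static void pixels16_c(uint8_t *dst, const uint8_t *src,
 *                            ptrdiff_t stride, int h, int avg)
 *     {
 *         for (; h > 0; h--, dst += stride, src += stride)
 *             for (int x = 0; x < 16; x++)
 *                 dst[x] = avg ? (dst[x] + src[x] + 1) >> 1 : src[x];
 *     }
 */
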
.macro  pixels16_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vext.8          q1,  q0,  q1,  #1
        \vhadd          q0,  q0,  q1
        vext.8          q3,  q2,  q3,  #1
        \vhadd          q2,  q2,  q3
        vst1.64         {d0, d1},  [r0,:128], r2
        vst1.64         {d4, d5},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

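/* Horizontal half-pel case, sketched under the same assumptions: each
 * output pixel is the average of a source pixel and its right neighbour.
 * \vhadd is vrhadd.u8 ((a+b+1)>>1) by default and vhadd.u8 ((a+b)>>1) in
 * the no_rnd variant.
 *
 *     static void pixels16_x2_c(uint8_t *dst, const uint8_t *src,
 *                               ptrdiff_t stride, int h, int no_rnd)
 *     {
 *         for (; h > 0; h--, dst += stride, src += stride)
 *             for (int x = 0; x < 16; x++)
 *                 dst[x] = (src[x] + src[x + 1] + !no_rnd) >> 1;
 *     }
 */
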
.macro  pixels16_y2 vhadd=vrhadd.u8
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          q2,  q0,  q1
        vld1.64         {d0, d1},  [r1], r2
        \vhadd          q3,  q0,  q1
        vld1.64         {d2, d3},  [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4, d5},  [r0,:128], r2
        vst1.64         {d6, d7},  [r0,:128], r2
        bne             1b
        bx              lr
.endm

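/* Vertical analogue (same hedging as the x2 sketch): average each pixel
 * with the one directly below it, i.e.
 *
 *     dst[x] = (src[x] + src[x + stride] + !no_rnd) >> 1;
 */
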
.macro  pixels16_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0-d2},   [r1], r2
        vld1.64         {d4-d6},   [r1], r2
.if \no_rnd
        vmov.i16        q13, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          q1,  q0,  q1,  #1
        vext.8          q3,  q2,  q3,  #1
        vaddl.u8        q8,  d0,  d2
        vaddl.u8        q10, d1,  d3
        vaddl.u8        q9,  d4,  d6
        vaddl.u8        q11, d5,  d7
1:      subs            r3,  r3,  #2
        vld1.64         {d0-d2},   [r1], r2
        vadd.u16        q12, q8,  q9
        pld             [r1]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q15, q0,  q1,  #1
        vadd.u16        q1,  q10, q11
        \vshrn          d28, q12, #2
.if \no_rnd
        vadd.u16        q1,  q1,  q13
.endif
        \vshrn          d29, q1,  #2
        vaddl.u8        q8,  d0,  d30
        vld1.64         {d2-d4},   [r1], r2
        vaddl.u8        q10, d1,  d31
        vst1.64         {d28,d29}, [r0,:128], r2
        vadd.u16        q12, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q12, q12, q13
.endif
        vext.8          q2,  q1,  q2,  #1
        vadd.u16        q0,  q10, q11
        \vshrn          d30, q12, #2
.if \no_rnd
        vadd.u16        q0,  q0,  q13
.endif
        \vshrn          d31, q0,  #2
        vaddl.u8        q9,  d2,  d4
        vaddl.u8        q11, d3,  d5
        vst1.64         {d30,d31}, [r0,:128], r2
        bgt             1b
        bx              lr
.endm

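/* Diagonal half-pel case. A C sketch under the same assumptions: each
 * output is the average of a 2x2 source neighbourhood. The rounding
 * variant narrows with vrshrn.u16 #2, i.e. (sum + 2) >> 2; the no_rnd
 * variant adds the q13 bias of 1 and narrows with a truncating vshrn.u16,
 * i.e. (sum + 1) >> 2.
 *
 *     static void pixels16_xy2_c(uint8_t *dst, const uint8_t *src,
 *                                ptrdiff_t stride, int h, int no_rnd)
 *     {
 *         int bias = no_rnd ? 1 : 2;
 *         for (; h > 0; h--, dst += stride, src += stride)
 *             for (int x = 0; x < 16; x++)
 *                 dst[x] = (src[x] + src[x + 1] + src[x + stride] +
 *                           src[x + stride + 1] + bias) >> 2;
 *     }
 */
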
.macro  pixels8 avg=0
1:      vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
        vld1.64         {d2}, [r1], r2
        pld             [r1, r2, lsl #2]
        vld1.64         {d3}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        pld             [r1, r2, lsl #1]
.if \avg
        vld1.64         {d4}, [r0,:64], r2
        vrhadd.u8       d0,  d0,  d4
        vld1.64         {d5}, [r0,:64], r2
        vrhadd.u8       d1,  d1,  d5
        vld1.64         {d6}, [r0,:64], r2
        vrhadd.u8       d2,  d2,  d6
        vld1.64         {d7}, [r0,:64], r2
        vrhadd.u8       d3,  d3,  d7
        sub             r0,  r0,  r2,  lsl #2
.endif
        subs            r3,  r3,  #4
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        vst1.64         {d2}, [r0,:64], r2
        vst1.64         {d3}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_x2 vhadd=vrhadd.u8
1:      vld1.64         {d0, d1},  [r1], r2
        vext.8          d1,  d0,  d1,  #1
        vld1.64         {d2, d3},  [r1], r2
        vext.8          d3,  d2,  d3,  #1
        pld             [r1]
        pld             [r1, r2]
        subs            r3,  r3,  #2
        vswp            d1,  d2
        \vhadd          q0,  q0,  q1
        vst1.64         {d0}, [r0,:64], r2
        vst1.64         {d1}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_y2 vhadd=vrhadd.u8
        vld1.64         {d0}, [r1], r2
        vld1.64         {d1}, [r1], r2
1:      subs            r3,  r3,  #2
        \vhadd          d4,  d0,  d1
        vld1.64         {d0}, [r1], r2
        \vhadd          d5,  d0,  d1
        vld1.64         {d1}, [r1], r2
        pld             [r1]
        pld             [r1, r2]
        vst1.64         {d4}, [r0,:64], r2
        vst1.64         {d5}, [r0,:64], r2
        bne             1b
        bx              lr
.endm

.macro  pixels8_xy2 vshrn=vrshrn.u16 no_rnd=0
        vld1.64         {d0, d1},  [r1], r2
        vld1.64         {d2, d3},  [r1], r2
.if \no_rnd
        vmov.i16        q11, #1
.endif
        pld             [r1]
        pld             [r1, r2]
        vext.8          d4,  d0,  d1,  #1
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q8,  d0,  d4
        vaddl.u8        q9,  d2,  d6
1:      subs            r3,  r3,  #2
        vld1.64         {d0, d1},  [r1], r2
        pld             [r1]
        vadd.u16        q10, q8,  q9
        vext.8          d4,  d0,  d1,  #1
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vaddl.u8        q8,  d0,  d4
        \vshrn          d5,  q10, #2
        vld1.64         {d2, d3},  [r1], r2
        vadd.u16        q10, q8,  q9
        pld             [r1, r2]
.if \no_rnd
        vadd.u16        q10, q10, q11
.endif
        vst1.64         {d5}, [r0,:64], r2
        \vshrn          d7,  q10, #2
        vext.8          d6,  d2,  d3,  #1
        vaddl.u8        q9,  d2,  d6
        vst1.64         {d7}, [r0,:64], r2
        bgt             1b
        bx              lr
.endm

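/* The pixels8_* macros above mirror the 16-wide C sketches with a block
 * width of 8; only the load widths and register allocation differ. */
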
.macro  pixfunc pfx name suf rnd_op args:vararg
function ff_\pfx\name\suf\()_neon, export=1
        \name           \rnd_op \args
endfunc
.endm

.macro  pixfunc2 pfx name args:vararg
        pixfunc         \pfx \name
        pixfunc         \pfx \name \args
.endm

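/* pixfunc emits one exported entry point per variant; pixfunc2 emits both
 * the rounding and the no-rounding variants. For example,
 * "pixfunc2 put_ pixels16_x2, _no_rnd, vhadd.u8" below expands to
 * ff_put_pixels16_x2_neon (default vrhadd.u8 rounding) and
 * ff_put_pixels16_x2_no_rnd_neon (truncating vhadd.u8). */
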
function ff_put_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

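/* Note: this mc00 entry point (and the avg/8-pixel ones below) ends
 * without a return on purpose: it loads the block height into r3 and falls
 * straight through into the ff_*_pixels* function emitted by the pixfunc
 * invocation that follows, which expects its h argument in r3. */
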
        pixfunc         put_ pixels16
        pixfunc2        put_ pixels16_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels16_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel16_mc00_neon, export=1
        mov             r3, #16
endfunc

        pixfunc         avg_ pixels16,, 1

function ff_put_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         put_ pixels8
        pixfunc2        put_ pixels8_x2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_y2,  _no_rnd, vhadd.u8
        pixfunc2        put_ pixels8_xy2, _no_rnd, vshrn.u16, 1

function ff_avg_h264_qpel8_mc00_neon, export=1
        mov             r3, #8
endfunc

        pixfunc         avg_ pixels8,, 1

function ff_put_pixels_clamped_neon, export=1
        vld1.64         {d16-d19}, [r0,:128]!
        vqmovun.s16     d0, q8
        vld1.64         {d20-d23}, [r0,:128]!
        vqmovun.s16     d1, q9
        vld1.64         {d24-d27}, [r0,:128]!
        vqmovun.s16     d2, q10
        vld1.64         {d28-d31}, [r0,:128]!
        vqmovun.s16     d3, q11
        vst1.64         {d0}, [r1,:64], r2
        vqmovun.s16     d4, q12
        vst1.64         {d1}, [r1,:64], r2
        vqmovun.s16     d5, q13
        vst1.64         {d2}, [r1,:64], r2
        vqmovun.s16     d6, q14
        vst1.64         {d3}, [r1,:64], r2
        vqmovun.s16     d7, q15
        vst1.64         {d4}, [r1,:64], r2
        vst1.64         {d5}, [r1,:64], r2
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

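/* C sketch (the r0 = block, r1 = pixels, r2 = stride mapping is an
 * assumption read off the loads above): saturate 64 signed 16-bit IDCT
 * coefficients to unsigned 8-bit pixels; vqmovun.s16 is the saturating
 * narrow.
 *
 *     static uint8_t clip_uint8(int v)
 *     {
 *         return v < 0 ? 0 : v > 255 ? 255 : v;
 *     }
 *
 *     static void put_pixels_clamped_c(const int16_t *block, uint8_t *pixels,
 *                                      ptrdiff_t stride)
 *     {
 *         for (int i = 0; i < 8; i++, pixels += stride, block += 8)
 *             for (int x = 0; x < 8; x++)
 *                 pixels[x] = clip_uint8(block[x]);
 *     }
 */
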
function ff_put_signed_pixels_clamped_neon, export=1
        vmov.u8         d31, #128
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d0, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vqmovn.s16      d1, q9
        vld1.64         {d16-d17}, [r0,:128]!
        vqmovn.s16      d2, q8
        vld1.64         {d18-d19}, [r0,:128]!
        vadd.u8         d0, d0, d31
        vld1.64         {d20-d21}, [r0,:128]!
        vadd.u8         d1, d1, d31
        vld1.64         {d22-d23}, [r0,:128]!
        vadd.u8         d2, d2, d31
        vst1.64         {d0}, [r1,:64], r2
        vqmovn.s16      d3, q9
        vst1.64         {d1}, [r1,:64], r2
        vqmovn.s16      d4, q10
        vst1.64         {d2}, [r1,:64], r2
        vqmovn.s16      d5, q11
        vld1.64         {d24-d25}, [r0,:128]!
        vadd.u8         d3, d3, d31
        vld1.64         {d26-d27}, [r0,:128]!
        vadd.u8         d4, d4, d31
        vadd.u8         d5, d5, d31
        vst1.64         {d3}, [r1,:64], r2
        vqmovn.s16      d6, q12
        vst1.64         {d4}, [r1,:64], r2
        vqmovn.s16      d7, q13
        vst1.64         {d5}, [r1,:64], r2
        vadd.u8         d6, d6, d31
        vadd.u8         d7, d7, d31
        vst1.64         {d6}, [r1,:64], r2
        vst1.64         {d7}, [r1,:64], r2
        bx              lr
endfunc

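/* Signed variant, sketched under the same assumptions: saturate each
 * coefficient to [-128,127] (vqmovn.s16), then rebias by the d31 constant
 * of +128 into [0,255]:
 *
 *     int v = block[x] < -128 ? -128 : block[x] > 127 ? 127 : block[x];
 *     pixels[x] = (uint8_t)(v + 128);
 */
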
function ff_add_pixels_clamped_neon, export=1
        mov             r3, r1
        vld1.64         {d16},   [r1,:64], r2
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vaddw.u8        q1, q1, d17
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vqmovun.s16     d4, q2
        vst1.64         {d2},    [r3,:64], r2
        vld1.64         {d16},   [r1,:64], r2
        vqmovun.s16     d6, q3
        vld1.64         {d0-d1}, [r0,:128]!
        vaddw.u8        q0, q0, d16
        vst1.64         {d4},    [r3,:64], r2
        vld1.64         {d17},   [r1,:64], r2
        vld1.64         {d2-d3}, [r0,:128]!
        vaddw.u8        q1, q1, d17
        vst1.64         {d6},    [r3,:64], r2
        vqmovun.s16     d0, q0
        vld1.64         {d18},   [r1,:64], r2
        vld1.64         {d4-d5}, [r0,:128]!
        vaddw.u8        q2, q2, d18
        vst1.64         {d0},    [r3,:64], r2
        vqmovun.s16     d2, q1
        vld1.64         {d19},   [r1,:64], r2
        vqmovun.s16     d4, q2
        vld1.64         {d6-d7}, [r0,:128]!
        vaddw.u8        q3, q3, d19
        vst1.64         {d2},    [r3,:64], r2
        vqmovun.s16     d6, q3
        vst1.64         {d4},    [r3,:64], r2
        vst1.64         {d6},    [r3,:64], r2
        bx              lr
endfunc

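/* Add variant sketch: widen each destination byte and add the coefficient
 * (vaddw.u8), then saturate back to 8 bits (vqmovun.s16):
 *
 *     pixels[x] = clip_uint8(pixels[x] + block[x]);
 */
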
function ff_float_to_int16_neon, export=1
        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vshrn.s32       d4,  q8,  #16
        vld1.64         {d0-d1},   [r1,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vshrn.s32       d5,  q9,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vld1.64         {d16-d17}, [r1,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r1,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r1,:128]!
        vshrn.s32       d4,  q8,  #16
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r1,:128]!
        vshrn.s32       d5,  q9,  #16
        vcvt.s32.f32    q1,  q1,  #16
        vshrn.s32       d6,  q0,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        vshrn.s32       d7,  q1,  #16
        vst1.64         {d6-d7},   [r0,:128]!
        bx              lr
3:      vshrn.s32       d4,  q8,  #16
        vshrn.s32       d5,  q9,  #16
        vst1.64         {d4-d5},   [r0,:128]!
        bx              lr
endfunc

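/* Conversion sketch (illustrative, not the exact rounding contract):
 * vcvt.s32.f32 ..., #16 converts to Q16 fixed point with saturation, and
 * vshrn.s32 ..., #16 narrows the integer part back out, so each sample is
 * roughly:
 *
 *     #include <stdint.h>
 *
 *     static void float_to_int16_c(int16_t *dst, const float *src, int len)
 *     {
 *         for (int i = 0; i < len; i++) {
 *             // vcvt.s32.f32 #16 scales by 2^16 and saturates to int32
 *             // (saturation elided here for brevity); vshrn.s32 #16 then
 *             // keeps the integer part, clamping out-of-range floats.
 *             int32_t v = (int32_t)(src[i] * 65536.0f);
 *             dst[i] = (int16_t)(v >> 16);
 *         }
 *     }
 */
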
function ff_float_to_int16_interleave_neon, export=1
        cmp             r3,  #2
        ldrlt           r1,  [r1]
        blt             ff_float_to_int16_neon
        bne             4f

        ldr             r3,  [r1]
        ldr             r1,  [r1, #4]

        subs            r2,  r2,  #8
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q8,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q9,  q1,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q10, q8,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vld1.64         {d26-d27}, [r1,:128]!
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d21}, [r0,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vld1.64         {d16-d17}, [r3,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d25}, [r0,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r3,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r1,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r1,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d26-d27}, [r0,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vsri.32         q10, q8,  #16
        vld1.64         {d0-d1},   [r3,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r3,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vld1.64         {d24-d25}, [r1,:128]!
        vcvt.s32.f32    q12, q12, #16
        vsri.32         q11, q9,  #16
        vld1.64         {d26-d27}, [r1,:128]!
        vcvt.s32.f32    q13, q13, #16
        vst1.64         {d20-d21}, [r0,:128]!
        vsri.32         q12, q0,  #16
        vst1.64         {d22-d23}, [r0,:128]!
        vsri.32         q13, q1,  #16
        vst1.64         {d24-d27}, [r0,:128]!
        bx              lr
3:      vsri.32         q10, q8,  #16
        vsri.32         q11, q9,  #16
        vst1.64         {d20-d23}, [r0,:128]!
        bx              lr

4:      push            {r4-r8,lr}
        cmp             r3,  #4
        lsl             ip,  r3,  #1
        blt             4f

        @ 4 channels
5:      ldmia           r1!, {r4-r7}
        mov             lr,  r2
        mov             r8,  r0
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #8
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         q9,  q8,  #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         q11, q10, #16
        vld1.64         {d4-d5},   [r6,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vzip.32         d18, d22
        vld1.64         {d6-d7},   [r7,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vzip.32         d19, d23
        vst1.64         {d18}, [r8], ip
        vsri.32         q1,  q0,  #16
        vst1.64         {d22}, [r8], ip
        vsri.32         q3,  q2,  #16
        vst1.64         {d19}, [r8], ip
        vzip.32         d2,  d6
        vst1.64         {d23}, [r8], ip
        vzip.32         d3,  d7
        beq             7f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.64         {d2},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.64         {d6},  [r8], ip
        vld1.64         {d20-d21}, [r6,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.64         {d3},  [r8], ip
        vld1.64         {d22-d23}, [r7,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.64         {d7},  [r8], ip
        b               6b
7:      vst1.64         {d2},  [r8], ip
        vst1.64         {d6},  [r8], ip
        vst1.64         {d3},  [r8], ip
        vst1.64         {d7},  [r8], ip
        subs            r3,  r3,  #4
        popeq           {r4-r8,pc}
        cmp             r3,  #4
        add             r0,  r0,  #8
        bge             5b

        @ 2 channels
4:      cmp             r3,  #2
        blt             4f
        ldmia           r1!, {r4-r5}
        mov             lr,  r2
        mov             r8,  r0
        tst             lr,  #8
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        beq             6f
        subs            lr,  lr,  #8
        beq             7f
        vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
6:      subs            lr,  lr,  #16
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vsri.32         d18, d16, #16
        vld1.64         {d2-d3},   [r5,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        vsri.32         d19, d17, #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r5,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vsri.32         d2,  d0,  #16
        vst1.32         {d19[1]},  [r8], ip
        vsri.32         d3,  d1,  #16
        vst1.32         {d22[0]},  [r8], ip
        vsri.32         d6,  d4,  #16
        vst1.32         {d22[1]},  [r8], ip
        vsri.32         d7,  d5,  #16
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
        beq             6f
        vld1.64         {d16-d17}, [r4,:128]!
        vcvt.s32.f32    q8,  q8,  #16
        vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vld1.64         {d18-d19}, [r5,:128]!
        vcvt.s32.f32    q9,  q9,  #16
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vld1.64         {d20-d21}, [r4,:128]!
        vcvt.s32.f32    q10, q10, #16
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vld1.64         {d22-d23}, [r5,:128]!
        vcvt.s32.f32    q11, q11, #16
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        bgt             6b
6:      vst1.32         {d2[0]},   [r8], ip
        vst1.32         {d2[1]},   [r8], ip
        vst1.32         {d3[0]},   [r8], ip
        vst1.32         {d3[1]},   [r8], ip
        vst1.32         {d6[0]},   [r8], ip
        vst1.32         {d6[1]},   [r8], ip
        vst1.32         {d7[0]},   [r8], ip
        vst1.32         {d7[1]},   [r8], ip
        b               8f
7:      vsri.32         d18, d16, #16
        vsri.32         d19, d17, #16
        vst1.32         {d18[0]},  [r8], ip
        vsri.32         d22, d20, #16
        vst1.32         {d18[1]},  [r8], ip
        vsri.32         d23, d21, #16
        vst1.32         {d19[0]},  [r8], ip
        vst1.32         {d19[1]},  [r8], ip
        vst1.32         {d22[0]},  [r8], ip
        vst1.32         {d22[1]},  [r8], ip
        vst1.32         {d23[0]},  [r8], ip
        vst1.32         {d23[1]},  [r8], ip
8:      subs            r3,  r3,  #2
        add             r0,  r0,  #4
        popeq           {r4-r8,pc}

        @ 1 channel
4:      ldr             r4,  [r1], #4
        tst             r2,  #8
        mov             lr,  r2
        mov             r5,  r0
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        bne             8f
6:      subs            lr,  lr,  #16
        vld1.64         {d4-d5},   [r4,:128]!
        vcvt.s32.f32    q2,  q2,  #16
        vld1.64         {d6-d7},   [r4,:128]!
        vcvt.s32.f32    q3,  q3,  #16
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        beq             7f
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
7:      vst1.16         {d4[1]},   [r5,:16], ip
        vst1.16         {d4[3]},   [r5,:16], ip
        vst1.16         {d5[1]},   [r5,:16], ip
        vst1.16         {d5[3]},   [r5,:16], ip
        vst1.16         {d6[1]},   [r5,:16], ip
        vst1.16         {d6[3]},   [r5,:16], ip
        vst1.16         {d7[1]},   [r5,:16], ip
        vst1.16         {d7[3]},   [r5,:16], ip
        bgt             6b
        pop             {r4-r8,pc}
8:      subs            lr,  lr,  #8
        vst1.16         {d0[1]},   [r5,:16], ip
        vst1.16         {d0[3]},   [r5,:16], ip
        vst1.16         {d1[1]},   [r5,:16], ip
        vst1.16         {d1[3]},   [r5,:16], ip
        vst1.16         {d2[1]},   [r5,:16], ip
        vst1.16         {d2[3]},   [r5,:16], ip
        vst1.16         {d3[1]},   [r5,:16], ip
        vst1.16         {d3[3]},   [r5,:16], ip
        popeq           {r4-r8,pc}
        vld1.64         {d0-d1},   [r4,:128]!
        vcvt.s32.f32    q0,  q0,  #16
        vld1.64         {d2-d3},   [r4,:128]!
        vcvt.s32.f32    q1,  q1,  #16
        b               6b
endfunc

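/* Interleave sketch (the signature is an assumption: dst, an array of
 * per-channel source pointers, sample count, channel count). The special
 * cases above pack 1, 2 or 4 converted channels per store with
 * vsri.32/vzip.32; the generic effect is:
 *
 *     // float_to_int16_one() is a hypothetical helper performing the
 *     // per-sample Q16 conversion sketched after ff_float_to_int16_neon.
 *     static void float_to_int16_interleave_c(int16_t *dst,
 *                                             const float **src,
 *                                             long len, int channels)
 *     {
 *         for (long i = 0; i < len; i++)
 *             for (int c = 0; c < channels; c++)
 *                 *dst++ = float_to_int16_one(src[c][i]);
 *     }
 */
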
function ff_vector_fmul_neon, export=1
        mov             r3,  r0
        subs            r2,  r2,  #8
        vld1.64         {d0-d3},   [r0,:128]!
        vld1.64         {d4-d7},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vmul.f32        q9,  q1,  q3
        beq             3f
        bics            ip,  r2,  #15
        beq             2f
1:      subs            ip,  ip,  #16
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q10, q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q11, q1,  q3
        vst1.64         {d16-d19}, [r3,:128]!
        vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vmul.f32        q9,  q1,  q3
        vst1.64         {d20-d23}, [r3,:128]!
        bne             1b
        ands            r2,  r2,  #15
        beq             3f
2:      vld1.64         {d0-d1},   [r0,:128]!
        vld1.64         {d4-d5},   [r1,:128]!
        vst1.64         {d16-d17}, [r3,:128]!
        vmul.f32        q8,  q0,  q2
        vld1.64         {d2-d3},   [r0,:128]!
        vld1.64         {d6-d7},   [r1,:128]!
        vst1.64         {d18-d19}, [r3,:128]!
        vmul.f32        q9,  q1,  q3
3:      vst1.64         {d16-d19}, [r3,:128]!
        bx              lr
endfunc

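/* Element-wise product sketch; note the NEON version works in place over
 * r0 (mov r3, r0 keeps a separate store pointer), i.e. dst doubles as one
 * of the sources:
 *
 *     static void vector_fmul_c(float *dst, const float *src, int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] *= src[i];
 *     }
 */
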
function ff_vector_fmul_window_neon, export=1
VFP     vdup.32         q8,  d0[0]
NOVFP   vld1.32         {d16[],d17[]}, [sp,:32]
        push            {r4,r5,lr}
VFP     ldr             lr,  [sp, #12]
NOVFP   ldr             lr,  [sp, #16]
        sub             r2,  r2,  #8
        sub             r5,  lr,  #2
        add             r2,  r2,  r5,  lsl #2
        add             r4,  r3,  r5,  lsl #3
        add             ip,  r0,  r5,  lsl #3
        mov             r5,  #-16
        vld1.64         {d0,d1},   [r1,:128]!
        vld1.64         {d2,d3},   [r2,:128], r5
        vld1.64         {d4,d5},   [r3,:128]!
        vld1.64         {d6,d7},   [r4,:128], r5
1:      subs            lr,  lr,  #4
        vmov            q11, q8
        vmla.f32        d22, d0,  d4
        vmov            q10, q8
        vmla.f32        d23, d1,  d5
        vrev64.32       q3,  q3
        vmla.f32        d20, d0,  d7
        vrev64.32       q1,  q1
        vmla.f32        d21, d1,  d6
        beq             2f
        vmla.f32        d22, d3,  d7
        vld1.64         {d0,d1},   [r1,:128]!
        vmla.f32        d23, d2,  d6
        vld1.64         {d18,d19}, [r2,:128], r5
        vmls.f32        d20, d3,  d4
        vld1.64         {d24,d25}, [r3,:128]!
        vmls.f32        d21, d2,  d5
        vld1.64         {d6,d7},   [r4,:128], r5
        vmov            q1,  q9
        vrev64.32       q11, q11
        vmov            q2,  q12
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        b               1b
2:      vmla.f32        d22, d3,  d7
        vmla.f32        d23, d2,  d6
        vmls.f32        d20, d3,  d4
        vmls.f32        d21, d2,  d5
        vrev64.32       q11, q11
        vswp            d22, d23
        vst1.64         {d20,d21}, [r0,:128]!
        vst1.64         {d22,d23}, [ip,:128], r5
        pop             {r4,r5,pc}
endfunc

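/* Windowing sketch (hedged; the argument mapping is an inference from the
 * VFP/NOVFP handling above, where a bias value is broadcast into q8): the
 * classic MDCT overlap-add window, walking the window and second source
 * from both ends at once:
 *
 *     static void vector_fmul_window_c(float *dst, const float *src0,
 *                                      const float *src1, const float *win,
 *                                      float add_bias, int len)
 *     {
 *         dst  += len;
 *         win  += len;
 *         src0 += len;
 *         for (int i = -len, j = len - 1; i < 0; i++, j--) {
 *             float s0 = src0[i], s1 = src1[j];
 *             float wi = win[i],  wj = win[j];
 *             dst[i] = s0 * wj - s1 * wi + add_bias;
 *             dst[j] = s0 * wi + s1 * wj + add_bias;
 *         }
 *     }
 */
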
#if CONFIG_VORBIS_DECODER
function ff_vorbis_inverse_coupling_neon, export=1
        vmov.i32        q10, #1<<31
        subs            r2,  r2,  #4
        mov             r3,  r0
        mov             r12, r1
        beq             3f

        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
1:      vld1.32         {d2-d3},   [r1,:128]!
        vld1.32         {d0-d1},   [r0,:128]!
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vst1.32         {d24-d25}, [r3, :128]!
        vst1.32         {d22-d23}, [r12,:128]!
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        subs            r2,  r2,  #8
        ble             2f
        vld1.32         {d24-d25}, [r1,:128]!
        vld1.32         {d22-d23}, [r0,:128]!
        vcle.s32        q8,  q12, #0
        vand            q9,  q11, q10
        veor            q12, q12, q9
        vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        vand            q2,  q12, q8
        vbic            q3,  q12, q8
        vadd.f32        q12, q11, q2
        vsub.f32        q11, q11, q3
        b               1b

2:      vst1.32         {d2-d3},   [r3, :128]!
        vst1.32         {d0-d1},   [r12,:128]!
        bxlt            lr

3:      vld1.32         {d2-d3},   [r1,:128]
        vld1.32         {d0-d1},   [r0,:128]
        vcle.s32        q8,  q1,  #0
        vand            q9,  q0,  q10
        veor            q1,  q1,  q9
        vand            q2,  q1,  q8
        vbic            q3,  q1,  q8
        vadd.f32        q1,  q0,  q2
        vsub.f32        q0,  q0,  q3
        vst1.32         {d2-d3},   [r0,:128]!
        vst1.32         {d0-d1},   [r1,:128]!
        bx              lr
endfunc
#endif

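/* The branchless sign games above (vcle.s32/vand/vbic on the float bit
 * patterns, with q10 as the sign mask) implement Vorbis channel coupling.
 * The scalar logic is approximately the following sketch (mag/ang naming
 * assumed from the Vorbis coupling convention):
 *
 *     static void vorbis_inverse_coupling_c(float *mag, float *ang, int n)
 *     {
 *         for (int i = 0; i < n; i++) {
 *             if (mag[i] > 0.0f) {
 *                 if (ang[i] > 0.0f) { ang[i] = mag[i] - ang[i]; }
 *                 else               { float t = ang[i];
 *                                      ang[i] = mag[i]; mag[i] += t; }
 *             } else {
 *                 if (ang[i] > 0.0f) { ang[i] += mag[i]; }
 *                 else               { float t = ang[i];
 *                                      ang[i] = mag[i]; mag[i] -= t; }
 *             }
 *         }
 *     }
 */
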
function ff_vector_fmul_scalar_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        bics            r12, len, #15
        beq             3f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q1}, [r1,:128]!
1:      vmul.f32        q0,  q0,  q8
        vld1.32         {q2}, [r1,:128]!
        vmul.f32        q1,  q1,  q8
        vld1.32         {q3}, [r1,:128]!
        vmul.f32        q2,  q2,  q8
        vst1.32         {q0}, [r0,:128]!
        vmul.f32        q3,  q3,  q8
        vst1.32         {q1}, [r0,:128]!
        subs            r12, r12, #16
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vst1.32         {q2}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vst1.32         {q3}, [r0,:128]!
        b               1b
2:      vst1.32         {q2}, [r0,:128]!
        vst1.32         {q3}, [r0,:128]!
        ands            len, len, #15
        bxeq            lr
3:      vld1.32         {q0}, [r1,:128]!
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0}, [r0,:128]!
        subs            len, len, #4
        bgt             3b
        bx              lr
        .unreq          len
endfunc

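/* Scalar multiply sketch: dst[i] = src[i] * mul. With hard-float VFP
 * argument passing the scalar arrives in s0 (hence vdup.32 q8, d0[0]);
 * with a soft-float ABI it arrives in a core register (the NOVFP path),
 * which also shifts the position of len:
 *
 *     static void vector_fmul_scalar_c(float *dst, const float *src,
 *                                      float mul, int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] = src[i] * mul;
 *     }
 */
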
function ff_vector_fmul_sv_scalar_2_neon, export=1
VFP     vdup.32         d16, d0[0]
NOVFP   vdup.32         d16, r3
NOVFP   ldr             r3,  [sp]
        vld1.32         {d0}, [r1,:64]!
        vld1.32         {d1}, [r1,:64]!
1:      subs            r3,  r3,  #4
        vmul.f32        d4,  d0,  d16
        vmul.f32        d5,  d1,  d16
        ldr             r12, [r2], #4
        vld1.32         {d2}, [r12,:64]
        ldr             r12, [r2], #4
        vld1.32         {d3}, [r12,:64]
        vmul.f32        d4,  d4,  d2
        vmul.f32        d5,  d5,  d3
        beq             2f
        vld1.32         {d0}, [r1,:64]!
        vld1.32         {d1}, [r1,:64]!
        vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r0,:64]!
        b               1b
2:      vst1.32         {d4}, [r0,:64]!
        vst1.32         {d5}, [r0,:64]!
        bx              lr
endfunc

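/* "sv" here is a scattered vector: the operand walked through r2 is an
 * array of pointers to short vectors, 2 floats each in this variant and 4
 * in the _4 variant below. Sketch (the layout is an inference from the
 * pointer loads above):
 *
 *     // dst[2*i + k] = src[2*i + k] * sv[i][k] * mul
 *     static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
 *                                           const float **sv, float mul,
 *                                           int len)
 *     {
 *         for (int i = 0; i < len / 2; i++)
 *             for (int k = 0; k < 2; k++)
 *                 dst[2 * i + k] = src[2 * i + k] * sv[i][k] * mul;
 *     }
 */
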
function ff_vector_fmul_sv_scalar_4_neon, export=1
VFP     vdup.32         q10, d0[0]
NOVFP   vdup.32         q10, r3
NOVFP   ldr             r3,  [sp]
        push            {lr}
        bics            lr,  r3,  #7
        beq             3f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q2}, [r1,:128]!
1:      ldr             r12, [r2], #4
        vld1.32         {q1}, [r12,:128]
        ldr             r12, [r2], #4
        vld1.32         {q3}, [r12,:128]
        vmul.f32        q8,  q0,  q10
        vmul.f32        q8,  q8,  q1
        vmul.f32        q9,  q2,  q10
        vmul.f32        q9,  q9,  q3
        subs            lr,  lr,  #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q2}, [r1,:128]!
        vst1.32         {q8}, [r0,:128]!
        vst1.32         {q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8}, [r0,:128]!
        vst1.32         {q9}, [r0,:128]!
        ands            r3,  r3,  #7
        popeq           {pc}
3:      vld1.32         {q0}, [r1,:128]!
        ldr             r12, [r2], #4
        vld1.32         {q1}, [r12,:128]
        vmul.f32        q0,  q0,  q10
        vmul.f32        q0,  q0,  q1
        vst1.32         {q0}, [r0,:128]!
        subs            r3,  r3,  #4
        bgt             3b
        pop             {pc}
endfunc

function ff_sv_fmul_scalar_2_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
        ldr             r12, [r1], #4
        vld1.32         {d0}, [r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1}, [r12,:64]
1:      vmul.f32        q1,  q0,  q8
        subs            len, len, #4
        beq             2f
        ldr             r12, [r1], #4
        vld1.32         {d0}, [r12,:64]
        ldr             r12, [r1], #4
        vld1.32         {d1}, [r12,:64]
        vst1.32         {q1}, [r0,:128]!
        b               1b
2:      vst1.32         {q1}, [r0,:128]!
        bx              lr
        .unreq          len
endfunc

function ff_sv_fmul_scalar_4_neon, export=1
VFP     len .req r2
NOVFP   len .req r3
VFP     vdup.32         q8,  d0[0]
NOVFP   vdup.32         q8,  r2
1:      ldr             r12, [r1], #4
        vld1.32         {q0}, [r12,:128]
        vmul.f32        q0,  q0,  q8
        vst1.32         {q0}, [r0,:128]!
        subs            len, len, #4
        bgt             1b
        bx              lr
        .unreq          len
endfunc

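/* The ff_sv_fmul_scalar_* functions drop the contiguous src operand and
 * scale only the scattered vectors, i.e. (sketch, with w = 2 or 4):
 *
 *     dst[w*i + k] = sv[i][k] * mul;
 */
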
function ff_butterflies_float_neon, export=1
1:      vld1.32         {q0}, [r0,:128]
        vld1.32         {q1}, [r1,:128]
        vsub.f32        q2,  q0,  q1
        vadd.f32        q1,  q0,  q1
        vst1.32         {q2}, [r1,:128]!
        vst1.32         {q1}, [r0,:128]!
        subs            r2,  r2,  #4
        bgt             1b
        bx              lr
endfunc

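/* Butterfly sketch: the in-place add/sub pair used by FFT/MDCT code:
 *
 *     static void butterflies_float_c(float *v1, float *v2, int len)
 *     {
 *         for (int i = 0; i < len; i++) {
 *             float t = v1[i] - v2[i];   // vsub.f32
 *             v1[i]   = v1[i] + v2[i];   // vadd.f32
 *             v2[i]   = t;
 *         }
 *     }
 */
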
function ff_scalarproduct_float_neon, export=1
        vmov.f32        q2,  #0.0
1:      vld1.32         {q0}, [r0,:128]!
        vld1.32         {q1}, [r1,:128]!
        vmla.f32        q2,  q0,  q1
        subs            r2,  r2,  #4
        bgt             1b
        vadd.f32        d0,  d4,  d5
        vpadd.f32       d0,  d0,  d0
NOVFP   vmov.32         r0,  d0[0]
        bx              lr
endfunc

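/* Dot-product sketch; the NEON loop keeps four partial sums in q2 and
 * reduces them with vadd.f32/vpadd.f32 afterwards, with the NOVFP vmov
 * moving the result to r0 for a soft-float return:
 *
 *     static float scalarproduct_float_c(const float *v1, const float *v2,
 *                                        int len)
 *     {
 *         float p = 0.0f;
 *         for (int i = 0; i < len; i++)
 *             p += v1[i] * v2[i];
 *         return p;
 *     }
 */
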
function ff_int32_to_float_fmul_scalar_neon, export=1
VFP     vdup.32         q0,  d0[0]
VFP     len .req r2
NOVFP   vdup.32         q0,  r2
NOVFP   len .req r3

        vld1.32         {q1}, [r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2}, [r1,:128]!
        vcvt.f32.s32    q8,  q2
1:      subs            len, len, #8
        pld             [r1, #16]
        vmul.f32        q9,  q3,  q0
        vmul.f32        q10, q8,  q0
        beq             2f
        vld1.32         {q1}, [r1,:128]!
        vcvt.f32.s32    q3,  q1
        vld1.32         {q2}, [r1,:128]!
        vcvt.f32.s32    q8,  q2
        vst1.32         {q9},  [r0,:128]!
        vst1.32         {q10}, [r0,:128]!
        b               1b
2:      vst1.32         {q9},  [r0,:128]!
        vst1.32         {q10}, [r0,:128]!
        bx              lr
        .unreq          len
endfunc

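/* Sketch: convert int32 samples to float (vcvt.f32.s32) and scale by a
 * constant:
 *
 *     static void int32_to_float_fmul_scalar_c(float *dst,
 *                                              const int32_t *src,
 *                                              float mul, int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] = src[i] * mul;
 *     }
 */
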
function ff_vector_fmul_reverse_neon, export=1
        add             r2,  r2,  r3,  lsl #2
        sub             r2,  r2,  #32
        mov             r12, #-32
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
1:      pld             [r1, #32]
        vrev64.32       q3,  q3
        vmul.f32        d16, d0,  d7
        vmul.f32        d17, d1,  d6
        pld             [r2, #-32]
        vrev64.32       q2,  q2
        vmul.f32        d18, d2,  d5
        vmul.f32        d19, d3,  d4
        subs            r3,  r3,  #8
        beq             2f
        vld1.32         {q0-q1}, [r1,:128]!
        vld1.32         {q2-q3}, [r2,:128], r12
        vst1.32         {q8-q9}, [r0,:128]!
        b               1b
2:      vst1.32         {q8-q9}, [r0,:128]!
        bx              lr
endfunc

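/* Sketch: multiply src0 by src1 traversed backwards (vrev64.32 plus the
 * negative stride on r2 do the reversal eight floats at a time):
 *
 *     static void vector_fmul_reverse_c(float *dst, const float *src0,
 *                                       const float *src1, int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[len - 1 - i];
 *     }
 */
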
function ff_vector_fmul_add_neon, export=1
        ldr             r12, [sp]
        vld1.32         {q0-q1},   [r1,:128]!
        vld1.32         {q8-q9},   [r2,:128]!
        vld1.32         {q2-q3},   [r3,:128]!
        vmul.f32        q10, q0,  q8
        vmul.f32        q11, q1,  q9
1:      vadd.f32        q12, q2,  q10
        vadd.f32        q13, q3,  q11
        pld             [r1, #16]
        pld             [r2, #16]
        pld             [r3, #16]
        subs            r12, r12, #8
        beq             2f
        vld1.32         {q0}, [r1,:128]!
        vld1.32         {q8}, [r2,:128]!
        vmul.f32        q10, q0,  q8
        vld1.32         {q1}, [r1,:128]!
        vld1.32         {q9}, [r2,:128]!
        vmul.f32        q11, q1,  q9
        vld1.32         {q2-q3},   [r3,:128]!
        vst1.32         {q12-q13}, [r0,:128]!
        b               1b
2:      vst1.32         {q12-q13}, [r0,:128]!
        bx              lr
endfunc

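/* Sketch: element-wise multiply-add over three inputs (len is the fifth
 * argument, hence the ldr r12, [sp]):
 *
 *     static void vector_fmul_add_c(float *dst, const float *src0,
 *                                   const float *src1, const float *src2,
 *                                   int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] = src0[i] * src1[i] + src2[i];
 *     }
 */
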
function ff_vector_clipf_neon, export=1
VFP     vdup.32         q1,  d0[1]
VFP     vdup.32         q0,  d0[0]
NOVFP   vdup.32         q0,  r2
NOVFP   vdup.32         q1,  r3
NOVFP   ldr             r2,  [sp]
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3,  q1
1:      vmax.f32        q8,  q10, q0
        vmax.f32        q9,  q11, q0
        subs            r2,  r2,  #8
        beq             2f
        vld1.f32        {q2}, [r1,:128]!
        vmin.f32        q10, q2,  q1
        vld1.f32        {q3}, [r1,:128]!
        vmin.f32        q11, q3,  q1
        vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        b               1b
2:      vst1.f32        {q8}, [r0,:128]!
        vst1.f32        {q9}, [r0,:128]!
        bx              lr
endfunc
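
/* Clamp sketch: vmin against the upper bound, then vmax against the lower
 * bound, four floats at a time:
 *
 *     static void vector_clipf_c(float *dst, const float *src,
 *                                float min, float max, int len)
 *     {
 *         for (int i = 0; i < len; i++)
 *             dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];
 *     }
 */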
