annotate ppc/vc1dsp_altivec.c @ 9976:e52cd349e708 libavcodec
Only compile in NEON optimizations for H.264 when the H.264 decoder is enabled.
| author | diego |
|---|---|
| date | Wed, 22 Jul 2009 22:33:33 +0000 |
| parents | 7cee7292d5cc |
| children | 50415a8f1451 |
| rev | line source |
|---|---|
| 3537 | 1 /* |
| 2 * VC-1 and WMV3 decoder - DSP functions AltiVec-optimized | |
| 3 * Copyright (c) 2006 Konstantin Shishkov | |
| 4 * | |
| 3947 | 5 * This file is part of FFmpeg. |
| 3947 | 6 * |
| 3947 | 7 * FFmpeg is free software; you can redistribute it and/or |
| 3537 | 8 * modify it under the terms of the GNU Lesser General Public |
| 9 * License as published by the Free Software Foundation; either | |
| 3947 | 10 * version 2.1 of the License, or (at your option) any later version. |
| 3537 | 11 * |
| 3947 | 12 * FFmpeg is distributed in the hope that it will be useful, |
| 3537 | 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 15 * Lesser General Public License for more details. | |
| 16 * | |
| 17 * You should have received a copy of the GNU Lesser General Public | |
| 3947 | 18 * License along with FFmpeg; if not, write to the Free Software |
| 3537 | 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
| 20 */ | |
| 21 | |
| 6763 | 22 #include "libavcodec/dsputil.h" |
| 3537 | 23 |
| 5750 | 24 #include "util_altivec.h" |
| 3537 | 25 |
| 26 // main steps of 8x8 transform | |
| 27 #define STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_rnd) \ | |
| 28 do { \ | |
| 29 t0 = vec_sl(vec_add(s0, s4), vec_2); \ | |
| 30 t0 = vec_add(vec_sl(t0, vec_1), t0); \ | |
| 31 t0 = vec_add(t0, vec_rnd); \ | |
| 32 t1 = vec_sl(vec_sub(s0, s4), vec_2); \ | |
| 33 t1 = vec_add(vec_sl(t1, vec_1), t1); \ | |
| 34 t1 = vec_add(t1, vec_rnd); \ | |
| 35 t2 = vec_add(vec_sl(s6, vec_2), vec_sl(s6, vec_1)); \ | |
| 36 t2 = vec_add(t2, vec_sl(s2, vec_4)); \ | |
| 37 t3 = vec_add(vec_sl(s2, vec_2), vec_sl(s2, vec_1)); \ | |
| 38 t3 = vec_sub(t3, vec_sl(s6, vec_4)); \ | |
| 39 t4 = vec_add(t0, t2); \ | |
| 40 t5 = vec_add(t1, t3); \ | |
| 41 t6 = vec_sub(t1, t3); \ | |
| 42 t7 = vec_sub(t0, t2); \ | |
| 43 \ | |
| 44 t0 = vec_sl(vec_add(s1, s3), vec_4); \ | |
| 45 t0 = vec_add(t0, vec_sl(s5, vec_3)); \ | |
| 46 t0 = vec_add(t0, vec_sl(s7, vec_2)); \ | |
| 47 t0 = vec_add(t0, vec_sub(s5, s3)); \ | |
| 48 \ | |
| 49 t1 = vec_sl(vec_sub(s1, s5), vec_4); \ | |
| 50 t1 = vec_sub(t1, vec_sl(s7, vec_3)); \ | |
| 51 t1 = vec_sub(t1, vec_sl(s3, vec_2)); \ | |
| 52 t1 = vec_sub(t1, vec_add(s1, s7)); \ | |
| 53 \ | |
| 54 t2 = vec_sl(vec_sub(s7, s3), vec_4); \ | |
| 55 t2 = vec_add(t2, vec_sl(s1, vec_3)); \ | |
| 56 t2 = vec_add(t2, vec_sl(s5, vec_2)); \ | |
| 57 t2 = vec_add(t2, vec_sub(s1, s7)); \ | |
| 58 \ | |
| 59 t3 = vec_sl(vec_sub(s5, s7), vec_4); \ | |
| 60 t3 = vec_sub(t3, vec_sl(s3, vec_3)); \ | |
| 61 t3 = vec_add(t3, vec_sl(s1, vec_2)); \ | |
| 62 t3 = vec_sub(t3, vec_add(s3, s5)); \ | |
| 63 \ | |
| 64 s0 = vec_add(t4, t0); \ | |
| 65 s1 = vec_add(t5, t1); \ | |
| 66 s2 = vec_add(t6, t2); \ | |
| 67 s3 = vec_add(t7, t3); \ | |
| 68 s4 = vec_sub(t7, t3); \ | |
| 69 s5 = vec_sub(t6, t2); \ | |
| 70 s6 = vec_sub(t5, t1); \ | |
| 71 s7 = vec_sub(t4, t0); \ | |
| 72 }while(0) | |
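The shift-add chains in STEP8 avoid vector multiplies: for example, `t0 = (s0+s4) << 2` followed by `t0 = (t0 << 1) + t0` computes `12*(s0+s4)`. Expanded into plain arithmetic, the macro is one stage of the VC-1 8-point inverse transform with the integer coefficient set {12, 16, 15, 9, 6, 4}. A scalar sketch for reference (the helper name is ours, not part of this file):

```c
/* One 8-point VC-1 inverse-transform stage, scalar form of STEP8.
 * rnd is the stage rounder (4 for the row pass, 64 for the column pass). */
static void step8_ref(int s[8], int rnd)
{
    int t0 = 12 * (s[0] + s[4]) + rnd;                        /* even part */
    int t1 = 12 * (s[0] - s[4]) + rnd;
    int t2 = 16 * s[2] +  6 * s[6];
    int t3 =  6 * s[2] - 16 * s[6];
    int t4 = t0 + t2, t5 = t1 + t3, t6 = t1 - t3, t7 = t0 - t2;
    int u0 = 16 * s[1] + 15 * s[3] +  9 * s[5] +  4 * s[7];   /* odd part */
    int u1 = 15 * s[1] -  4 * s[3] - 16 * s[5] -  9 * s[7];
    int u2 =  9 * s[1] - 16 * s[3] +  4 * s[5] + 15 * s[7];
    int u3 =  4 * s[1] -  9 * s[3] + 15 * s[5] - 16 * s[7];
    s[0] = t4 + u0; s[1] = t5 + u1; s[2] = t6 + u2; s[3] = t7 + u3;
    s[4] = t7 - u3; s[5] = t6 - u2; s[6] = t5 - u1; s[7] = t4 - u0;
}
```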
| 73 | |
| 74 #define SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7) \ | |
| 75 do { \ | |
| 76 s0 = vec_sra(s0, vec_3); \ | |
| 77 s1 = vec_sra(s1, vec_3); \ | |
| 78 s2 = vec_sra(s2, vec_3); \ | |
| 79 s3 = vec_sra(s3, vec_3); \ | |
| 80 s4 = vec_sra(s4, vec_3); \ | |
| 81 s5 = vec_sra(s5, vec_3); \ | |
| 82 s6 = vec_sra(s6, vec_3); \ | |
| 83 s7 = vec_sra(s7, vec_3); \ | |
| 84 }while(0) | |
| 85 | |
| 86 #define SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7) \ | |
| 87 do { \ | |
| 88 s0 = vec_sra(s0, vec_7); \ | |
| 89 s1 = vec_sra(s1, vec_7); \ | |
| 90 s2 = vec_sra(s2, vec_7); \ | |
| 91 s3 = vec_sra(s3, vec_7); \ | |
| 92 s4 = vec_sra(vec_add(s4, vec_1s), vec_7); \ | |
| 93 s5 = vec_sra(vec_add(s5, vec_1s), vec_7); \ | |
| 94 s6 = vec_sra(vec_add(s6, vec_1s), vec_7); \ | |
| 95 s7 = vec_sra(vec_add(s7, vec_1s), vec_7); \ | |
| 96 }while(0) | |
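Note the asymmetric rounding in SHIFT_VERT8: only s4..s7 receive an extra +1 before the arithmetic shift. In scalar terms (sketch only):

```c
/* Scalar form of SHIFT_VERT8: the last four outputs of the column
 * pass take an extra +1 before the >> 7. */
static void shift_vert8_ref(int s[8])
{
    for (int i = 0; i < 8; i++)
        s[i] = (s[i] + (i >= 4 ? 1 : 0)) >> 7;
}
```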
| 97 | |
| 98 /* main steps of 4x4 transform */ | |
| 99 #define STEP4(s0, s1, s2, s3, vec_rnd) \ | |
| 100 do { \ | |
| 101 t1 = vec_add(vec_sl(s0, vec_4), s0); \ | |
| 102 t1 = vec_add(t1, vec_rnd); \ | |
| 103 t2 = vec_add(vec_sl(s2, vec_4), s2); \ | |
| 104 t0 = vec_add(t1, t2); \ | |
| 105 t1 = vec_sub(t1, t2); \ | |
| 106 t3 = vec_sl(vec_sub(s3, s1), vec_1); \ | |
| 107 t3 = vec_add(t3, vec_sl(t3, vec_2)); \ | |
| 108 t2 = vec_add(t3, vec_sl(s1, vec_5)); \ | |
| 109 t3 = vec_add(t3, vec_sl(s3, vec_3)); \ | |
| 110 t3 = vec_add(t3, vec_sl(s3, vec_2)); \ | |
| 111 s0 = vec_add(t0, t2); \ | |
| 112 s1 = vec_sub(t1, t3); \ | |
| 113 s2 = vec_add(t1, t3); \ | |
| 114 s3 = vec_sub(t0, t2); \ | |
| 115 }while (0) | |
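STEP4 builds the VC-1 4-point inverse transform, coefficients {17, 22, 10}, from the same shift-add idiom (e.g. `t3 = 2*(s3-s1)` followed by `t3 += t3 << 2` yields `10*(s3-s1)`). A scalar sketch (helper name is ours):

```c
/* One 4-point VC-1 inverse-transform stage, scalar form of STEP4. */
static void step4_ref(int s[4], int rnd)
{
    int t0 = 17 * (s[0] + s[2]) + rnd;
    int t1 = 17 * (s[0] - s[2]) + rnd;
    int t2 = 22 * s[1] + 10 * s[3];
    int t3 = 22 * s[3] - 10 * s[1];
    s[0] = t0 + t2;
    s[1] = t1 - t3;
    s[2] = t1 + t3;
    s[3] = t0 - t2;
}
```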
| 116 | |
| 117 #define SHIFT_HOR4(s0, s1, s2, s3) \ | |
| 118 s0 = vec_sra(s0, vec_3); \ | |
| 119 s1 = vec_sra(s1, vec_3); \ | |
| 120 s2 = vec_sra(s2, vec_3); \ | |
| 121 s3 = vec_sra(s3, vec_3); | |
| 122 | |
| 123 #define SHIFT_VERT4(s0, s1, s2, s3) \ | |
| 124 s0 = vec_sra(s0, vec_7); \ | |
| 125 s1 = vec_sra(s1, vec_7); \ | |
| 126 s2 = vec_sra(s2, vec_7); \ | |
| 127 s3 = vec_sra(s3, vec_7); | |
| 128 | |
| 129 /** Do inverse transform on 8x8 block | |
| 130 */ | |
| 131 static void vc1_inv_trans_8x8_altivec(DCTELEM block[64]) | |
| 132 { | |
| 133 vector signed short src0, src1, src2, src3, src4, src5, src6, src7; | |
| 134 vector signed int s0, s1, s2, s3, s4, s5, s6, s7; | |
| 135 vector signed int s8, s9, sA, sB, sC, sD, sE, sF; | |
| 136 vector signed int t0, t1, t2, t3, t4, t5, t6, t7; | |
| 137 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); | |
| 138 const vector unsigned int vec_7 = vec_splat_u32(7); | |
| 139 const vector unsigned int vec_4 = vec_splat_u32(4); | |
| 140 const vector signed int vec_4s = vec_splat_s32(4); | |
| 141 const vector unsigned int vec_3 = vec_splat_u32(3); | |
| 142 const vector unsigned int vec_2 = vec_splat_u32(2); | |
| 143 const vector signed int vec_1s = vec_splat_s32(1); | |
| 144 const vector unsigned int vec_1 = vec_splat_u32(1); | |
| 145 | |
| 146 | |
| 147 src0 = vec_ld( 0, block); | |
| 148 src1 = vec_ld( 16, block); | |
| 149 src2 = vec_ld( 32, block); | |
| 150 src3 = vec_ld( 48, block); | |
| 151 src4 = vec_ld( 64, block); | |
| 152 src5 = vec_ld( 80, block); | |
| 153 src6 = vec_ld( 96, block); | |
| 154 src7 = vec_ld(112, block); | |
| 155 | |
| 156 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); | |
| 157 s0 = vec_unpackl(src0); | |
| 158 s1 = vec_unpackl(src1); | |
| 159 s2 = vec_unpackl(src2); | |
| 160 s3 = vec_unpackl(src3); | |
| 161 s4 = vec_unpackl(src4); | |
| 162 s5 = vec_unpackl(src5); | |
| 163 s6 = vec_unpackl(src6); | |
| 164 s7 = vec_unpackl(src7); | |
| 165 s8 = vec_unpackh(src0); | |
| 166 s9 = vec_unpackh(src1); | |
| 167 sA = vec_unpackh(src2); | |
| 168 sB = vec_unpackh(src3); | |
| 169 sC = vec_unpackh(src4); | |
| 170 sD = vec_unpackh(src5); | |
| 171 sE = vec_unpackh(src6); | |
| 172 sF = vec_unpackh(src7); | |
| 173 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); | |
| 174 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); | |
| 175 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); | |
| 176 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); | |
| 177 src0 = vec_pack(s8, s0); | |
| 178 src1 = vec_pack(s9, s1); | |
| 179 src2 = vec_pack(sA, s2); | |
| 180 src3 = vec_pack(sB, s3); | |
| 181 src4 = vec_pack(sC, s4); | |
| 182 src5 = vec_pack(sD, s5); | |
| 183 src6 = vec_pack(sE, s6); | |
| 184 src7 = vec_pack(sF, s7); | |
| 185 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); | |
| 186 | |
| 187 s0 = vec_unpackl(src0); | |
| 188 s1 = vec_unpackl(src1); | |
| 189 s2 = vec_unpackl(src2); | |
| 190 s3 = vec_unpackl(src3); | |
| 191 s4 = vec_unpackl(src4); | |
| 192 s5 = vec_unpackl(src5); | |
| 193 s6 = vec_unpackl(src6); | |
| 194 s7 = vec_unpackl(src7); | |
| 195 s8 = vec_unpackh(src0); | |
| 196 s9 = vec_unpackh(src1); | |
| 197 sA = vec_unpackh(src2); | |
| 198 sB = vec_unpackh(src3); | |
| 199 sC = vec_unpackh(src4); | |
| 200 sD = vec_unpackh(src5); | |
| 201 sE = vec_unpackh(src6); | |
| 202 sF = vec_unpackh(src7); | |
| 203 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_64); | |
| 204 SHIFT_VERT8(s0, s1, s2, s3, s4, s5, s6, s7); | |
| 205 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_64); | |
| 206 SHIFT_VERT8(s8, s9, sA, sB, sC, sD, sE, sF); | |
| 207 src0 = vec_pack(s8, s0); | |
| 208 src1 = vec_pack(s9, s1); | |
| 209 src2 = vec_pack(sA, s2); | |
| 210 src3 = vec_pack(sB, s3); | |
| 211 src4 = vec_pack(sC, s4); | |
| 212 src5 = vec_pack(sD, s5); | |
| 213 src6 = vec_pack(sE, s6); | |
| 214 src7 = vec_pack(sF, s7); | |
| 215 | |
| 216 vec_st(src0, 0, block); | |
| 217 vec_st(src1, 16, block); | |
| 218 vec_st(src2, 32, block); | |
| 219 vec_st(src3, 48, block); | |
| 220 vec_st(src4, 64, block); | |
| 221 vec_st(src5, 80, block); | |
| 222 vec_st(src6, 96, block); | |
| 223 vec_st(src7,112, block); | |
| 224 } | |
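End to end, the routine above is the usual separable transform: after the first TRANSPOSE8, each vector lane holds one original row, so the first STEP8/SHIFT_HOR8 pair is the row pass (rounder 4, >>3); the second transpose and STEP8/SHIFT_VERT8 pair form the column pass (rounder 64, >>7), all in place on the 16-bit block. A rough scalar equivalent, reusing the sketch helpers introduced earlier (names are ours, not FFmpeg's):

```c
#include <stdint.h>

/* Scalar shape of vc1_inv_trans_8x8_altivec (sketch): row pass,
 * then column pass with asymmetric rounding, in place. */
static void inv_trans_8x8_ref(int16_t block[64])
{
    int buf[8], i, j;
    for (i = 0; i < 8; i++) {                     /* rows: rnd 4, >>3 */
        for (j = 0; j < 8; j++) buf[j] = block[8 * i + j];
        step8_ref(buf, 4);
        for (j = 0; j < 8; j++) block[8 * i + j] = buf[j] >> 3;
    }
    for (i = 0; i < 8; i++) {                     /* columns: rnd 64, >>7 */
        for (j = 0; j < 8; j++) buf[j] = block[8 * j + i];
        step8_ref(buf, 64);
        for (j = 0; j < 8; j++)
            block[8 * j + i] = (buf[j] + (j >= 4 ? 1 : 0)) >> 7;
    }
}
```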
| 225 | |
| 226 /** Do inverse transform on 8x4 part of block | |
| 227 */ | |
| 5999 | 228 static void vc1_inv_trans_8x4_altivec(uint8_t *dest, int stride, DCTELEM *block) |
| 3537 | 229 { |
| 230 vector signed short src0, src1, src2, src3, src4, src5, src6, src7; | |
| 231 vector signed int s0, s1, s2, s3, s4, s5, s6, s7; | |
| 232 vector signed int s8, s9, sA, sB, sC, sD, sE, sF; | |
| 233 vector signed int t0, t1, t2, t3, t4, t5, t6, t7; | |
| 234 const vector signed int vec_64 = vec_sl(vec_splat_s32(4), vec_splat_u32(4)); | |
| 235 const vector unsigned int vec_7 = vec_splat_u32(7); | |
| 236 const vector unsigned int vec_5 = vec_splat_u32(5); | |
| 237 const vector unsigned int vec_4 = vec_splat_u32(4); | |
| 238 const vector signed int vec_4s = vec_splat_s32(4); | |
| 239 const vector unsigned int vec_3 = vec_splat_u32(3); | |
| 240 const vector unsigned int vec_2 = vec_splat_u32(2); | |
| 241 const vector unsigned int vec_1 = vec_splat_u32(1); | |
| 5999 | 242 vector unsigned char tmp; |
| 243 vector signed short tmp2, tmp3; | |
| 244 vector unsigned char perm0, perm1, p0, p1, p; | |
| 3537 | 245 |
| 246 src0 = vec_ld( 0, block); | |
| 247 src1 = vec_ld( 16, block); | |
| 248 src2 = vec_ld( 32, block); | |
| 249 src3 = vec_ld( 48, block); | |
| 250 src4 = vec_ld( 64, block); | |
| 251 src5 = vec_ld( 80, block); | |
| 252 src6 = vec_ld( 96, block); | |
| 253 src7 = vec_ld(112, block); | |
| 254 | |
| 255 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); | |
| 256 s0 = vec_unpackl(src0); | |
| 257 s1 = vec_unpackl(src1); | |
| 258 s2 = vec_unpackl(src2); | |
| 259 s3 = vec_unpackl(src3); | |
| 260 s4 = vec_unpackl(src4); | |
| 261 s5 = vec_unpackl(src5); | |
| 262 s6 = vec_unpackl(src6); | |
| 263 s7 = vec_unpackl(src7); | |
| 264 s8 = vec_unpackh(src0); | |
| 265 s9 = vec_unpackh(src1); | |
| 266 sA = vec_unpackh(src2); | |
| 267 sB = vec_unpackh(src3); | |
| 268 sC = vec_unpackh(src4); | |
| 269 sD = vec_unpackh(src5); | |
| 270 sE = vec_unpackh(src6); | |
| 271 sF = vec_unpackh(src7); | |
| 272 STEP8(s0, s1, s2, s3, s4, s5, s6, s7, vec_4s); | |
| 273 SHIFT_HOR8(s0, s1, s2, s3, s4, s5, s6, s7); | |
| 274 STEP8(s8, s9, sA, sB, sC, sD, sE, sF, vec_4s); | |
| 275 SHIFT_HOR8(s8, s9, sA, sB, sC, sD, sE, sF); | |
| 276 src0 = vec_pack(s8, s0); | |
| 277 src1 = vec_pack(s9, s1); | |
| 278 src2 = vec_pack(sA, s2); | |
| 279 src3 = vec_pack(sB, s3); | |
| 280 src4 = vec_pack(sC, s4); | |
| 281 src5 = vec_pack(sD, s5); | |
| 282 src6 = vec_pack(sE, s6); | |
| 283 src7 = vec_pack(sF, s7); | |
| 284 TRANSPOSE8(src0, src1, src2, src3, src4, src5, src6, src7); | |
| 285 | |
| 6000 | 286 s0 = vec_unpackh(src0); |
| 287 s1 = vec_unpackh(src1); | |
| 288 s2 = vec_unpackh(src2); | |
| 289 s3 = vec_unpackh(src3); | |
| 290 s8 = vec_unpackl(src0); | |
| 291 s9 = vec_unpackl(src1); | |
| 292 sA = vec_unpackl(src2); | |
| 293 sB = vec_unpackl(src3); | |
| 294 STEP4(s0, s1, s2, s3, vec_64); | |
| 295 SHIFT_VERT4(s0, s1, s2, s3); | |
| 296 STEP4(s8, s9, sA, sB, vec_64); | |
| 297 SHIFT_VERT4(s8, s9, sA, sB); | |
| 298 src0 = vec_pack(s0, s8); | |
| 299 src1 = vec_pack(s1, s9); | |
| 300 src2 = vec_pack(s2, sA); | |
| 301 src3 = vec_pack(s3, sB); | |
| 3537 | 302 |
| 5999 | 303 p0 = vec_lvsl (0, dest); |
| 304 p1 = vec_lvsl (stride, dest); | |
| 305 p = vec_splat_u8 (-1); | |
| 306 perm0 = vec_mergeh (p, p0); | |
| 307 perm1 = vec_mergeh (p, p1); | |
| 3537 | 308 |
| 5999 | 309 #define ADD(dest,src,perm) \ |
| 310 /* *(uint64_t *)&tmp = *(uint64_t *)dest; */ \ | |
| 311 tmp = vec_ld (0, dest); \ | |
| 6028 | 312 tmp2 = (vector signed short)vec_perm (tmp, vec_splat_u8(0), perm); \ |
| 5999 | 313 tmp3 = vec_adds (tmp2, src); \ |
| 314 tmp = vec_packsu (tmp3, tmp3); \ | |
| 315 vec_ste ((vector unsigned int)tmp, 0, (unsigned int *)dest); \ | |
| 316 vec_ste ((vector unsigned int)tmp, 4, (unsigned int *)dest); | |
| 317 | |
| 318 ADD (dest, src0, perm0) dest += stride; | |
| 319 ADD (dest, src1, perm1) dest += stride; | |
| 320 ADD (dest, src2, perm0) dest += stride; | |
| 321 ADD (dest, src3, perm1) | |
| 3537 | 322 } |
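The ADD macro above is an unaligned add-and-saturate store: `vec_lvsl` plus the permute vector merged with 0xFF bytes zero-extends eight destination pixels to shorts (the 0xFF indices select zero bytes from `vec_splat_u8(0)`), `vec_adds` adds the residual with saturation, `vec_packsu` clamps back to 0..255, and two `vec_ste` word stores write the eight result bytes. Its per-pixel effect, in scalar form (sketch):

```c
#include <stdint.h>

/* Scalar effect of one ADD(dest, srcN, permN) step (sketch): add eight
 * 16-bit residuals to eight pixels, clamping the result to 0..255. */
static void add_pixels_clamped8_ref(uint8_t *dest, const int16_t *src)
{
    for (int i = 0; i < 8; i++) {
        int v = dest[i] + src[i];
        dest[i] = v < 0 ? 0 : v > 255 ? 255 : v;
    }
}
```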
| 323 | |
| 324 | |
| 325 void vc1dsp_init_altivec(DSPContext* dsp, AVCodecContext *avctx) { | |
| 326 dsp->vc1_inv_trans_8x8 = vc1_inv_trans_8x8_altivec; | |
| 5999 | 327 dsp->vc1_inv_trans_8x4 = vc1_inv_trans_8x4_altivec; |
| 3537 | 328 } |
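vc1dsp_init_altivec only swaps function pointers into the DSPContext; a caller is expected to invoke it behind a runtime AltiVec check. A minimal sketch of such a caller, assuming a hypothetical detection helper (neither the helper nor this dispatcher name is FFmpeg's API):

```c
#include "libavcodec/dsputil.h"

extern int cpu_has_altivec(void);   /* assumed runtime-detection helper */

/* Hypothetical dispatcher: install the AltiVec VC-1 routines only when
 * the CPU actually supports AltiVec. */
void vc1dsp_init_ppc_sketch(DSPContext *dsp, AVCodecContext *avctx)
{
    if (cpu_has_altivec())
        vc1dsp_init_altivec(dsp, avctx);
}
```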
