comparison liba52/liba52_changes.diff @ 14990:9de84a73f6d0
MPlayer-specific changes to liba52
| author | diego |
|---|---|
| date | Tue, 22 Mar 2005 23:25:06 +0000 |
| parents | |
| children | 4bad7f00556e |
| parent | this revision |
|---|---|
| 14989:d55bd88c2b42 | 14990:9de84a73f6d0 |
| 1 --- include/a52.h 2005-03-22 19:58:53.000000000 +0100 | |
| 2 +++ a52.h 2004-03-19 01:15:49.000000000 +0100 | |
| 3 @@ -19,6 +25,9 @@ | |
| 4 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 5 */ | |
| 6 | |
| 7 +#ifndef A52_H | |
| 8 +#define A52_H | |
| 9 + | |
| 10 #ifndef LIBA52_DOUBLE | |
| 11 typedef float sample_t; | |
| 12 #else | |
| 13 @@ -113,3 +122,10 @@ | |
| 14 void a52_dynrng (a52_state_t * state, | |
| 15 sample_t (* call) (sample_t, void *), void * data); | |
| 16 int a52_block (a52_state_t * state, sample_t * samples); | |
| 17 + | |
| 18 +void* a52_resample_init(uint32_t mm_accel,int flags,int chans); | |
| 19 +extern int (* a52_resample) (float * _f, int16_t * s16); | |
| 20 + | |
| 21 +uint16_t crc16_block(uint8_t *data,uint32_t num_bytes); | |
| 22 + | |
| 23 +#endif /* A52_H */ | |
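
The a52.h hunk above is the public face of the MPlayer additions: a CPU-accelerated resampler selected at runtime through a function pointer, plus a CRC helper. A minimal usage sketch of that API, assuming only the declarations shown in the hunk (the acceleration flag, channel arguments, and output buffer size below are illustrative assumptions, not taken from the patch):

```c
/* Hedged sketch: driving the resample API declared in the hunk above.
 * a52_resample_init() picks an implementation for the given acceleration
 * flags and channel setup; calls then go through the a52_resample
 * function pointer. Flag/channel values and buffer size are assumptions. */
#include <inttypes.h>
#include "a52.h"        /* the patched header from this diff */
#include "mm_accel.h"   /* MM_ACCEL_* flags used elsewhere in the patch */

static int16_t out[256 * 6];    /* assumed worst-case output block */

int resample_one_block(sample_t *samples, uint32_t accel,
                       int flags, int chans)
{
    static void *initialized = NULL;
    if (!initialized)
        initialized = a52_resample_init(accel, flags, chans);
    return a52_resample((float *) samples, out);  /* float -> s16 */
}
```
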
| 24 --- liba52/a52_internal.h 2005-03-22 19:59:35.000000000 +0100 | |
| 25 +++ a52_internal.h 2004-03-19 01:15:49.000000000 +0100 | |
| 26 @@ -41,11 +43,12 @@ | |
| 27 | |
| 28 int downmix_init (int input, int flags, sample_t * level, | |
| 29 sample_t clev, sample_t slev); | |
| 30 +void downmix_accel_init(uint32_t mm_accel); | |
| 31 int downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level, | |
| 32 sample_t clev, sample_t slev); | |
| 33 -void downmix (sample_t * samples, int acmod, int output, sample_t bias, | |
| 34 +extern void (*downmix) (sample_t * samples, int acmod, int output, sample_t bias, | |
| 35 sample_t clev, sample_t slev); | |
| 36 -void upmix (sample_t * samples, int acmod, int output); | |
| 37 +extern void (*upmix) (sample_t * samples, int acmod, int output); | |
| 38 | |
| 39 void imdct_init (uint32_t mm_accel); | |
| 40 extern void (* imdct_256) (sample_t * data, sample_t * delay, sample_t bias); | |
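
The a52_internal.h hunk converts `downmix` and `upmix` from plain functions into function pointers and adds `downmix_accel_init()` to retarget them once at startup. A self-contained sketch of that dispatch pattern under generic names (nothing below is liba52 API):

```c
/* Self-contained sketch of the runtime CPU-dispatch pattern introduced
 * above: a function pointer defaults to the portable C routine and is
 * redirected once, at init time, when an accelerated version applies.
 * MY_ACCEL_SSE and the mix_* names are illustrative, not from liba52. */
#include <stdint.h>
#include <stdio.h>

#define MY_ACCEL_SSE 0x01          /* stand-in for MM_ACCEL_X86_SSE */

static void mix_c(float *s)   { (void) s; puts("C downmix"); }
static void mix_sse(float *s) { (void) s; puts("SSE downmix"); }

void (*mix)(float *samples) = NULL;     /* like downmix/upmix above */

void mix_accel_init(uint32_t accel)     /* like downmix_accel_init() */
{
    mix = mix_c;                        /* safe default */
    if (accel & MY_ACCEL_SSE)
        mix = mix_sse;                  /* faster path when available */
}

int main(void)
{
    float block[256] = {0};
    mix_accel_init(MY_ACCEL_SSE);
    mix(block);               /* call sites never change, only the target */
    return 0;
}
```
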
| 41 --- liba52/bitstream.c 2005-03-22 19:59:35.000000000 +0100 | |
| 42 +++ bitstream.c 2004-03-19 01:15:49.000000000 +0100 | |
| 43 @@ -29,7 +35,12 @@ | |
| 44 | |
| 45 #define BUFFER_SIZE 4096 | |
| 46 | |
| 47 +#ifdef ALT_BITSTREAM_READER | |
| 48 +int indx=0; | |
| 49 +uint32_t * buffer_start; | |
| 50 +#else | |
| 51 static uint32_t * buffer_start; | |
| 52 +#endif | |
| 53 | |
| 54 uint32_t bits_left; | |
| 55 uint32_t current_word; | |
| 56 @@ -41,6 +52,9 @@ | |
| 57 align = (int)buf & 3; | |
| 58 buffer_start = (uint32_t *) (buf - align); | |
| 59 bits_left = 0; | |
| 60 +#ifdef ALT_BITSTREAM_READER | |
| 61 + indx=0; | |
| 62 +#endif | |
| 63 bitstream_get (align * 8); | |
| 64 } | |
| 65 | |
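
`bitstream_set_ptr()` above rounds the buffer pointer down to a 32-bit boundary and then discards `align * 8` bits, so every subsequent load is word-aligned. A small sketch of the same arithmetic (using `uintptr_t` rather than the patch's `(int)` cast, for portability):

```c
/* Sketch of the align-down trick in bitstream_set_ptr(): a buffer at an
 * address ending in 0x...6 gives align = 2, so reading starts two bytes
 * earlier and the first 16 bits are skipped. uintptr_t replaces the
 * patch's (int) cast; the idea is otherwise the same. */
#include <stdint.h>
#include <stdio.h>

void show_alignment(const uint8_t *buf)
{
    unsigned align = (uintptr_t) buf & 3;    /* 0..3 stray bytes */
    const uint8_t *start = buf - align;      /* 4-byte-aligned base */
    printf("load from %p, skip %u bits\n", (void *) start, align * 8);
}
```
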
| 66 --- liba52/bitstream.h 2005-03-22 19:59:35.000000000 +0100 | |
| 67 +++ bitstream.h 2004-03-19 01:15:49.000000000 +0100 | |
| 68 @@ -19,6 +25,48 @@ | |
| 69 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 70 */ | |
| 71 | |
| 72 +/* code from ffmpeg/libavcodec */ | |
| 73 +#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0) | |
| 74 +# define always_inline __attribute__((always_inline)) inline | |
| 75 +#else | |
| 76 +# define always_inline inline | |
| 77 +#endif | |
| 78 + | |
| 79 +#if defined(__sparc__) || defined(hpux) | |
| 80 +/* | |
| 81 + * the alt bitstream reader performs unaligned memory accesses; that doesn't work | |
| 82 + * on sparc/hpux. For now, disable ALT_BITSTREAM_READER. | |
| 83 + */ | |
| 84 +#undef ALT_BITSTREAM_READER | |
| 85 +#else | |
| 86 +// alternative (faster) bitstream reader (reads up to 3 bytes past the end of the input) | |
| 87 +#define ALT_BITSTREAM_READER | |
| 88 + | |
| 89 +/* used to avoid misaligned exceptions on some archs (alpha, ...) */ | |
| 90 +#if defined (ARCH_X86) || defined(ARCH_ARMV4L) | |
| 91 +# define unaligned32(a) (*(uint32_t*)(a)) | |
| 92 +#else | |
| 93 +# ifdef __GNUC__ | |
| 94 +static always_inline uint32_t unaligned32(const void *v) { | |
| 95 + struct Unaligned { | |
| 96 + uint32_t i; | |
| 97 + } __attribute__((packed)); | |
| 98 + | |
| 99 + return ((const struct Unaligned *) v)->i; | |
| 100 +} | |
| 101 +# elif defined(__DECC) | |
| 102 +static inline uint32_t unaligned32(const void *v) { | |
| 103 + return *(const __unaligned uint32_t *) v; | |
| 104 +} | |
| 105 +# else | |
| 106 +static inline uint32_t unaligned32(const void *v) { | |
| 107 + return *(const uint32_t *) v; | |
| 108 +} | |
| 109 +# endif | |
| 110 +#endif //!ARCH_X86 | |
| 111 + | |
| 112 +#endif | |
| 113 + | |
| 114 /* (stolen from the kernel) */ | |
| 115 #ifdef WORDS_BIGENDIAN | |
| 116 | |
| 117 @@ -29,7 +77,7 @@ | |
| 118 # if defined (__i386__) | |
| 119 | |
| 120 # define swab32(x) __i386_swab32(x) | |
| 121 - static inline const uint32_t __i386_swab32(uint32_t x) | |
| 122 + static always_inline const uint32_t __i386_swab32(uint32_t x) | |
| 123 { | |
| 124 __asm__("bswap %0" : "=r" (x) : "0" (x)); | |
| 125 return x; | |
| 126 @@ -37,25 +85,42 @@ | |
| 127 | |
| 128 # else | |
| 129 | |
| 130 -# define swab32(x)\ | |
| 131 -((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | \ | |
| 132 - (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])) | |
| 133 - | |
| 134 +# define swab32(x) __generic_swab32(x) | |
| 135 + static always_inline const uint32_t __generic_swab32(uint32_t x) | |
| 136 + { | |
| 137 + return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | | |
| 138 + (((uint8_t*)&x)[2] << 8) | (((uint8_t*)&x)[3])); | |
| 139 + } | |
| 140 # endif | |
| 141 #endif | |
| 142 | |
| 143 +#ifdef ALT_BITSTREAM_READER | |
| 144 +extern uint32_t *buffer_start; | |
| 145 +extern int indx; | |
| 146 +#else | |
| 147 extern uint32_t bits_left; | |
| 148 extern uint32_t current_word; | |
| 149 +#endif | |
| 150 | |
| 151 void bitstream_set_ptr (uint8_t * buf); | |
| 152 uint32_t bitstream_get_bh(uint32_t num_bits); | |
| 153 int32_t bitstream_get_bh_2(uint32_t num_bits); | |
| 154 | |
| 155 + | |
| 156 static inline uint32_t | |
| 157 -bitstream_get(uint32_t num_bits) | |
| 158 +bitstream_get(uint32_t num_bits) // note: num_bits is practically a constant due to inlining | |
| 159 { | |
| 160 +#ifdef ALT_BITSTREAM_READER | |
| 161 + uint32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) ); | |
| 162 + | |
| 163 + result<<= (indx&0x07); | |
| 164 + result>>= 32 - num_bits; | |
| 165 + indx+= num_bits; | |
| 166 + | |
| 167 + return result; | |
| 168 +#else | |
| 169 uint32_t result; | |
| 170 - | |
| 171 + | |
| 172 if(num_bits < bits_left) { | |
| 173 result = (current_word << (32 - bits_left)) >> (32 - num_bits); | |
| 174 bits_left -= num_bits; | |
| 175 @@ -63,11 +128,30 @@ | |
| 176 } | |
| 177 | |
| 178 return bitstream_get_bh(num_bits); | |
| 179 +#endif | |
| 180 +} | |
| 181 + | |
| 182 +static inline void bitstream_skip(int num_bits) | |
| 183 +{ | |
| 184 +#ifdef ALT_BITSTREAM_READER | |
| 185 + indx+= num_bits; | |
| 186 +#else | |
| 187 + bitstream_get(num_bits); | |
| 188 +#endif | |
| 189 } | |
| 190 | |
| 191 static inline int32_t | |
| 192 bitstream_get_2(uint32_t num_bits) | |
| 193 { | |
| 194 +#ifdef ALT_BITSTREAM_READER | |
| 195 + int32_t result= swab32( unaligned32(((uint8_t *)buffer_start)+(indx>>3)) ); | |
| 196 + | |
| 197 + result<<= (indx&0x07); | |
| 198 + result>>= 32 - num_bits; | |
| 199 + indx+= num_bits; | |
| 200 + | |
| 201 + return result; | |
| 202 +#else | |
| 203 int32_t result; | |
| 204 | |
| 205 if(num_bits < bits_left) { | |
| 206 @@ -77,4 +161,5 @@ | |
| 207 } | |
| 208 | |
| 209 return bitstream_get_bh_2(num_bits); | |
| 210 +#endif | |
| 211 } | |
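
In the ALT reader above, `indx` is an absolute bit position: each read loads the four bytes starting at byte `indx>>3`, interprets them big-endian (`swab32(unaligned32(...))` on little-endian hosts), shifts off the `indx&7` bits already consumed, and keeps the top `num_bits`. A portable restatement of that extraction, with a byte-wise load standing in for the unaligned access:

```c
/* Portable restatement of the ALT_BITSTREAM_READER extraction above.
 * Composing the 32-bit window byte by byte replaces unaligned32() +
 * swab32(); the shift logic is identical. As in the original, num_bits
 * must be nonzero and the field must fit in the 32-bit window. */
#include <stdint.h>
#include <stdio.h>

static uint32_t get_bits(const uint8_t *buf, int *indx, uint32_t num_bits)
{
    const uint8_t *p = buf + (*indx >> 3);        /* byte with current bit */
    uint32_t w = ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16) |
                 ((uint32_t) p[2] <<  8) |  (uint32_t) p[3];
    w <<= (*indx & 7);         /* drop bits already consumed */
    w >>= 32 - num_bits;       /* keep only the requested field */
    *indx += num_bits;
    return w;
}

int main(void)
{
    /* 0x0B77 is the A/52 sync word; a 16-bit read should recover it. */
    const uint8_t frame[8] = { 0x0B, 0x77, 0x12, 0x34, 0, 0, 0, 0 };
    int indx = 0;
    printf("0x%04X\n", get_bits(frame, &indx, 16));    /* prints 0x0B77 */
    return 0;
}
```
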
| 212 --- liba52/downmix.c 2005-03-22 19:59:35.000000000 +0100 | |
| 213 +++ downmix.c 2004-04-12 18:42:14.000000000 +0200 | |
| 214 @@ -17,18 +23,46 @@ | |
| 215 * You should have received a copy of the GNU General Public License | |
| 216 * along with this program; if not, write to the Free Software | |
| 217 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 218 + * | |
| 219 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | |
| 220 */ | |
| 221 | |
| 222 #include "config.h" | |
| 223 | |
| 224 -#include <inttypes.h> | |
| 225 #include <string.h> | |
| 226 +#include <inttypes.h> | |
| 227 | |
| 228 #include "a52.h" | |
| 229 #include "a52_internal.h" | |
| 230 +#include "mm_accel.h" | |
| 231 | |
| 232 #define CONVERT(acmod,output) (((output) << 3) + (acmod)) | |
| 233 | |
| 234 + | |
| 235 +void (*downmix)(sample_t * samples, int acmod, int output, sample_t bias, | |
| 236 + sample_t clev, sample_t slev)= NULL; | |
| 237 +void (*upmix)(sample_t * samples, int acmod, int output)= NULL; | |
| 238 + | |
| 239 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, | |
| 240 + sample_t clev, sample_t slev); | |
| 241 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, | |
| 242 + sample_t clev, sample_t slev); | |
| 243 +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, | |
| 244 + sample_t clev, sample_t slev); | |
| 245 +static void upmix_MMX (sample_t * samples, int acmod, int output); | |
| 246 +static void upmix_C (sample_t * samples, int acmod, int output); | |
| 247 + | |
| 248 +void downmix_accel_init(uint32_t mm_accel) | |
| 249 +{ | |
| 250 + upmix= upmix_C; | |
| 251 + downmix= downmix_C; | |
| 252 +#ifdef ARCH_X86 | |
| 253 + if(mm_accel & MM_ACCEL_X86_MMX) upmix= upmix_MMX; | |
| 254 + if(mm_accel & MM_ACCEL_X86_SSE) downmix= downmix_SSE; | |
| 255 + if(mm_accel & MM_ACCEL_X86_3DNOW) downmix= downmix_3dnow; | |
| 256 +#endif | |
| 257 +} | |
| 258 + | |
| 259 int downmix_init (int input, int flags, sample_t * level, | |
| 260 sample_t clev, sample_t slev) | |
| 261 { | |
| 262 @@ -61,7 +95,7 @@ | |
| 263 output = flags & A52_CHANNEL_MASK; | |
| 264 if (output > A52_DOLBY) | |
| 265 return -1; | |
| 266 - | |
| 267 + | |
| 268 output = table[output][input & 7]; | |
| 269 | |
| 270 if ((output == A52_STEREO) && | |
| 271 @@ -145,7 +179,6 @@ | |
| 272 *level *= 1 / (1 + 3 * LEVEL_3DB); | |
| 273 break; | |
| 274 } | |
| 275 - | |
| 276 return output; | |
| 277 } | |
| 278 | |
| 279 @@ -440,12 +473,11 @@ | |
| 280 static void zero (sample_t * samples) | |
| 281 { | |
| 282 int i; | |
| 283 - | |
| 284 for (i = 0; i < 256; i++) | |
| 285 samples[i] = 0; | |
| 286 } | |
| 287 | |
| 288 -void downmix (sample_t * samples, int acmod, int output, sample_t bias, | |
| 289 +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias, | |
| 290 sample_t clev, sample_t slev) | |
| 291 { | |
| 292 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
| 293 @@ -557,7 +589,7 @@ | |
| 294 break; | |
| 295 | |
| 296 case CONVERT (A52_3F2R, A52_2F1R): | |
| 297 - mix3to2 (samples, bias); | |
| 298 + mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used) | |
| 299 move2to1 (samples + 768, samples + 512, bias); | |
| 300 break; | |
| 301 | |
| 302 @@ -581,12 +613,12 @@ | |
| 303 break; | |
| 304 | |
| 305 case CONVERT (A52_3F1R, A52_3F2R): | |
| 306 - memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t)); | |
| 307 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
| 308 break; | |
| 309 } | |
| 310 } | |
| 311 | |
| 312 -void upmix (sample_t * samples, int acmod, int output) | |
| 313 +static void upmix_C (sample_t * samples, int acmod, int output) | |
| 314 { | |
| 315 switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
| 316 | |
| 317 @@ -651,3 +683,1137 @@ | |
| 318 goto mix_31to21; | |
| 319 } | |
| 320 } | |
| 321 + | |
| 322 +#ifdef ARCH_X86 | |
| 323 +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) | |
| 324 +{ | |
| 325 + asm volatile( | |
| 326 + "movlps %2, %%xmm7 \n\t" | |
| 327 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 328 + "movl $-1024, %%esi \n\t" | |
| 329 + ".balign 16\n\t" | |
| 330 + "1: \n\t" | |
| 331 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 332 + "movaps 16(%0, %%esi), %%xmm1 \n\t" | |
| 333 + "addps (%1, %%esi), %%xmm0 \n\t" | |
| 334 + "addps 16(%1, %%esi), %%xmm1 \n\t" | |
| 335 + "addps %%xmm7, %%xmm0 \n\t" | |
| 336 + "addps %%xmm7, %%xmm1 \n\t" | |
| 337 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 338 + "movaps %%xmm1, 16(%1, %%esi) \n\t" | |
| 339 + "addl $32, %%esi \n\t" | |
| 340 + " jnz 1b \n\t" | |
| 341 + :: "r" (src+256), "r" (dest+256), "m" (bias) | |
| 342 + : "%esi" | |
| 343 + ); | |
| 344 +} | |
| 345 + | |
| 346 +static void mix3to1_SSE (sample_t * samples, sample_t bias) | |
| 347 +{ | |
| 348 + asm volatile( | |
| 349 + "movlps %1, %%xmm7 \n\t" | |
| 350 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 351 + "movl $-1024, %%esi \n\t" | |
| 352 + ".balign 16\n\t" | |
| 353 + "1: \n\t" | |
| 354 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 355 + "movaps 1024(%0, %%esi), %%xmm1 \n\t" | |
| 356 + "addps 2048(%0, %%esi), %%xmm0 \n\t" | |
| 357 + "addps %%xmm7, %%xmm1 \n\t" | |
| 358 + "addps %%xmm1, %%xmm0 \n\t" | |
| 359 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
| 360 + "addl $16, %%esi \n\t" | |
| 361 + " jnz 1b \n\t" | |
| 362 + :: "r" (samples+256), "m" (bias) | |
| 363 + : "%esi" | |
| 364 + ); | |
| 365 +} | |
| 366 + | |
| 367 +static void mix4to1_SSE (sample_t * samples, sample_t bias) | |
| 368 +{ | |
| 369 + asm volatile( | |
| 370 + "movlps %1, %%xmm7 \n\t" | |
| 371 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 372 + "movl $-1024, %%esi \n\t" | |
| 373 + ".balign 16\n\t" | |
| 374 + "1: \n\t" | |
| 375 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 376 + "movaps 1024(%0, %%esi), %%xmm1 \n\t" | |
| 377 + "addps 2048(%0, %%esi), %%xmm0 \n\t" | |
| 378 + "addps 3072(%0, %%esi), %%xmm1 \n\t" | |
| 379 + "addps %%xmm7, %%xmm0 \n\t" | |
| 380 + "addps %%xmm1, %%xmm0 \n\t" | |
| 381 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
| 382 + "addl $16, %%esi \n\t" | |
| 383 + " jnz 1b \n\t" | |
| 384 + :: "r" (samples+256), "m" (bias) | |
| 385 + : "%esi" | |
| 386 + ); | |
| 387 +} | |
| 388 + | |
| 389 +static void mix5to1_SSE (sample_t * samples, sample_t bias) | |
| 390 +{ | |
| 391 + asm volatile( | |
| 392 + "movlps %1, %%xmm7 \n\t" | |
| 393 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 394 + "movl $-1024, %%esi \n\t" | |
| 395 + ".balign 16\n\t" | |
| 396 + "1: \n\t" | |
| 397 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 398 + "movaps 1024(%0, %%esi), %%xmm1 \n\t" | |
| 399 + "addps 2048(%0, %%esi), %%xmm0 \n\t" | |
| 400 + "addps 3072(%0, %%esi), %%xmm1 \n\t" | |
| 401 + "addps %%xmm7, %%xmm0 \n\t" | |
| 402 + "addps 4096(%0, %%esi), %%xmm1 \n\t" | |
| 403 + "addps %%xmm1, %%xmm0 \n\t" | |
| 404 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
| 405 + "addl $16, %%esi \n\t" | |
| 406 + " jnz 1b \n\t" | |
| 407 + :: "r" (samples+256), "m" (bias) | |
| 408 + : "%esi" | |
| 409 + ); | |
| 410 +} | |
| 411 + | |
| 412 +static void mix3to2_SSE (sample_t * samples, sample_t bias) | |
| 413 +{ | |
| 414 + asm volatile( | |
| 415 + "movlps %1, %%xmm7 \n\t" | |
| 416 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 417 + "movl $-1024, %%esi \n\t" | |
| 418 + ".balign 16\n\t" | |
| 419 + "1: \n\t" | |
| 420 + "movaps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 421 + "addps %%xmm7, %%xmm0 \n\t" //common | |
| 422 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 423 + "movaps 2048(%0, %%esi), %%xmm2 \n\t" | |
| 424 + "addps %%xmm0, %%xmm1 \n\t" | |
| 425 + "addps %%xmm0, %%xmm2 \n\t" | |
| 426 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 427 + "movaps %%xmm2, 1024(%0, %%esi) \n\t" | |
| 428 + "addl $16, %%esi \n\t" | |
| 429 + " jnz 1b \n\t" | |
| 430 + :: "r" (samples+256), "m" (bias) | |
| 431 + : "%esi" | |
| 432 + ); | |
| 433 +} | |
| 434 + | |
| 435 +static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) | |
| 436 +{ | |
| 437 + asm volatile( | |
| 438 + "movlps %2, %%xmm7 \n\t" | |
| 439 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 440 + "movl $-1024, %%esi \n\t" | |
| 441 + ".balign 16\n\t" | |
| 442 + "1: \n\t" | |
| 443 + "movaps 1024(%1, %%esi), %%xmm0 \n\t" | |
| 444 + "addps %%xmm7, %%xmm0 \n\t" //common | |
| 445 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 446 + "movaps (%1, %%esi), %%xmm2 \n\t" | |
| 447 + "addps %%xmm0, %%xmm1 \n\t" | |
| 448 + "addps %%xmm0, %%xmm2 \n\t" | |
| 449 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 450 + "movaps %%xmm2, (%1, %%esi) \n\t" | |
| 451 + "addl $16, %%esi \n\t" | |
| 452 + " jnz 1b \n\t" | |
| 453 + :: "r" (left+256), "r" (right+256), "m" (bias) | |
| 454 + : "%esi" | |
| 455 + ); | |
| 456 +} | |
| 457 + | |
| 458 +static void mix21toS_SSE (sample_t * samples, sample_t bias) | |
| 459 +{ | |
| 460 + asm volatile( | |
| 461 + "movlps %1, %%xmm7 \n\t" | |
| 462 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 463 + "movl $-1024, %%esi \n\t" | |
| 464 + ".balign 16\n\t" | |
| 465 + "1: \n\t" | |
| 466 + "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround | |
| 467 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 468 + "movaps 1024(%0, %%esi), %%xmm2 \n\t" | |
| 469 + "addps %%xmm7, %%xmm1 \n\t" | |
| 470 + "addps %%xmm7, %%xmm2 \n\t" | |
| 471 + "subps %%xmm0, %%xmm1 \n\t" | |
| 472 + "addps %%xmm0, %%xmm2 \n\t" | |
| 473 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 474 + "movaps %%xmm2, 1024(%0, %%esi) \n\t" | |
| 475 + "addl $16, %%esi \n\t" | |
| 476 + " jnz 1b \n\t" | |
| 477 + :: "r" (samples+256), "m" (bias) | |
| 478 + : "%esi" | |
| 479 + ); | |
| 480 +} | |
| 481 + | |
| 482 +static void mix31to2_SSE (sample_t * samples, sample_t bias) | |
| 483 +{ | |
| 484 + asm volatile( | |
| 485 + "movlps %1, %%xmm7 \n\t" | |
| 486 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 487 + "movl $-1024, %%esi \n\t" | |
| 488 + ".balign 16\n\t" | |
| 489 + "1: \n\t" | |
| 490 + "movaps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 491 + "addps 3072(%0, %%esi), %%xmm0 \n\t" | |
| 492 + "addps %%xmm7, %%xmm0 \n\t" // common | |
| 493 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 494 + "movaps 2048(%0, %%esi), %%xmm2 \n\t" | |
| 495 + "addps %%xmm0, %%xmm1 \n\t" | |
| 496 + "addps %%xmm0, %%xmm2 \n\t" | |
| 497 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 498 + "movaps %%xmm2, 1024(%0, %%esi) \n\t" | |
| 499 + "addl $16, %%esi \n\t" | |
| 500 + " jnz 1b \n\t" | |
| 501 + :: "r" (samples+256), "m" (bias) | |
| 502 + : "%esi" | |
| 503 + ); | |
| 504 +} | |
| 505 + | |
| 506 +static void mix31toS_SSE (sample_t * samples, sample_t bias) | |
| 507 +{ | |
| 508 + asm volatile( | |
| 509 + "movlps %1, %%xmm7 \n\t" | |
| 510 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 511 + "movl $-1024, %%esi \n\t" | |
| 512 + ".balign 16\n\t" | |
| 513 + "1: \n\t" | |
| 514 + "movaps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 515 + "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround | |
| 516 + "addps %%xmm7, %%xmm0 \n\t" // common | |
| 517 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 518 + "movaps 2048(%0, %%esi), %%xmm2 \n\t" | |
| 519 + "addps %%xmm0, %%xmm1 \n\t" | |
| 520 + "addps %%xmm0, %%xmm2 \n\t" | |
| 521 + "subps %%xmm3, %%xmm1 \n\t" | |
| 522 + "addps %%xmm3, %%xmm2 \n\t" | |
| 523 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 524 + "movaps %%xmm2, 1024(%0, %%esi) \n\t" | |
| 525 + "addl $16, %%esi \n\t" | |
| 526 + " jnz 1b \n\t" | |
| 527 + :: "r" (samples+256), "m" (bias) | |
| 528 + : "%esi" | |
| 529 + ); | |
| 530 +} | |
| 531 + | |
| 532 +static void mix22toS_SSE (sample_t * samples, sample_t bias) | |
| 533 +{ | |
| 534 + asm volatile( | |
| 535 + "movlps %1, %%xmm7 \n\t" | |
| 536 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 537 + "movl $-1024, %%esi \n\t" | |
| 538 + ".balign 16\n\t" | |
| 539 + "1: \n\t" | |
| 540 + "movaps 2048(%0, %%esi), %%xmm0 \n\t" | |
| 541 + "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround | |
| 542 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 543 + "movaps 1024(%0, %%esi), %%xmm2 \n\t" | |
| 544 + "addps %%xmm7, %%xmm1 \n\t" | |
| 545 + "addps %%xmm7, %%xmm2 \n\t" | |
| 546 + "subps %%xmm0, %%xmm1 \n\t" | |
| 547 + "addps %%xmm0, %%xmm2 \n\t" | |
| 548 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 549 + "movaps %%xmm2, 1024(%0, %%esi) \n\t" | |
| 550 + "addl $16, %%esi \n\t" | |
| 551 + " jnz 1b \n\t" | |
| 552 + :: "r" (samples+256), "m" (bias) | |
| 553 + : "%esi" | |
| 554 + ); | |
| 555 +} | |
| 556 + | |
| 557 +static void mix32to2_SSE (sample_t * samples, sample_t bias) | |
| 558 +{ | |
| 559 + asm volatile( | |
| 560 + "movlps %1, %%xmm7 \n\t" | |
| 561 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 562 + "movl $-1024, %%esi \n\t" | |
| 563 + ".balign 16\n\t" | |
| 564 + "1: \n\t" | |
| 565 + "movaps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 566 + "addps %%xmm7, %%xmm0 \n\t" // common | |
| 567 + "movaps %%xmm0, %%xmm1 \n\t" // common | |
| 568 + "addps (%0, %%esi), %%xmm0 \n\t" | |
| 569 + "addps 2048(%0, %%esi), %%xmm1 \n\t" | |
| 570 + "addps 3072(%0, %%esi), %%xmm0 \n\t" | |
| 571 + "addps 4096(%0, %%esi), %%xmm1 \n\t" | |
| 572 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
| 573 + "movaps %%xmm1, 1024(%0, %%esi) \n\t" | |
| 574 + "addl $16, %%esi \n\t" | |
| 575 + " jnz 1b \n\t" | |
| 576 + :: "r" (samples+256), "m" (bias) | |
| 577 + : "%esi" | |
| 578 + ); | |
| 579 +} | |
| 580 + | |
| 581 +static void mix32toS_SSE (sample_t * samples, sample_t bias) | |
| 582 +{ | |
| 583 + asm volatile( | |
| 584 + "movlps %1, %%xmm7 \n\t" | |
| 585 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 586 + "movl $-1024, %%esi \n\t" | |
| 587 + ".balign 16\n\t" | |
| 588 + "1: \n\t" | |
| 589 + "movaps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 590 + "movaps 3072(%0, %%esi), %%xmm2 \n\t" | |
| 591 + "addps %%xmm7, %%xmm0 \n\t" // common | |
| 592 + "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround | |
| 593 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 594 + "movaps 2048(%0, %%esi), %%xmm3 \n\t" | |
| 595 + "subps %%xmm2, %%xmm1 \n\t" | |
| 596 + "addps %%xmm2, %%xmm3 \n\t" | |
| 597 + "addps %%xmm0, %%xmm1 \n\t" | |
| 598 + "addps %%xmm0, %%xmm3 \n\t" | |
| 599 + "movaps %%xmm1, (%0, %%esi) \n\t" | |
| 600 + "movaps %%xmm3, 1024(%0, %%esi) \n\t" | |
| 601 + "addl $16, %%esi \n\t" | |
| 602 + " jnz 1b \n\t" | |
| 603 + :: "r" (samples+256), "m" (bias) | |
| 604 + : "%esi" | |
| 605 + ); | |
| 606 +} | |
| 607 + | |
| 608 +static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) | |
| 609 +{ | |
| 610 + asm volatile( | |
| 611 + "movlps %2, %%xmm7 \n\t" | |
| 612 + "shufps $0x00, %%xmm7, %%xmm7 \n\t" | |
| 613 + "movl $-1024, %%esi \n\t" | |
| 614 + ".balign 16\n\t" | |
| 615 + "1: \n\t" | |
| 616 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 617 + "movaps 16(%0, %%esi), %%xmm1 \n\t" | |
| 618 + "addps 1024(%0, %%esi), %%xmm0 \n\t" | |
| 619 + "addps 1040(%0, %%esi), %%xmm1 \n\t" | |
| 620 + "addps %%xmm7, %%xmm0 \n\t" | |
| 621 + "addps %%xmm7, %%xmm1 \n\t" | |
| 622 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 623 + "movaps %%xmm1, 16(%1, %%esi) \n\t" | |
| 624 + "addl $32, %%esi \n\t" | |
| 625 + " jnz 1b \n\t" | |
| 626 + :: "r" (src+256), "r" (dest+256), "m" (bias) | |
| 627 + : "%esi" | |
| 628 + ); | |
| 629 +} | |
| 630 + | |
| 631 +static void zero_MMX(sample_t * samples) | |
| 632 +{ | |
| 633 + asm volatile( | |
| 634 + "movl $-1024, %%esi \n\t" | |
| 635 + "pxor %%mm0, %%mm0 \n\t" | |
| 636 + ".balign 16\n\t" | |
| 637 + "1: \n\t" | |
| 638 + "movq %%mm0, (%0, %%esi) \n\t" | |
| 639 + "movq %%mm0, 8(%0, %%esi) \n\t" | |
| 640 + "movq %%mm0, 16(%0, %%esi) \n\t" | |
| 641 + "movq %%mm0, 24(%0, %%esi) \n\t" | |
| 642 + "addl $32, %%esi \n\t" | |
| 643 + " jnz 1b \n\t" | |
| 644 + "emms" | |
| 645 + :: "r" (samples+256) | |
| 646 + : "%esi" | |
| 647 + ); | |
| 648 +} | |
| 649 + | |
| 650 +/* | |
| 651 + dest and src are assumed to be at least 8-byte aligned and size | |
| 652 + a multiple of 64 (the loop copies 64 bytes per iteration) | |
| 653 + Note: untested and unused. | |
| 654 +*/ | |
| 655 +static void copy_MMX(void *dest,const void *src,unsigned size) | |
| 656 +{ | |
| 657 + unsigned i; | |
| 658 + size /= 64; | |
| 659 + for(i=0;i<size;i++) | |
| 660 + { | |
| 661 + __asm __volatile( | |
| 662 + "movq %0, %%mm0\n\t" | |
| 663 + "movq 8%0, %%mm1\n\t" | |
| 664 + "movq 16%0, %%mm2\n\t" | |
| 665 + "movq 24%0, %%mm3\n\t" | |
| 666 + "movq 32%0, %%mm4\n\t" | |
| 667 + "movq 40%0, %%mm5\n\t" | |
| 668 + "movq 48%0, %%mm6\n\t" | |
| 669 + "movq 56%0, %%mm7\n\t" | |
| 670 + "movq %%mm0, %1\n\t" | |
| 671 + "movq %%mm1, 8%1\n\t" | |
| 672 + "movq %%mm2, 16%1\n\t" | |
| 673 + "movq %%mm3, 24%1\n\t" | |
| 674 + "movq %%mm4, 32%1\n\t" | |
| 675 + "movq %%mm5, 40%1\n\t" | |
| 676 + "movq %%mm6, 48%1\n\t" | |
| 677 + "movq %%mm7, 56%1\n\t" | |
| 678 + : | |
| 679 + :"m"(src),"m"(dest)); | |
| 680 + } | |
| 681 +} | |
| 682 + | |
| 683 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, | |
| 684 + sample_t clev, sample_t slev) | |
| 685 +{ | |
| 686 + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
| 687 + | |
| 688 + case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
| 689 + memcpy (samples, samples + 256, 256 * sizeof (sample_t)); | |
| 690 + break; | |
| 691 + | |
| 692 + case CONVERT (A52_CHANNEL, A52_MONO): | |
| 693 + case CONVERT (A52_STEREO, A52_MONO): | |
| 694 + mix_2to1_SSE: | |
| 695 + mix2to1_SSE (samples, samples + 256, bias); | |
| 696 + break; | |
| 697 + | |
| 698 + case CONVERT (A52_2F1R, A52_MONO): | |
| 699 + if (slev == 0) | |
| 700 + goto mix_2to1_SSE; | |
| 701 + case CONVERT (A52_3F, A52_MONO): | |
| 702 + mix_3to1_SSE: | |
| 703 + mix3to1_SSE (samples, bias); | |
| 704 + break; | |
| 705 + | |
| 706 + case CONVERT (A52_3F1R, A52_MONO): | |
| 707 + if (slev == 0) | |
| 708 + goto mix_3to1_SSE; | |
| 709 + case CONVERT (A52_2F2R, A52_MONO): | |
| 710 + if (slev == 0) | |
| 711 + goto mix_2to1_SSE; | |
| 712 + mix4to1_SSE (samples, bias); | |
| 713 + break; | |
| 714 + | |
| 715 + case CONVERT (A52_3F2R, A52_MONO): | |
| 716 + if (slev == 0) | |
| 717 + goto mix_3to1_SSE; | |
| 718 + mix5to1_SSE (samples, bias); | |
| 719 + break; | |
| 720 + | |
| 721 + case CONVERT (A52_MONO, A52_DOLBY): | |
| 722 + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
| 723 + break; | |
| 724 + | |
| 725 + case CONVERT (A52_3F, A52_STEREO): | |
| 726 + case CONVERT (A52_3F, A52_DOLBY): | |
| 727 + mix_3to2_SSE: | |
| 728 + mix3to2_SSE (samples, bias); | |
| 729 + break; | |
| 730 + | |
| 731 + case CONVERT (A52_2F1R, A52_STEREO): | |
| 732 + if (slev == 0) | |
| 733 + break; | |
| 734 + mix21to2_SSE (samples, samples + 256, bias); | |
| 735 + break; | |
| 736 + | |
| 737 + case CONVERT (A52_2F1R, A52_DOLBY): | |
| 738 + mix21toS_SSE (samples, bias); | |
| 739 + break; | |
| 740 + | |
| 741 + case CONVERT (A52_3F1R, A52_STEREO): | |
| 742 + if (slev == 0) | |
| 743 + goto mix_3to2_SSE; | |
| 744 + mix31to2_SSE (samples, bias); | |
| 745 + break; | |
| 746 + | |
| 747 + case CONVERT (A52_3F1R, A52_DOLBY): | |
| 748 + mix31toS_SSE (samples, bias); | |
| 749 + break; | |
| 750 + | |
| 751 + case CONVERT (A52_2F2R, A52_STEREO): | |
| 752 + if (slev == 0) | |
| 753 + break; | |
| 754 + mix2to1_SSE (samples, samples + 512, bias); | |
| 755 + mix2to1_SSE (samples + 256, samples + 768, bias); | |
| 756 + break; | |
| 757 + | |
| 758 + case CONVERT (A52_2F2R, A52_DOLBY): | |
| 759 + mix22toS_SSE (samples, bias); | |
| 760 + break; | |
| 761 + | |
| 762 + case CONVERT (A52_3F2R, A52_STEREO): | |
| 763 + if (slev == 0) | |
| 764 + goto mix_3to2_SSE; | |
| 765 + mix32to2_SSE (samples, bias); | |
| 766 + break; | |
| 767 + | |
| 768 + case CONVERT (A52_3F2R, A52_DOLBY): | |
| 769 + mix32toS_SSE (samples, bias); | |
| 770 + break; | |
| 771 + | |
| 772 + case CONVERT (A52_3F1R, A52_3F): | |
| 773 + if (slev == 0) | |
| 774 + break; | |
| 775 + mix21to2_SSE (samples, samples + 512, bias); | |
| 776 + break; | |
| 777 + | |
| 778 + case CONVERT (A52_3F2R, A52_3F): | |
| 779 + if (slev == 0) | |
| 780 + break; | |
| 781 + mix2to1_SSE (samples, samples + 768, bias); | |
| 782 + mix2to1_SSE (samples + 512, samples + 1024, bias); | |
| 783 + break; | |
| 784 + | |
| 785 + case CONVERT (A52_3F1R, A52_2F1R): | |
| 786 + mix3to2_SSE (samples, bias); | |
| 787 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 788 + break; | |
| 789 + | |
| 790 + case CONVERT (A52_2F2R, A52_2F1R): | |
| 791 + mix2to1_SSE (samples + 512, samples + 768, bias); | |
| 792 + break; | |
| 793 + | |
| 794 + case CONVERT (A52_3F2R, A52_2F1R): | |
| 795 + mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesn't seem to be used) | |
| 796 + move2to1_SSE (samples + 768, samples + 512, bias); | |
| 797 + break; | |
| 798 + | |
| 799 + case CONVERT (A52_3F2R, A52_3F1R): | |
| 800 + mix2to1_SSE (samples + 768, samples + 1024, bias); | |
| 801 + break; | |
| 802 + | |
| 803 + case CONVERT (A52_2F1R, A52_2F2R): | |
| 804 + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
| 805 + break; | |
| 806 + | |
| 807 + case CONVERT (A52_3F1R, A52_2F2R): | |
| 808 + mix3to2_SSE (samples, bias); | |
| 809 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 810 + break; | |
| 811 + | |
| 812 + case CONVERT (A52_3F2R, A52_2F2R): | |
| 813 + mix3to2_SSE (samples, bias); | |
| 814 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 815 + memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); | |
| 816 + break; | |
| 817 + | |
| 818 + case CONVERT (A52_3F1R, A52_3F2R): | |
| 819 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
| 820 + break; | |
| 821 + } | |
| 822 +} | |
| 823 + | |
| 824 +static void upmix_MMX (sample_t * samples, int acmod, int output) | |
| 825 +{ | |
| 826 + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
| 827 + | |
| 828 + case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
| 829 + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
| 830 + break; | |
| 831 + | |
| 832 + case CONVERT (A52_3F2R, A52_MONO): | |
| 833 + zero_MMX (samples + 1024); | |
| 834 + case CONVERT (A52_3F1R, A52_MONO): | |
| 835 + case CONVERT (A52_2F2R, A52_MONO): | |
| 836 + zero_MMX (samples + 768); | |
| 837 + case CONVERT (A52_3F, A52_MONO): | |
| 838 + case CONVERT (A52_2F1R, A52_MONO): | |
| 839 + zero_MMX (samples + 512); | |
| 840 + case CONVERT (A52_CHANNEL, A52_MONO): | |
| 841 + case CONVERT (A52_STEREO, A52_MONO): | |
| 842 + zero_MMX (samples + 256); | |
| 843 + break; | |
| 844 + | |
| 845 + case CONVERT (A52_3F2R, A52_STEREO): | |
| 846 + case CONVERT (A52_3F2R, A52_DOLBY): | |
| 847 + zero_MMX (samples + 1024); | |
| 848 + case CONVERT (A52_3F1R, A52_STEREO): | |
| 849 + case CONVERT (A52_3F1R, A52_DOLBY): | |
| 850 + zero_MMX (samples + 768); | |
| 851 + case CONVERT (A52_3F, A52_STEREO): | |
| 852 + case CONVERT (A52_3F, A52_DOLBY): | |
| 853 + mix_3to2_MMX: | |
| 854 + memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); | |
| 855 + zero_MMX (samples + 256); | |
| 856 + break; | |
| 857 + | |
| 858 + case CONVERT (A52_2F2R, A52_STEREO): | |
| 859 + case CONVERT (A52_2F2R, A52_DOLBY): | |
| 860 + zero_MMX (samples + 768); | |
| 861 + case CONVERT (A52_2F1R, A52_STEREO): | |
| 862 + case CONVERT (A52_2F1R, A52_DOLBY): | |
| 863 + zero_MMX (samples + 512); | |
| 864 + break; | |
| 865 + | |
| 866 + case CONVERT (A52_3F2R, A52_3F): | |
| 867 + zero_MMX (samples + 1024); | |
| 868 + case CONVERT (A52_3F1R, A52_3F): | |
| 869 + case CONVERT (A52_2F2R, A52_2F1R): | |
| 870 + zero_MMX (samples + 768); | |
| 871 + break; | |
| 872 + | |
| 873 + case CONVERT (A52_3F2R, A52_3F1R): | |
| 874 + zero_MMX (samples + 1024); | |
| 875 + break; | |
| 876 + | |
| 877 + case CONVERT (A52_3F2R, A52_2F1R): | |
| 878 + zero_MMX (samples + 1024); | |
| 879 + case CONVERT (A52_3F1R, A52_2F1R): | |
| 880 + mix_31to21_MMX: | |
| 881 + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
| 882 + goto mix_3to2_MMX; | |
| 883 + | |
| 884 + case CONVERT (A52_3F2R, A52_2F2R): | |
| 885 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
| 886 + goto mix_31to21_MMX; | |
| 887 + } | |
| 888 +} | |
| 889 + | |
| 890 +static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) | |
| 891 +{ | |
| 892 + asm volatile( | |
| 893 + "movd %2, %%mm7 \n\t" | |
| 894 + "punpckldq %2, %%mm7 \n\t" | |
| 895 + "movl $-1024, %%esi \n\t" | |
| 896 + ".balign 16\n\t" | |
| 897 + "1: \n\t" | |
| 898 + "movq (%0, %%esi), %%mm0 \n\t" | |
| 899 + "movq 8(%0, %%esi), %%mm1 \n\t" | |
| 900 + "movq 16(%0, %%esi), %%mm2 \n\t" | |
| 901 + "movq 24(%0, %%esi), %%mm3 \n\t" | |
| 902 + "pfadd (%1, %%esi), %%mm0 \n\t" | |
| 903 + "pfadd 8(%1, %%esi), %%mm1 \n\t" | |
| 904 + "pfadd 16(%1, %%esi), %%mm2 \n\t" | |
| 905 + "pfadd 24(%1, %%esi), %%mm3 \n\t" | |
| 906 + "pfadd %%mm7, %%mm0 \n\t" | |
| 907 + "pfadd %%mm7, %%mm1 \n\t" | |
| 908 + "pfadd %%mm7, %%mm2 \n\t" | |
| 909 + "pfadd %%mm7, %%mm3 \n\t" | |
| 910 + "movq %%mm0, (%1, %%esi) \n\t" | |
| 911 + "movq %%mm1, 8(%1, %%esi) \n\t" | |
| 912 + "movq %%mm2, 16(%1, %%esi) \n\t" | |
| 913 + "movq %%mm3, 24(%1, %%esi) \n\t" | |
| 914 + "addl $32, %%esi \n\t" | |
| 915 + " jnz 1b \n\t" | |
| 916 + :: "r" (src+256), "r" (dest+256), "m" (bias) | |
| 917 + : "%esi" | |
| 918 + ); | |
| 919 +} | |
| 920 + | |
| 921 +static void mix3to1_3dnow (sample_t * samples, sample_t bias) | |
| 922 +{ | |
| 923 + asm volatile( | |
| 924 + "movd %1, %%mm7 \n\t" | |
| 925 + "punpckldq %1, %%mm7 \n\t" | |
| 926 + "movl $-1024, %%esi \n\t" | |
| 927 + ".balign 16\n\t" | |
| 928 + "1: \n\t" | |
| 929 + "movq (%0, %%esi), %%mm0 \n\t" | |
| 930 + "movq 8(%0, %%esi), %%mm1 \n\t" | |
| 931 + "movq 1024(%0, %%esi), %%mm2 \n\t" | |
| 932 + "movq 1032(%0, %%esi), %%mm3 \n\t" | |
| 933 + "pfadd 2048(%0, %%esi), %%mm0 \n\t" | |
| 934 + "pfadd 2056(%0, %%esi), %%mm1 \n\t" | |
| 935 + "pfadd %%mm7, %%mm0 \n\t" | |
| 936 + "pfadd %%mm7, %%mm1 \n\t" | |
| 937 + "pfadd %%mm2, %%mm0 \n\t" | |
| 938 + "pfadd %%mm3, %%mm1 \n\t" | |
| 939 + "movq %%mm0, (%0, %%esi) \n\t" | |
| 940 + "movq %%mm1, 8(%0, %%esi) \n\t" | |
| 941 + "addl $16, %%esi \n\t" | |
| 942 + " jnz 1b \n\t" | |
| 943 + :: "r" (samples+256), "m" (bias) | |
| 944 + : "%esi" | |
| 945 + ); | |
| 946 +} | |
| 947 + | |
| 948 +static void mix4to1_3dnow (sample_t * samples, sample_t bias) | |
| 949 +{ | |
| 950 + asm volatile( | |
| 951 + "movd %1, %%mm7 \n\t" | |
| 952 + "punpckldq %1, %%mm7 \n\t" | |
| 953 + "movl $-1024, %%esi \n\t" | |
| 954 + ".balign 16\n\t" | |
| 955 + "1: \n\t" | |
| 956 + "movq (%0, %%esi), %%mm0 \n\t" | |
| 957 + "movq 8(%0, %%esi), %%mm1 \n\t" | |
| 958 + "movq 1024(%0, %%esi), %%mm2 \n\t" | |
| 959 + "movq 1032(%0, %%esi), %%mm3 \n\t" | |
| 960 + "pfadd 2048(%0, %%esi), %%mm0 \n\t" | |
| 961 + "pfadd 2056(%0, %%esi), %%mm1 \n\t" | |
| 962 + "pfadd 3072(%0, %%esi), %%mm2 \n\t" | |
| 963 + "pfadd 3080(%0, %%esi), %%mm3 \n\t" | |
| 964 + "pfadd %%mm7, %%mm0 \n\t" | |
| 965 + "pfadd %%mm7, %%mm1 \n\t" | |
| 966 + "pfadd %%mm2, %%mm0 \n\t" | |
| 967 + "pfadd %%mm3, %%mm1 \n\t" | |
| 968 + "movq %%mm0, (%0, %%esi) \n\t" | |
| 969 + "movq %%mm1, 8(%0, %%esi) \n\t" | |
| 970 + "addl $16, %%esi \n\t" | |
| 971 + " jnz 1b \n\t" | |
| 972 + :: "r" (samples+256), "m" (bias) | |
| 973 + : "%esi" | |
| 974 + ); | |
| 975 +} | |
| 976 + | |
| 977 +static void mix5to1_3dnow (sample_t * samples, sample_t bias) | |
| 978 +{ | |
| 979 + asm volatile( | |
| 980 + "movd %1, %%mm7 \n\t" | |
| 981 + "punpckldq %1, %%mm7 \n\t" | |
| 982 + "movl $-1024, %%esi \n\t" | |
| 983 + ".balign 16\n\t" | |
| 984 + "1: \n\t" | |
| 985 + "movq (%0, %%esi), %%mm0 \n\t" | |
| 986 + "movq 8(%0, %%esi), %%mm1 \n\t" | |
| 987 + "movq 1024(%0, %%esi), %%mm2 \n\t" | |
| 988 + "movq 1032(%0, %%esi), %%mm3 \n\t" | |
| 989 + "pfadd 2048(%0, %%esi), %%mm0 \n\t" | |
| 990 + "pfadd 2056(%0, %%esi), %%mm1 \n\t" | |
| 991 + "pfadd 3072(%0, %%esi), %%mm2 \n\t" | |
| 992 + "pfadd 3080(%0, %%esi), %%mm3 \n\t" | |
| 993 + "pfadd %%mm7, %%mm0 \n\t" | |
| 994 + "pfadd %%mm7, %%mm1 \n\t" | |
| 995 + "pfadd 4096(%0, %%esi), %%mm2 \n\t" | |
| 996 + "pfadd 4104(%0, %%esi), %%mm3 \n\t" | |
| 997 + "pfadd %%mm2, %%mm0 \n\t" | |
| 998 + "pfadd %%mm3, %%mm1 \n\t" | |
| 999 + "movq %%mm0, (%0, %%esi) \n\t" | |
| 1000 + "movq %%mm1, 8(%0, %%esi) \n\t" | |
| 1001 + "addl $16, %%esi \n\t" | |
| 1002 + " jnz 1b \n\t" | |
| 1003 + :: "r" (samples+256), "m" (bias) | |
| 1004 + : "%esi" | |
| 1005 + ); | |
| 1006 +} | |
| 1007 + | |
| 1008 +static void mix3to2_3dnow (sample_t * samples, sample_t bias) | |
| 1009 +{ | |
| 1010 + asm volatile( | |
| 1011 + "movd %1, %%mm7 \n\t" | |
| 1012 + "punpckldq %1, %%mm7 \n\t" | |
| 1013 + "movl $-1024, %%esi \n\t" | |
| 1014 + ".balign 16\n\t" | |
| 1015 + "1: \n\t" | |
| 1016 + "movq 1024(%0, %%esi), %%mm0 \n\t" | |
| 1017 + "movq 1032(%0, %%esi), %%mm1 \n\t" | |
| 1018 + "pfadd %%mm7, %%mm0 \n\t" //common | |
| 1019 + "pfadd %%mm7, %%mm1 \n\t" //common | |
| 1020 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1021 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1022 + "movq 2048(%0, %%esi), %%mm4 \n\t" | |
| 1023 + "movq 2056(%0, %%esi), %%mm5 \n\t" | |
| 1024 + "pfadd %%mm0, %%mm2 \n\t" | |
| 1025 + "pfadd %%mm1, %%mm3 \n\t" | |
| 1026 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1027 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1028 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1029 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1030 + "movq %%mm4, 1024(%0, %%esi) \n\t" | |
| 1031 + "movq %%mm5, 1032(%0, %%esi) \n\t" | |
| 1032 + "addl $16, %%esi \n\t" | |
| 1033 + " jnz 1b \n\t" | |
| 1034 + :: "r" (samples+256), "m" (bias) | |
| 1035 + : "%esi" | |
| 1036 + ); | |
| 1037 +} | |
| 1038 + | |
| 1039 +static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) | |
| 1040 +{ | |
| 1041 + asm volatile( | |
| 1042 + "movd %2, %%mm7 \n\t" | |
| 1043 + "punpckldq %2, %%mm7 \n\t" | |
| 1044 + "movl $-1024, %%esi \n\t" | |
| 1045 + ".balign 16\n\t" | |
| 1046 + "1: \n\t" | |
| 1047 + "movq 1024(%1, %%esi), %%mm0 \n\t" | |
| 1048 + "movq 1032(%1, %%esi), %%mm1 \n\t" | |
| 1049 + "pfadd %%mm7, %%mm0 \n\t" //common | |
| 1050 + "pfadd %%mm7, %%mm1 \n\t" //common | |
| 1051 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1052 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1053 + "movq (%1, %%esi), %%mm4 \n\t" | |
| 1054 + "movq 8(%1, %%esi), %%mm5 \n\t" | |
| 1055 + "pfadd %%mm0, %%mm2 \n\t" | |
| 1056 + "pfadd %%mm1, %%mm3 \n\t" | |
| 1057 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1058 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1059 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1060 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1061 + "movq %%mm4, (%1, %%esi) \n\t" | |
| 1062 + "movq %%mm5, 8(%1, %%esi) \n\t" | |
| 1063 + "addl $16, %%esi \n\t" | |
| 1064 + " jnz 1b \n\t" | |
| 1065 + :: "r" (left+256), "r" (right+256), "m" (bias) | |
| 1066 + : "%esi" | |
| 1067 + ); | |
| 1068 +} | |
| 1069 + | |
| 1070 +static void mix21toS_3dnow (sample_t * samples, sample_t bias) | |
| 1071 +{ | |
| 1072 + asm volatile( | |
| 1073 + "movd %1, %%mm7 \n\t" | |
| 1074 + "punpckldq %1, %%mm7 \n\t" | |
| 1075 + "movl $-1024, %%esi \n\t" | |
| 1076 + ".balign 16\n\t" | |
| 1077 + "1: \n\t" | |
| 1078 + "movq 2048(%0, %%esi), %%mm0 \n\t" // surround | |
| 1079 + "movq 2056(%0, %%esi), %%mm1 \n\t" // surround | |
| 1080 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1081 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1082 + "movq 1024(%0, %%esi), %%mm4 \n\t" | |
| 1083 + "movq 1032(%0, %%esi), %%mm5 \n\t" | |
| 1084 + "pfadd %%mm7, %%mm2 \n\t" | |
| 1085 + "pfadd %%mm7, %%mm3 \n\t" | |
| 1086 + "pfadd %%mm7, %%mm4 \n\t" | |
| 1087 + "pfadd %%mm7, %%mm5 \n\t" | |
| 1088 + "pfsub %%mm0, %%mm2 \n\t" | |
| 1089 + "pfsub %%mm1, %%mm3 \n\t" | |
| 1090 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1091 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1092 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1093 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1094 + "movq %%mm4, 1024(%0, %%esi) \n\t" | |
| 1095 + "movq %%mm5, 1032(%0, %%esi) \n\t" | |
| 1096 + "addl $16, %%esi \n\t" | |
| 1097 + " jnz 1b \n\t" | |
| 1098 + :: "r" (samples+256), "m" (bias) | |
| 1099 + : "%esi" | |
| 1100 + ); | |
| 1101 +} | |
| 1102 + | |
| 1103 +static void mix31to2_3dnow (sample_t * samples, sample_t bias) | |
| 1104 +{ | |
| 1105 + asm volatile( | |
| 1106 + "movd %1, %%mm7 \n\t" | |
| 1107 + "punpckldq %1, %%mm7 \n\t" | |
| 1108 + "movl $-1024, %%esi \n\t" | |
| 1109 + ".balign 16\n\t" | |
| 1110 + "1: \n\t" | |
| 1111 + "movq 1024(%0, %%esi), %%mm0 \n\t" | |
| 1112 + "movq 1032(%0, %%esi), %%mm1 \n\t" | |
| 1113 + "pfadd 3072(%0, %%esi), %%mm0 \n\t" | |
| 1114 + "pfadd 3080(%0, %%esi), %%mm1 \n\t" | |
| 1115 + "pfadd %%mm7, %%mm0 \n\t" // common | |
| 1116 + "pfadd %%mm7, %%mm1 \n\t" // common | |
| 1117 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1118 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1119 + "movq 2048(%0, %%esi), %%mm4 \n\t" | |
| 1120 + "movq 2056(%0, %%esi), %%mm5 \n\t" | |
| 1121 + "pfadd %%mm0, %%mm2 \n\t" | |
| 1122 + "pfadd %%mm1, %%mm3 \n\t" | |
| 1123 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1124 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1125 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1126 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1127 + "movq %%mm4, 1024(%0, %%esi) \n\t" | |
| 1128 + "movq %%mm5, 1032(%0, %%esi) \n\t" | |
| 1129 + "addl $16, %%esi \n\t" | |
| 1130 + " jnz 1b \n\t" | |
| 1131 + :: "r" (samples+256), "m" (bias) | |
| 1132 + : "%esi" | |
| 1133 + ); | |
| 1134 +} | |
| 1135 + | |
| 1136 +static void mix31toS_3dnow (sample_t * samples, sample_t bias) | |
| 1137 +{ | |
| 1138 + asm volatile( | |
| 1139 + "movd %1, %%mm7 \n\t" | |
| 1140 + "punpckldq %1, %%mm7 \n\t" | |
| 1141 + "movl $-1024, %%esi \n\t" | |
| 1142 + ".balign 16\n\t" | |
| 1143 + "1: \n\t" | |
| 1144 + "movq 1024(%0, %%esi), %%mm0 \n\t" | |
| 1145 + "movq 1032(%0, %%esi), %%mm1 \n\t" | |
| 1146 + "pfadd %%mm7, %%mm0 \n\t" // common | |
| 1147 + "pfadd %%mm7, %%mm1 \n\t" // common | |
| 1148 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1149 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1150 + "movq 2048(%0, %%esi), %%mm4 \n\t" | |
| 1151 + "movq 2056(%0, %%esi), %%mm5 \n\t" | |
| 1152 + "pfadd %%mm0, %%mm2 \n\t" | |
| 1153 + "pfadd %%mm1, %%mm3 \n\t" | |
| 1154 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1155 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1156 + "movq 3072(%0, %%esi), %%mm0 \n\t" // surround | |
| 1157 + "movq 3080(%0, %%esi), %%mm1 \n\t" // surround | |
| 1158 + "pfsub %%mm0, %%mm2 \n\t" | |
| 1159 + "pfsub %%mm1, %%mm3 \n\t" | |
| 1160 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1161 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1162 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1163 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1164 + "movq %%mm4, 1024(%0, %%esi) \n\t" | |
| 1165 + "movq %%mm5, 1032(%0, %%esi) \n\t" | |
| 1166 + "addl $16, %%esi \n\t" | |
| 1167 + " jnz 1b \n\t" | |
| 1168 + :: "r" (samples+256), "m" (bias) | |
| 1169 + : "%esi" | |
| 1170 + ); | |
| 1171 +} | |
| 1172 + | |
| 1173 +static void mix22toS_3dnow (sample_t * samples, sample_t bias) | |
| 1174 +{ | |
| 1175 + asm volatile( | |
| 1176 + "movd %1, %%mm7 \n\t" | |
| 1177 + "punpckldq %1, %%mm7 \n\t" | |
| 1178 + "movl $-1024, %%esi \n\t" | |
| 1179 + ".balign 16\n\t" | |
| 1180 + "1: \n\t" | |
| 1181 + "movq 2048(%0, %%esi), %%mm0 \n\t" | |
| 1182 + "movq 2056(%0, %%esi), %%mm1 \n\t" | |
| 1183 + "pfadd 3072(%0, %%esi), %%mm0 \n\t" // surround | |
| 1184 + "pfadd 3080(%0, %%esi), %%mm1 \n\t" // surround | |
| 1185 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1186 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1187 + "movq 1024(%0, %%esi), %%mm4 \n\t" | |
| 1188 + "movq 1032(%0, %%esi), %%mm5 \n\t" | |
| 1189 + "pfadd %%mm7, %%mm2 \n\t" | |
| 1190 + "pfadd %%mm7, %%mm3 \n\t" | |
| 1191 + "pfadd %%mm7, %%mm4 \n\t" | |
| 1192 + "pfadd %%mm7, %%mm5 \n\t" | |
| 1193 + "pfsub %%mm0, %%mm2 \n\t" | |
| 1194 + "pfsub %%mm1, %%mm3 \n\t" | |
| 1195 + "pfadd %%mm0, %%mm4 \n\t" | |
| 1196 + "pfadd %%mm1, %%mm5 \n\t" | |
| 1197 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1198 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1199 + "movq %%mm4, 1024(%0, %%esi) \n\t" | |
| 1200 + "movq %%mm5, 1032(%0, %%esi) \n\t" | |
| 1201 + "addl $16, %%esi \n\t" | |
| 1202 + " jnz 1b \n\t" | |
| 1203 + :: "r" (samples+256), "m" (bias) | |
| 1204 + : "%esi" | |
| 1205 + ); | |
| 1206 +} | |
| 1207 + | |
| 1208 +static void mix32to2_3dnow (sample_t * samples, sample_t bias) | |
| 1209 +{ | |
| 1210 + asm volatile( | |
| 1211 + "movd %1, %%mm7 \n\t" | |
| 1212 + "punpckldq %1, %%mm7 \n\t" | |
| 1213 + "movl $-1024, %%esi \n\t" | |
| 1214 + ".balign 16\n\t" | |
| 1215 + "1: \n\t" | |
| 1216 + "movq 1024(%0, %%esi), %%mm0 \n\t" | |
| 1217 + "movq 1032(%0, %%esi), %%mm1 \n\t" | |
| 1218 + "pfadd %%mm7, %%mm0 \n\t" // common | |
| 1219 + "pfadd %%mm7, %%mm1 \n\t" // common | |
| 1220 + "movq %%mm0, %%mm2 \n\t" // common | |
| 1221 + "movq %%mm1, %%mm3 \n\t" // common | |
| 1222 + "pfadd (%0, %%esi), %%mm0 \n\t" | |
| 1223 + "pfadd 8(%0, %%esi), %%mm1 \n\t" | |
| 1224 + "pfadd 2048(%0, %%esi), %%mm2 \n\t" | |
| 1225 + "pfadd 2056(%0, %%esi), %%mm3 \n\t" | |
| 1226 + "pfadd 3072(%0, %%esi), %%mm0 \n\t" | |
| 1227 + "pfadd 3080(%0, %%esi), %%mm1 \n\t" | |
| 1228 + "pfadd 4096(%0, %%esi), %%mm2 \n\t" | |
| 1229 + "pfadd 4104(%0, %%esi), %%mm3 \n\t" | |
| 1230 + "movq %%mm0, (%0, %%esi) \n\t" | |
| 1231 + "movq %%mm1, 8(%0, %%esi) \n\t" | |
| 1232 + "movq %%mm2, 1024(%0, %%esi) \n\t" | |
| 1233 + "movq %%mm3, 1032(%0, %%esi) \n\t" | |
| 1234 + "addl $16, %%esi \n\t" | |
| 1235 + " jnz 1b \n\t" | |
| 1236 + :: "r" (samples+256), "m" (bias) | |
| 1237 + : "%esi" | |
| 1238 + ); | |
| 1239 +} | |
| 1240 + | |
| 1241 +/* todo: should be optimized better */ | |
| 1242 +static void mix32toS_3dnow (sample_t * samples, sample_t bias) | |
| 1243 +{ | |
| 1244 + asm volatile( | |
| 1245 + "movl $-1024, %%esi \n\t" | |
| 1246 + ".balign 16\n\t" | |
| 1247 + "1: \n\t" | |
| 1248 + "movd %1, %%mm7 \n\t" | |
| 1249 + "punpckldq %1, %%mm7 \n\t" | |
| 1250 + "movq 1024(%0, %%esi), %%mm0 \n\t" | |
| 1251 + "movq 1032(%0, %%esi), %%mm1 \n\t" | |
| 1252 + "movq 3072(%0, %%esi), %%mm4 \n\t" | |
| 1253 + "movq 3080(%0, %%esi), %%mm5 \n\t" | |
| 1254 + "pfadd %%mm7, %%mm0 \n\t" // common | |
| 1255 + "pfadd %%mm7, %%mm1 \n\t" // common | |
| 1256 + "pfadd 4096(%0, %%esi), %%mm4 \n\t" // surround | |
| 1257 + "pfadd 4104(%0, %%esi), %%mm5 \n\t" // surround | |
| 1258 + "movq (%0, %%esi), %%mm2 \n\t" | |
| 1259 + "movq 8(%0, %%esi), %%mm3 \n\t" | |
| 1260 + "movq 2048(%0, %%esi), %%mm6 \n\t" | |
| 1261 + "movq 2056(%0, %%esi), %%mm7 \n\t" | |
| 1262 + "pfsub %%mm4, %%mm2 \n\t" | |
| 1263 + "pfsub %%mm5, %%mm3 \n\t" | |
| 1264 + "pfadd %%mm4, %%mm6 \n\t" | |
| 1265 + "pfadd %%mm5, %%mm7 \n\t" | |
| 1266 + "pfadd %%mm0, %%mm2 \n\t" | |
| 1267 + "pfadd %%mm1, %%mm3 \n\t" | |
| 1268 + "pfadd %%mm0, %%mm6 \n\t" | |
| 1269 + "pfadd %%mm1, %%mm7 \n\t" | |
| 1270 + "movq %%mm2, (%0, %%esi) \n\t" | |
| 1271 + "movq %%mm3, 8(%0, %%esi) \n\t" | |
| 1272 + "movq %%mm6, 1024(%0, %%esi) \n\t" | |
| 1273 + "movq %%mm7, 1032(%0, %%esi) \n\t" | |
| 1274 + "addl $16, %%esi \n\t" | |
| 1275 + " jnz 1b \n\t" | |
| 1276 + :: "r" (samples+256), "m" (bias) | |
| 1277 + : "%esi" | |
| 1278 + ); | |
| 1279 +} | |
| 1280 + | |
| 1281 +static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) | |
| 1282 +{ | |
| 1283 + asm volatile( | |
| 1284 + "movd %2, %%mm7 \n\t" | |
| 1285 + "punpckldq %2, %%mm7 \n\t" | |
| 1286 + "movl $-1024, %%esi \n\t" | |
| 1287 + ".balign 16\n\t" | |
| 1288 + "1: \n\t" | |
| 1289 + "movq (%0, %%esi), %%mm0 \n\t" | |
| 1290 + "movq 8(%0, %%esi), %%mm1 \n\t" | |
| 1291 + "movq 16(%0, %%esi), %%mm2 \n\t" | |
| 1292 + "movq 24(%0, %%esi), %%mm3 \n\t" | |
| 1293 + "pfadd 1024(%0, %%esi), %%mm0 \n\t" | |
| 1294 + "pfadd 1032(%0, %%esi), %%mm1 \n\t" | |
| 1295 + "pfadd 1040(%0, %%esi), %%mm2 \n\t" | |
| 1296 + "pfadd 1048(%0, %%esi), %%mm3 \n\t" | |
| 1297 + "pfadd %%mm7, %%mm0 \n\t" | |
| 1298 + "pfadd %%mm7, %%mm1 \n\t" | |
| 1299 + "pfadd %%mm7, %%mm2 \n\t" | |
| 1300 + "pfadd %%mm7, %%mm3 \n\t" | |
| 1301 + "movq %%mm0, (%1, %%esi) \n\t" | |
| 1302 + "movq %%mm1, 8(%1, %%esi) \n\t" | |
| 1303 + "movq %%mm2, 16(%1, %%esi) \n\t" | |
| 1304 + "movq %%mm3, 24(%1, %%esi) \n\t" | |
| 1305 + "addl $32, %%esi \n\t" | |
| 1306 + " jnz 1b \n\t" | |
| 1307 + :: "r" (src+256), "r" (dest+256), "m" (bias) | |
| 1308 + : "%esi" | |
| 1309 + ); | |
| 1310 +} | |
| 1311 + | |
| 1312 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, | |
| 1313 + sample_t clev, sample_t slev) | |
| 1314 +{ | |
| 1315 + switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { | |
| 1316 + | |
| 1317 + case CONVERT (A52_CHANNEL, A52_CHANNEL2): | |
| 1318 + memcpy (samples, samples + 256, 256 * sizeof (sample_t)); | |
| 1319 + break; | |
| 1320 + | |
| 1321 + case CONVERT (A52_CHANNEL, A52_MONO): | |
| 1322 + case CONVERT (A52_STEREO, A52_MONO): | |
| 1323 + mix_2to1_3dnow: | |
| 1324 + mix2to1_3dnow (samples, samples + 256, bias); | |
| 1325 + break; | |
| 1326 + | |
| 1327 + case CONVERT (A52_2F1R, A52_MONO): | |
| 1328 + if (slev == 0) | |
| 1329 + goto mix_2to1_3dnow; | |
| 1330 + case CONVERT (A52_3F, A52_MONO): | |
| 1331 + mix_3to1_3dnow: | |
| 1332 + mix3to1_3dnow (samples, bias); | |
| 1333 + break; | |
| 1334 + | |
| 1335 + case CONVERT (A52_3F1R, A52_MONO): | |
| 1336 + if (slev == 0) | |
| 1337 + goto mix_3to1_3dnow; | |
| 1338 + case CONVERT (A52_2F2R, A52_MONO): | |
| 1339 + if (slev == 0) | |
| 1340 + goto mix_2to1_3dnow; | |
| 1341 + mix4to1_3dnow (samples, bias); | |
| 1342 + break; | |
| 1343 + | |
| 1344 + case CONVERT (A52_3F2R, A52_MONO): | |
| 1345 + if (slev == 0) | |
| 1346 + goto mix_3to1_3dnow; | |
| 1347 + mix5to1_3dnow (samples, bias); | |
| 1348 + break; | |
| 1349 + | |
| 1350 + case CONVERT (A52_MONO, A52_DOLBY): | |
| 1351 + memcpy (samples + 256, samples, 256 * sizeof (sample_t)); | |
| 1352 + break; | |
| 1353 + | |
| 1354 + case CONVERT (A52_3F, A52_STEREO): | |
| 1355 + case CONVERT (A52_3F, A52_DOLBY): | |
| 1356 + mix_3to2_3dnow: | |
| 1357 + mix3to2_3dnow (samples, bias); | |
| 1358 + break; | |
| 1359 + | |
| 1360 + case CONVERT (A52_2F1R, A52_STEREO): | |
| 1361 + if (slev == 0) | |
| 1362 + break; | |
| 1363 + mix21to2_3dnow (samples, samples + 256, bias); | |
| 1364 + break; | |
| 1365 + | |
| 1366 + case CONVERT (A52_2F1R, A52_DOLBY): | |
| 1367 + mix21toS_3dnow (samples, bias); | |
| 1368 + break; | |
| 1369 + | |
| 1370 + case CONVERT (A52_3F1R, A52_STEREO): | |
| 1371 + if (slev == 0) | |
| 1372 + goto mix_3to2_3dnow; | |
| 1373 + mix31to2_3dnow (samples, bias); | |
| 1374 + break; | |
| 1375 + | |
| 1376 + case CONVERT (A52_3F1R, A52_DOLBY): | |
| 1377 + mix31toS_3dnow (samples, bias); | |
| 1378 + break; | |
| 1379 + | |
| 1380 + case CONVERT (A52_2F2R, A52_STEREO): | |
| 1381 + if (slev == 0) | |
| 1382 + break; | |
| 1383 + mix2to1_3dnow (samples, samples + 512, bias); | |
| 1384 + mix2to1_3dnow (samples + 256, samples + 768, bias); | |
| 1385 + break; | |
| 1386 + | |
| 1387 + case CONVERT (A52_2F2R, A52_DOLBY): | |
| 1388 + mix22toS_3dnow (samples, bias); | |
| 1389 + break; | |
| 1390 + | |
| 1391 + case CONVERT (A52_3F2R, A52_STEREO): | |
| 1392 + if (slev == 0) | |
| 1393 + goto mix_3to2_3dnow; | |
| 1394 + mix32to2_3dnow (samples, bias); | |
| 1395 + break; | |
| 1396 + | |
| 1397 + case CONVERT (A52_3F2R, A52_DOLBY): | |
| 1398 + mix32toS_3dnow (samples, bias); | |
| 1399 + break; | |
| 1400 + | |
| 1401 + case CONVERT (A52_3F1R, A52_3F): | |
| 1402 + if (slev == 0) | |
| 1403 + break; | |
| 1404 + mix21to2_3dnow (samples, samples + 512, bias); | |
| 1405 + break; | |
| 1406 + | |
| 1407 + case CONVERT (A52_3F2R, A52_3F): | |
| 1408 + if (slev == 0) | |
| 1409 + break; | |
| 1410 + mix2to1_3dnow (samples, samples + 768, bias); | |
| 1411 + mix2to1_3dnow (samples + 512, samples + 1024, bias); | |
| 1412 + break; | |
| 1413 + | |
| 1414 + case CONVERT (A52_3F1R, A52_2F1R): | |
| 1415 + mix3to2_3dnow (samples, bias); | |
| 1416 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 1417 + break; | |
| 1418 + | |
| 1419 + case CONVERT (A52_2F2R, A52_2F1R): | |
| 1420 + mix2to1_3dnow (samples + 512, samples + 768, bias); | |
| 1421 + break; | |
| 1422 + | |
| 1423 + case CONVERT (A52_3F2R, A52_2F1R): | |
| 1424 + mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used) | |
| 1425 + move2to1_3dnow (samples + 768, samples + 512, bias); | |
| 1426 + break; | |
| 1427 + | |
| 1428 + case CONVERT (A52_3F2R, A52_3F1R): | |
| 1429 + mix2to1_3dnow (samples + 768, samples + 1024, bias); | |
| 1430 + break; | |
| 1431 + | |
| 1432 + case CONVERT (A52_2F1R, A52_2F2R): | |
| 1433 + memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); | |
| 1434 + break; | |
| 1435 + | |
| 1436 + case CONVERT (A52_3F1R, A52_2F2R): | |
| 1437 + mix3to2_3dnow (samples, bias); | |
| 1438 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 1439 + break; | |
| 1440 + | |
| 1441 + case CONVERT (A52_3F2R, A52_2F2R): | |
| 1442 + mix3to2_3dnow (samples, bias); | |
| 1443 + memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); | |
| 1444 + memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); | |
| 1445 + break; | |
| 1446 + | |
| 1447 + case CONVERT (A52_3F1R, A52_3F2R): | |
| 1448 + memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); | |
| 1449 + break; | |
| 1450 + } | |
| 1451 + __asm __volatile("femms":::"memory"); | |
| 1452 +} | |
| 1453 + | |
| 1454 +#endif //ARCH_X86 | |
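
All of the SSE/3DNow! mixers above share one loop idiom: operands are passed as `samples+256` (one element past the 1 KiB block) and `%esi` holds a negative byte offset counting up toward zero, so the trailing `addl`/`jnz` pair advances the index and tests for exit in one step, with no separate compare. A scalar C analogue, shown for mix2to1:

```c
/* C analogue of the negative-index loop idiom used by mix2to1_SSE and
 * friends above: base pointers address one-past-the-end of the block and
 * the induction variable runs from -256 up to 0, so the increment doubles
 * as the loop-exit test (addl/jnz in the asm). Same math as mix2to1_SSE:
 * dest[i] = dest[i] + src[i] + bias over a 256-sample block. */
static void mix2to1_c(float *dest, const float *src, float bias)
{
    float *d = dest + 256;             /* one past the block, like dest+256 */
    const float *s = src + 256;        /* in the asm operand list            */
    int i;
    for (i = -256; i != 0; i++)        /* counts up toward zero */
        d[i] += s[i] + bias;
}
```
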
| 1455 --- liba52/imdct.c 2005-03-22 19:59:35.000000000 +0100 | |
| 1456 +++ imdct.c 2004-04-26 22:00:57.000000000 +0200 | |
| 1457 @@ -17,17 +23,32 @@ | |
| 1458 * You should have received a copy of the GNU General Public License | |
| 1459 * along with this program; if not, write to the Free Software | |
| 1460 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 1461 + * | |
| 1462 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at) | |
| 1463 + * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru> | |
| 1464 + * Michael ported them from libac3 (untested, perhaps totally broken) | |
| 1465 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org) | |
| 1466 */ | |
| 1467 | |
| 1468 #include "config.h" | |
| 1469 | |
| 1470 -#include <inttypes.h> | |
| 1471 #include <math.h> | |
| 1472 #include <stdio.h> | |
| 1473 +#ifndef M_PI | |
| 1474 +#define M_PI 3.1415926535897932384626433832795029 | |
| 1475 +#endif | |
| 1476 +#include <inttypes.h> | |
| 1477 | |
| 1478 #include "a52.h" | |
| 1479 #include "a52_internal.h" | |
| 1480 #include "mm_accel.h" | |
| 1481 +#include "mangle.h" | |
| 1482 + | |
| 1483 +#ifdef RUNTIME_CPUDETECT | |
| 1484 +#undef HAVE_3DNOWEX | |
| 1485 +#endif | |
| 1486 + | |
| 1487 +#define USE_AC3_C | |
| 1488 | |
| 1489 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias); | |
| 1490 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias); | |
| 1491 @@ -37,9 +58,22 @@ | |
| 1492 sample_t imag; | |
| 1493 } complex_t; | |
| 1494 | |
| 1495 +static void fft_128p(complex_t *a); | |
| 1496 + | |
| 1497 +static const int pm128[128] attribute_used __attribute__((aligned(16))) = | |
| 1498 +{ | |
| 1499 + 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120, | |
| 1500 + 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124, | |
| 1501 + 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122, | |
| 1502 + 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126, | |
| 1503 + 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121, | |
| 1504 + 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125, | |
| 1505 + 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123, | |
| 1506 + 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127 | |
| 1507 +}; | |
| 1508 | |
| 1509 /* 128 point bit-reverse LUT */ | |
| 1510 -static uint8_t bit_reverse_512[] = { | |
| 1511 +static uint8_t attribute_used bit_reverse_512[] = { | |
| 1512 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70, | |
| 1513 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78, | |
| 1514 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74, | |
| 1515 @@ -67,23 +101,42 @@ | |
| 1516 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b, | |
| 1517 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f}; | |
| 1518 | |
| 1519 -static complex_t buf[128]; | |
| 1520 +#ifdef ARCH_X86 | |
| 1521 +// NOTE: SSE needs 16byte alignment or it will segfault | |
| 1522 +// | |
| 1523 +static complex_t __attribute__((aligned(16))) buf[128]; | |
| 1524 +static float __attribute__((aligned(16))) sseSinCos1c[256]; | |
| 1525 +static float __attribute__((aligned(16))) sseSinCos1d[256]; | |
| 1526 +static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; | |
| 1527 +//static float __attribute__((aligned(16))) sseW0[4]; | |
| 1528 +static float __attribute__((aligned(16))) sseW1[8]; | |
| 1529 +static float __attribute__((aligned(16))) sseW2[16]; | |
| 1530 +static float __attribute__((aligned(16))) sseW3[32]; | |
| 1531 +static float __attribute__((aligned(16))) sseW4[64]; | |
| 1532 +static float __attribute__((aligned(16))) sseW5[128]; | |
| 1533 +static float __attribute__((aligned(16))) sseW6[256]; | |
| 1534 +static float __attribute__((aligned(16))) *sseW[7]= | |
| 1535 + {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; | |
| 1536 +static float __attribute__((aligned(16))) sseWindow[512]; | |
| 1537 +#else | |
| 1538 +static complex_t __attribute__((aligned(16))) buf[128]; | |
| 1539 +#endif | |
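/* The aligned(16) attributes matter because aligned SSE loads (movaps)
   fault on unaligned addresses. A minimal runtime check (sketch; assumes
   <stdint.h> for uintptr_t):

       static int is_16_byte_aligned(const void *p)
       {
           return ((uintptr_t)p & 15) == 0;
       }
*/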
| 1540 | |
| 1541 /* Twiddle factor LUT */ | |
| 1542 -static complex_t w_1[1]; | |
| 1543 -static complex_t w_2[2]; | |
| 1544 -static complex_t w_4[4]; | |
| 1545 -static complex_t w_8[8]; | |
| 1546 -static complex_t w_16[16]; | |
| 1547 -static complex_t w_32[32]; | |
| 1548 -static complex_t w_64[64]; | |
| 1549 -static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; | |
| 1550 +static complex_t __attribute__((aligned(16))) w_1[1]; | |
| 1551 +static complex_t __attribute__((aligned(16))) w_2[2]; | |
| 1552 +static complex_t __attribute__((aligned(16))) w_4[4]; | |
| 1553 +static complex_t __attribute__((aligned(16))) w_8[8]; | |
| 1554 +static complex_t __attribute__((aligned(16))) w_16[16]; | |
| 1555 +static complex_t __attribute__((aligned(16))) w_32[32]; | |
| 1556 +static complex_t __attribute__((aligned(16))) w_64[64]; | |
| 1557 +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; | |
| 1558 | |
| 1559 /* Twiddle factors for IMDCT */ | |
| 1560 -static sample_t xcos1[128]; | |
| 1561 -static sample_t xsin1[128]; | |
| 1562 -static sample_t xcos2[64]; | |
| 1563 -static sample_t xsin2[64]; | |
| 1564 +static sample_t __attribute__((aligned(16))) xcos1[128]; | |
| 1565 +static sample_t __attribute__((aligned(16))) xsin1[128]; | |
| 1566 +static sample_t __attribute__((aligned(16))) xcos2[64]; | |
| 1567 +static sample_t __attribute__((aligned(16))) xsin2[64]; | |
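/* For reference, imdct_init() below fills these as
       xcos1[i] = -cos((M_PI / 2048) * (8 * i + 1));
       xsin1[i] = -sin((M_PI / 2048) * (8 * i + 1));
   i.e. the twiddle factors that turn the 128-point IFFT into the IMDCT. */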
| 1568 | |
| 1569 /* Windowing function for Modified DCT - Thank you acroread */ | |
| 1570 sample_t imdct_window[] = { | |
| 1571 @@ -145,16 +198,19 @@ | |
| 1572 void | |
| 1573 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias) | |
| 1574 { | |
| 1575 - int i,k; | |
| 1576 + int i; | |
| 1577 +#ifndef USE_AC3_C | |
| 1578 + int k; | |
| 1579 int p,q; | |
| 1580 int m; | |
| 1581 int two_m; | |
| 1582 int two_m_plus_one; | |
| 1583 | |
| 1584 - sample_t tmp_a_i; | |
| 1585 - sample_t tmp_a_r; | |
| 1586 sample_t tmp_b_i; | |
| 1587 sample_t tmp_b_r; | |
| 1588 +#endif | |
| 1589 + sample_t tmp_a_i; | |
| 1590 + sample_t tmp_a_r; | |
| 1591 | |
| 1592 sample_t *data_ptr; | |
| 1593 sample_t *delay_ptr; | |
| 1594 @@ -162,22 +218,21 @@ | |
| 1595 | |
| 1596 /* 512 IMDCT with source and dest data in 'data' */ | |
| 1597 | |
| 1598 - /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
| 1599 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering */ | |
| 1600 for( i=0; i < 128; i++) { | |
| 1601 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
| 1602 - buf[i].real = (data[256-2*i-1] * xcos1[i]) - (data[2*i] * xsin1[i]); | |
| 1603 - buf[i].imag = -1.0 * ((data[2*i] * xcos1[i]) + (data[256-2*i-1] * xsin1[i])); | |
| 1604 - } | |
| 1605 - | |
| 1606 - /* Bit reversed shuffling */ | |
| 1607 - for(i=0; i<128; i++) { | |
| 1608 - k = bit_reverse_512[i]; | |
| 1609 - if (k < i) | |
| 1610 - swap_cmplx(&buf[i],&buf[k]); | |
| 1611 +#ifdef USE_AC3_C | |
| 1612 + int j= pm128[i]; | |
| 1613 +#else | |
| 1614 + int j= bit_reverse_512[i]; | |
| 1615 +#endif | |
| 1616 + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
| 1617 + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
| 1618 } | |
| 1619 | |
| 1620 /* FFT Merge */ | |
| 1621 - for (m=0; m < 7; m++) { | |
| 1622 +/* unoptimized variant | |
| 1623 + for (m=1; m < 7; m++) { | |
| 1624 if(m) | |
| 1625 two_m = (1 << m); | |
| 1626 else | |
| 1627 @@ -185,8 +240,8 @@ | |
| 1628 | |
| 1629 two_m_plus_one = (1 << (m+1)); | |
| 1630 | |
| 1631 - for(k = 0; k < two_m; k++) { | |
| 1632 - for(i = 0; i < 128; i += two_m_plus_one) { | |
| 1633 + for(i = 0; i < 128; i += two_m_plus_one) { | |
| 1634 + for(k = 0; k < two_m; k++) { | |
| 1635 p = k + i; | |
| 1636 q = p + two_m; | |
| 1637 tmp_a_r = buf[p].real; | |
| 1638 @@ -200,7 +255,102 @@ | |
| 1639 } | |
| 1640 } | |
| 1641 } | |
| 1642 +*/ | |
| 1643 +#ifdef USE_AC3_C | |
| 1644 + fft_128p (&buf[0]); | |
| 1645 +#else | |
| 1646 + | |
| 1647 + /* 1. iteration */ | |
| 1648 + for(i = 0; i < 128; i += 2) { | |
| 1649 + tmp_a_r = buf[i].real; | |
| 1650 + tmp_a_i = buf[i].imag; | |
| 1651 + tmp_b_r = buf[i+1].real; | |
| 1652 + tmp_b_i = buf[i+1].imag; | |
| 1653 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1654 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1655 + buf[i+1].real = tmp_a_r - tmp_b_r; | |
| 1656 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
| 1657 + } | |
| 1658 + | |
| 1659 + /* 2. iteration */ | |
| 1660 + // Note w[1]={{1,0}, {0,-1}} | |
| 1661 + for(i = 0; i < 128; i += 4) { | |
| 1662 + tmp_a_r = buf[i].real; | |
| 1663 + tmp_a_i = buf[i].imag; | |
| 1664 + tmp_b_r = buf[i+2].real; | |
| 1665 + tmp_b_i = buf[i+2].imag; | |
| 1666 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1667 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1668 + buf[i+2].real = tmp_a_r - tmp_b_r; | |
| 1669 + buf[i+2].imag = tmp_a_i - tmp_b_i; | |
| 1670 + tmp_a_r = buf[i+1].real; | |
| 1671 + tmp_a_i = buf[i+1].imag; | |
| 1672 + tmp_b_r = buf[i+3].imag; | |
| 1673 + tmp_b_i = buf[i+3].real; | |
| 1674 + buf[i+1].real = tmp_a_r + tmp_b_r; | |
| 1675 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
| 1676 + buf[i+3].real = tmp_a_r - tmp_b_r; | |
| 1677 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
| 1678 + } | |
| 1679 | |
| 1680 + /* 3. iteration */ | |
| 1681 + for(i = 0; i < 128; i += 8) { | |
| 1682 + tmp_a_r = buf[i].real; | |
| 1683 + tmp_a_i = buf[i].imag; | |
| 1684 + tmp_b_r = buf[i+4].real; | |
| 1685 + tmp_b_i = buf[i+4].imag; | |
| 1686 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1687 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1688 + buf[i+4].real = tmp_a_r - tmp_b_r; | |
| 1689 + buf[i+4].imag = tmp_a_i - tmp_b_i; | |
| 1690 + tmp_a_r = buf[1+i].real; | |
| 1691 + tmp_a_i = buf[1+i].imag; | |
| 1692 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
| 1693 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
| 1694 + buf[1+i].real = tmp_a_r + tmp_b_r; | |
| 1695 + buf[1+i].imag = tmp_a_i + tmp_b_i; | |
| 1696 + buf[i+5].real = tmp_a_r - tmp_b_r; | |
| 1697 + buf[i+5].imag = tmp_a_i - tmp_b_i; | |
| 1698 + tmp_a_r = buf[i+2].real; | |
| 1699 + tmp_a_i = buf[i+2].imag; | |
| 1700 + tmp_b_r = buf[i+6].imag; | |
| 1701 + tmp_b_i = - buf[i+6].real; | |
| 1702 + buf[i+2].real = tmp_a_r + tmp_b_r; | |
| 1703 + buf[i+2].imag = tmp_a_i + tmp_b_i; | |
| 1704 + buf[i+6].real = tmp_a_r - tmp_b_r; | |
| 1705 + buf[i+6].imag = tmp_a_i - tmp_b_i; | |
| 1706 + tmp_a_r = buf[i+3].real; | |
| 1707 + tmp_a_i = buf[i+3].imag; | |
| 1708 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
| 1709 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
| 1710 + buf[i+3].real = tmp_a_r + tmp_b_r; | |
| 1711 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
| 1712 + buf[i+7].real = tmp_a_r - tmp_b_r; | |
| 1713 + buf[i+7].imag = tmp_a_i - tmp_b_i; | |
| 1714 + } | |
| 1715 + | |
| 1716 + /* 4-7. iterations */ | |
| 1717 + for (m=3; m < 7; m++) { | |
| 1718 + two_m = (1 << m); | |
| 1719 + | |
| 1720 + two_m_plus_one = two_m<<1; | |
| 1721 + | |
| 1722 + for(i = 0; i < 128; i += two_m_plus_one) { | |
| 1723 + for(k = 0; k < two_m; k++) { | |
| 1724 + int p = k + i; | |
| 1725 + int q = p + two_m; | |
| 1726 + tmp_a_r = buf[p].real; | |
| 1727 + tmp_a_i = buf[p].imag; | |
| 1728 + tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
| 1729 + tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
| 1730 + buf[p].real = tmp_a_r + tmp_b_r; | |
| 1731 + buf[p].imag = tmp_a_i + tmp_b_i; | |
| 1732 + buf[q].real = tmp_a_r - tmp_b_r; | |
| 1733 + buf[q].imag = tmp_a_i - tmp_b_i; | |
| 1734 + } | |
| 1735 + } | |
| 1736 + } | |
| 1737 +#endif | |
| 1738 /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
| 1739 for( i=0; i < 128; i++) { | |
| 1740 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
| 1741 @@ -219,12 +369,12 @@ | |
| 1742 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
| 1743 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
| 1744 } | |
| 1745 - | |
| 1746 + | |
| 1747 for(i=0; i< 64; i++) { | |
| 1748 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
| 1749 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
| 1750 } | |
| 1751 - | |
| 1752 + | |
| 1753 /* The trailing edge of the window goes into the delay line */ | |
| 1754 delay_ptr = delay; | |
| 1755 | |
| 1756 @@ -232,13 +382,717 @@ | |
| 1757 *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
| 1758 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
| 1759 } | |
| 1760 - | |
| 1761 + | |
| 1762 for(i=0; i<64; i++) { | |
| 1763 *delay_ptr++ = buf[i].imag * *--window_ptr; | |
| 1764 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
| 1765 } | |
| 1766 } | |
| 1767 | |
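/* The FFT merge loops above and in the vectorized variants below all
   repeat one scalar radix-2 butterfly; for reference, written out with
   this file's types (sketch only):

       static void butterfly(complex_t *p, complex_t *q, complex_t wk)
       {
           sample_t tmp_b_r = q->real * wk.real - q->imag * wk.imag; /* q * w[m][k] */
           sample_t tmp_b_i = q->imag * wk.real + q->real * wk.imag;
           sample_t tmp_a_r = p->real;
           sample_t tmp_a_i = p->imag;
           p->real = tmp_a_r + tmp_b_r;
           p->imag = tmp_a_i + tmp_b_i;
           q->real = tmp_a_r - tmp_b_r;
           q->imag = tmp_a_i - tmp_b_i;
       }
*/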
| 1768 +#ifdef HAVE_ALTIVEC | |
| 1769 + | |
| 1770 +#ifndef SYS_DARWIN | |
| 1771 +#include <altivec.h> | |
| 1772 +#endif | |
| 1773 + | |
| 1774 +// used to build register permutation vectors (vcprm) | |
| 1775 +// the 's' are for words in the _s_econd vector | |
| 1776 +#define WORD_0 0x00,0x01,0x02,0x03 | |
| 1777 +#define WORD_1 0x04,0x05,0x06,0x07 | |
| 1778 +#define WORD_2 0x08,0x09,0x0a,0x0b | |
| 1779 +#define WORD_3 0x0c,0x0d,0x0e,0x0f | |
| 1780 +#define WORD_s0 0x10,0x11,0x12,0x13 | |
| 1781 +#define WORD_s1 0x14,0x15,0x16,0x17 | |
| 1782 +#define WORD_s2 0x18,0x19,0x1a,0x1b | |
| 1783 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f | |
| 1784 + | |
| 1785 +#ifdef SYS_DARWIN | |
| 1786 +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d) | |
| 1787 +#else | |
| 1788 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d} | |
| 1789 +#endif | |
| 1790 + | |
| 1791 +// vcprmle is used to keep the same index as in the SSE version. | |
| 1792 +// it's the same as vcprm, with the index order reversed | |
| 1793 +// ('le' is Little Endian) | |
| 1794 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a) | |
| 1795 + | |
| 1796 +// used to build inverse/identity vectors (vcii) | |
| 1797 +// n is _n_egative, p is _p_ositive | |
| 1798 +#define FLOAT_n -1. | |
| 1799 +#define FLOAT_p 1. | |
| 1800 + | |
| 1801 +#ifdef SYS_DARWIN | |
| 1802 +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d) | |
| 1803 +#else | |
| 1804 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d} | |
| 1805 +#endif | |
| 1806 + | |
| 1807 +#ifdef SYS_DARWIN | |
| 1808 +#define FOUROF(a) (a) | |
| 1809 +#else | |
| 1810 +#define FOUROF(a) {a,a,a,a} | |
| 1811 +#endif | |
| 1812 + | |
| 1813 + | |
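/* Illustration (assumes AltiVec): vcprm(2,3,0,1) builds the permute
   control that swaps the two complex values packed in one 4-float vector,
   so

       temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));

   turns (re0,im0,re1,im1) into (re1,im1,re0,im0) -- exactly what the
   first butterfly iteration below needs. */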
| 1814 +void | |
| 1815 +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias) | |
| 1816 +{ | |
| 1817 + int i; | |
| 1818 + int k; | |
| 1819 + int p,q; | |
| 1820 + int m; | |
| 1821 + int two_m; | |
| 1822 + int two_m_plus_one; | |
| 1823 + | |
| 1824 + sample_t tmp_b_i; | |
| 1825 + sample_t tmp_b_r; | |
| 1826 + sample_t tmp_a_i; | |
| 1827 + sample_t tmp_a_r; | |
| 1828 + | |
| 1829 + sample_t *data_ptr; | |
| 1830 + sample_t *delay_ptr; | |
| 1831 + sample_t *window_ptr; | |
| 1832 + | |
| 1833 + /* 512 IMDCT with source and dest data in 'data' */ | |
| 1834 + | |
| 1835 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering */ | |
| 1836 + for( i=0; i < 128; i++) { | |
| 1837 + /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ | |
| 1838 + int j= bit_reverse_512[i]; | |
| 1839 + buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]); | |
| 1840 + buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j])); | |
| 1841 + } | |
| 1842 + | |
| 1843 + /* 1. iteration */ | |
| 1844 + for(i = 0; i < 128; i += 2) { | |
| 1845 +#if 0 | |
| 1846 + tmp_a_r = buf[i].real; | |
| 1847 + tmp_a_i = buf[i].imag; | |
| 1848 + tmp_b_r = buf[i+1].real; | |
| 1849 + tmp_b_i = buf[i+1].imag; | |
| 1850 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1851 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1852 + buf[i+1].real = tmp_a_r - tmp_b_r; | |
| 1853 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
| 1854 +#else | |
| 1855 + vector float temp, bufv; | |
| 1856 + | |
| 1857 + bufv = vec_ld(i << 3, (float*)buf); | |
| 1858 + temp = vec_perm(bufv, bufv, vcprm(2,3,0,1)); | |
| 1859 + bufv = vec_madd(bufv, vcii(p,p,n,n), temp); | |
| 1860 + vec_st(bufv, i << 3, (float*)buf); | |
| 1861 +#endif | |
| 1862 + } | |
| 1863 + | |
| 1864 + /* 2. iteration */ | |
| 1865 + // Note w[1]={{1,0}, {0,-1}} | |
| 1866 + for(i = 0; i < 128; i += 4) { | |
| 1867 +#if 0 | |
| 1868 + tmp_a_r = buf[i].real; | |
| 1869 + tmp_a_i = buf[i].imag; | |
| 1870 + tmp_b_r = buf[i+2].real; | |
| 1871 + tmp_b_i = buf[i+2].imag; | |
| 1872 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1873 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1874 + buf[i+2].real = tmp_a_r - tmp_b_r; | |
| 1875 + buf[i+2].imag = tmp_a_i - tmp_b_i; | |
| 1876 + tmp_a_r = buf[i+1].real; | |
| 1877 + tmp_a_i = buf[i+1].imag; | |
| 1878 + /* WARNING: im <-> re here ! */ | |
| 1879 + tmp_b_r = buf[i+3].imag; | |
| 1880 + tmp_b_i = buf[i+3].real; | |
| 1881 + buf[i+1].real = tmp_a_r + tmp_b_r; | |
| 1882 + buf[i+1].imag = tmp_a_i - tmp_b_i; | |
| 1883 + buf[i+3].real = tmp_a_r - tmp_b_r; | |
| 1884 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
| 1885 +#else | |
| 1886 + vector float buf01, buf23, temp1, temp2; | |
| 1887 + | |
| 1888 + buf01 = vec_ld((i + 0) << 3, (float*)buf); | |
| 1889 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
| 1890 + buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2)); | |
| 1891 + | |
| 1892 + temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01); | |
| 1893 + temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01); | |
| 1894 + | |
| 1895 + vec_st(temp1, (i + 0) << 3, (float*)buf); | |
| 1896 + vec_st(temp2, (i + 2) << 3, (float*)buf); | |
| 1897 +#endif | |
| 1898 + } | |
| 1899 + | |
| 1900 + /* 3. iteration */ | |
| 1901 + for(i = 0; i < 128; i += 8) { | |
| 1902 +#if 0 | |
| 1903 + tmp_a_r = buf[i].real; | |
| 1904 + tmp_a_i = buf[i].imag; | |
| 1905 + tmp_b_r = buf[i+4].real; | |
| 1906 + tmp_b_i = buf[i+4].imag; | |
| 1907 + buf[i].real = tmp_a_r + tmp_b_r; | |
| 1908 + buf[i].imag = tmp_a_i + tmp_b_i; | |
| 1909 + buf[i+4].real = tmp_a_r - tmp_b_r; | |
| 1910 + buf[i+4].imag = tmp_a_i - tmp_b_i; | |
| 1911 + tmp_a_r = buf[1+i].real; | |
| 1912 + tmp_a_i = buf[1+i].imag; | |
| 1913 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
| 1914 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
| 1915 + buf[1+i].real = tmp_a_r + tmp_b_r; | |
| 1916 + buf[1+i].imag = tmp_a_i + tmp_b_i; | |
| 1917 + buf[i+5].real = tmp_a_r - tmp_b_r; | |
| 1918 + buf[i+5].imag = tmp_a_i - tmp_b_i; | |
| 1919 + tmp_a_r = buf[i+2].real; | |
| 1920 + tmp_a_i = buf[i+2].imag; | |
| 1921 + /* WARNING re <-> im & sign */ | |
| 1922 + tmp_b_r = buf[i+6].imag; | |
| 1923 + tmp_b_i = - buf[i+6].real; | |
| 1924 + buf[i+2].real = tmp_a_r + tmp_b_r; | |
| 1925 + buf[i+2].imag = tmp_a_i + tmp_b_i; | |
| 1926 + buf[i+6].real = tmp_a_r - tmp_b_r; | |
| 1927 + buf[i+6].imag = tmp_a_i - tmp_b_i; | |
| 1928 + tmp_a_r = buf[i+3].real; | |
| 1929 + tmp_a_i = buf[i+3].imag; | |
| 1930 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
| 1931 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
| 1932 + buf[i+3].real = tmp_a_r + tmp_b_r; | |
| 1933 + buf[i+3].imag = tmp_a_i + tmp_b_i; | |
| 1934 + buf[i+7].real = tmp_a_r - tmp_b_r; | |
| 1935 + buf[i+7].imag = tmp_a_i - tmp_b_i; | |
| 1936 +#else | |
| 1937 + vector float buf01, buf23, buf45, buf67; | |
| 1938 + | |
| 1939 + buf01 = vec_ld((i + 0) << 3, (float*)buf); | |
| 1940 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
| 1941 + | |
| 1942 + tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real; | |
| 1943 + tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real; | |
| 1944 + buf[i+5].real = tmp_b_r; | |
| 1945 + buf[i+5].imag = tmp_b_i; | |
| 1946 + tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag; | |
| 1947 + tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag; | |
| 1948 + buf[i+7].real = tmp_b_r; | |
| 1949 + buf[i+7].imag = tmp_b_i; | |
| 1950 + | |
| 1951 + buf23 = vec_ld((i + 2) << 3, (float*)buf); | |
| 1952 + buf45 = vec_ld((i + 4) << 3, (float*)buf); | |
| 1953 + buf67 = vec_ld((i + 6) << 3, (float*)buf); | |
| 1954 + buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3)); | |
| 1955 + | |
| 1956 + vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf); | |
| 1957 + vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf); | |
| 1958 + vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf); | |
| 1959 + vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf); | |
| 1960 +#endif | |
| 1961 + } | |
| 1962 + | |
| 1963 + /* 4-7. iterations */ | |
| 1964 + for (m=3; m < 7; m++) { | |
| 1965 + two_m = (1 << m); | |
| 1966 + | |
| 1967 + two_m_plus_one = two_m<<1; | |
| 1968 + | |
| 1969 + for(i = 0; i < 128; i += two_m_plus_one) { | |
| 1970 + for(k = 0; k < two_m; k+=2) { | |
| 1971 +#if 0 | |
| 1972 + int p = k + i; | |
| 1973 + int q = p + two_m; | |
| 1974 + tmp_a_r = buf[p].real; | |
| 1975 + tmp_a_i = buf[p].imag; | |
| 1976 + tmp_b_r = | |
| 1977 + buf[q].real * w[m][k].real - | |
| 1978 + buf[q].imag * w[m][k].imag; | |
| 1979 + tmp_b_i = | |
| 1980 + buf[q].imag * w[m][k].real + | |
| 1981 + buf[q].real * w[m][k].imag; | |
| 1982 + buf[p].real = tmp_a_r + tmp_b_r; | |
| 1983 + buf[p].imag = tmp_a_i + tmp_b_i; | |
| 1984 + buf[q].real = tmp_a_r - tmp_b_r; | |
| 1985 + buf[q].imag = tmp_a_i - tmp_b_i; | |
| 1986 + | |
| 1987 + tmp_a_r = buf[(p + 1)].real; | |
| 1988 + tmp_a_i = buf[(p + 1)].imag; | |
| 1989 + tmp_b_r = | |
| 1990 + buf[(q + 1)].real * w[m][(k + 1)].real - | |
| 1991 + buf[(q + 1)].imag * w[m][(k + 1)].imag; | |
| 1992 + tmp_b_i = | |
| 1993 + buf[(q + 1)].imag * w[m][(k + 1)].real + | |
| 1994 + buf[(q + 1)].real * w[m][(k + 1)].imag; | |
| 1995 + buf[(p + 1)].real = tmp_a_r + tmp_b_r; | |
| 1996 + buf[(p + 1)].imag = tmp_a_i + tmp_b_i; | |
| 1997 + buf[(q + 1)].real = tmp_a_r - tmp_b_r; | |
| 1998 + buf[(q + 1)].imag = tmp_a_i - tmp_b_i; | |
| 1999 +#else | |
| 2000 + int p = k + i; | |
| 2001 + int q = p + two_m; | |
| 2002 + vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4; | |
| 2003 + const vector float vczero = (const vector float)FOUROF(0.); | |
| 2004 + // first compute buf[q] and buf[q+1] | |
| 2005 + vecq = vec_ld(q << 3, (float*)buf); | |
| 2006 + vecw = vec_ld(0, (float*)&(w[m][k])); | |
| 2007 + temp1 = vec_madd(vecq, vecw, vczero); | |
| 2008 + temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2)); | |
| 2009 + temp2 = vec_madd(temp2, vecw, vczero); | |
| 2010 + temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2)); | |
| 2011 + temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3)); | |
| 2012 + vecq = vec_madd(temp4, vcii(n,p,n,p), temp3); | |
| 2013 + // then butterfly with buf[p] and buf[p+1] | |
| 2014 + vecp = vec_ld(p << 3, (float*)buf); | |
| 2015 + | |
| 2016 + temp1 = vec_add(vecp, vecq); | |
| 2017 + temp2 = vec_sub(vecp, vecq); | |
| 2018 + | |
| 2019 + vec_st(temp1, p << 3, (float*)buf); | |
| 2020 + vec_st(temp2, q << 3, (float*)buf); | |
| 2021 +#endif | |
| 2022 + } | |
| 2023 + } | |
| 2024 + } | |
| 2025 + | |
| 2026 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
| 2027 + for( i=0; i < 128; i+=4) { | |
| 2028 + /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */ | |
| 2029 +#if 0 | |
| 2030 + tmp_a_r = buf[(i + 0)].real; | |
| 2031 + tmp_a_i = -1.0 * buf[(i + 0)].imag; | |
| 2032 + buf[(i + 0)].real = | |
| 2033 + (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]); | |
| 2034 + buf[(i + 0)].imag = | |
| 2035 + (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]); | |
| 2036 + | |
| 2037 + tmp_a_r = buf[(i + 1)].real; | |
| 2038 + tmp_a_i = -1.0 * buf[(i + 1)].imag; | |
| 2039 + buf[(i + 1)].real = | |
| 2040 + (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]); | |
| 2041 + buf[(i + 1)].imag = | |
| 2042 + (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]); | |
| 2043 + | |
| 2044 + tmp_a_r = buf[(i + 2)].real; | |
| 2045 + tmp_a_i = -1.0 * buf[(i + 2)].imag; | |
| 2046 + buf[(i + 2)].real = | |
| 2047 + (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]); | |
| 2048 + buf[(i + 2)].imag = | |
| 2049 + (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]); | |
| 2050 + | |
| 2051 + tmp_a_r = buf[(i + 3)].real; | |
| 2052 + tmp_a_i = -1.0 * buf[(i + 3)].imag; | |
| 2053 + buf[(i + 3)].real = | |
| 2054 + (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]); | |
| 2055 + buf[(i + 3)].imag = | |
| 2056 + (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]); | |
| 2057 +#else | |
| 2058 + vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2; | |
| 2059 + vector float temp0022, temp1133, tempCS01; | |
| 2060 + const vector float vczero = (const vector float)FOUROF(0.); | |
| 2061 + | |
| 2062 + bufv_0 = vec_ld((i + 0) << 3, (float*)buf); | |
| 2063 + bufv_2 = vec_ld((i + 2) << 3, (float*)buf); | |
| 2064 + | |
| 2065 + cosv = vec_ld(i << 2, xcos1); | |
| 2066 + sinv = vec_ld(i << 2, xsin1); | |
| 2067 + | |
| 2068 + temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2)); | |
| 2069 + temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3)); | |
| 2070 + tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1)); | |
| 2071 + temp1 = vec_madd(temp0022, tempCS01, vczero); | |
| 2072 + tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1)); | |
| 2073 + temp2 = vec_madd(temp1133, tempCS01, vczero); | |
| 2074 + bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1); | |
| 2075 + | |
| 2076 + vec_st(bufv_0, (i + 0) << 3, (float*)buf); | |
| 2077 + | |
| 2078 + /* idem with bufv_2 and high-order cosv/sinv */ | |
| 2079 + | |
| 2080 + temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2)); | |
| 2081 + temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3)); | |
| 2082 + tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3)); | |
| 2083 + temp1 = vec_madd(temp0022, tempCS01, vczero); | |
| 2084 + tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3)); | |
| 2085 + temp2 = vec_madd(temp1133, tempCS01, vczero); | |
| 2086 + bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1); | |
| 2087 + | |
| 2088 + vec_st(bufv_2, (i + 2) << 3, (float*)buf); | |
| 2089 + | |
| 2090 +#endif | |
| 2091 + } | |
| 2092 + | |
| 2093 + data_ptr = data; | |
| 2094 + delay_ptr = delay; | |
| 2095 + window_ptr = imdct_window; | |
| 2096 + | |
| 2097 + /* Window and convert to real valued signal */ | |
| 2098 + for(i=0; i< 64; i++) { | |
| 2099 + *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias; | |
| 2100 + *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; | |
| 2101 + } | |
| 2102 + | |
| 2103 + for(i=0; i< 64; i++) { | |
| 2104 + *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias; | |
| 2105 + *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; | |
| 2106 + } | |
| 2107 + | |
| 2108 + /* The trailing edge of the window goes into the delay line */ | |
| 2109 + delay_ptr = delay; | |
| 2110 + | |
| 2111 + for(i=0; i< 64; i++) { | |
| 2112 + *delay_ptr++ = -buf[64+i].real * *--window_ptr; | |
| 2113 + *delay_ptr++ = buf[64-i-1].imag * *--window_ptr; | |
| 2114 + } | |
| 2115 + | |
| 2116 + for(i=0; i<64; i++) { | |
| 2117 + *delay_ptr++ = buf[i].imag * *--window_ptr; | |
| 2118 + *delay_ptr++ = -buf[128-i-1].real * *--window_ptr; | |
| 2119 + } | |
| 2120 +} | |
| 2121 +#endif | |
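/* Scalar model (illustrative only) of the two AltiVec primitives the
   function above leans on:
       vec_madd(a, b, c)  ->  a*b + c   per element
       vec_nmsub(a, b, c) ->  c - a*b   per element
   Feeding both the same operands yields the butterfly sum and difference
   with one instruction each. */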
| 2122 + | |
| 2123 + | |
| 2124 +// Stuff below this line is borrowed from libac3 | |
| 2125 +#include "srfftp.h" | |
| 2126 +#ifdef ARCH_X86 | |
| 2127 +#ifndef HAVE_3DNOW | |
| 2128 +#define HAVE_3DNOW 1 | |
| 2129 +#endif | |
| 2130 +#include "srfftp_3dnow.h" | |
| 2131 + | |
| 2132 +const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }}; | |
| 2133 +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }}; | |
| 2134 +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 }; | |
| 2135 + | |
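/* The 0x80000000UL words above are IEEE-754 sign masks: XORing one into a
   float's bit pattern negates it, so x_plus_minus/x_minus_plus flip the
   sign of one half of a complex pair. Scalar model (sketch; assumes
   <stdint.h>):

       union { float f; uint32_t u; } v;
       v.f = x;
       v.u ^= 0x80000000u;   /* v.f is now -x */
*/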
| 2136 +#undef HAVE_3DNOWEX | |
| 2137 +#include "imdct_3dnow.h" | |
| 2138 +#define HAVE_3DNOWEX | |
| 2139 +#include "imdct_3dnow.h" | |
| 2140 + | |
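/* imdct_3dnow.h is included twice on purpose: once with HAVE_3DNOWEX
   undefined and once with it defined, so a single template header expands
   into both imdct_do_512_3dnow and imdct_do_512_3dnowex (both are used by
   imdct_init below). A hypothetical model of such a template:

       #ifdef HAVE_3DNOWEX
       #define IMDCT_NAME imdct_do_512_3dnowex
       #else
       #define IMDCT_NAME imdct_do_512_3dnow
       #endif
       void IMDCT_NAME (sample_t data[], sample_t delay[], sample_t bias);
       #undef IMDCT_NAME
*/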
| 2141 +void | |
| 2142 +imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias) | |
| 2143 +{ | |
| 2144 +/* int i,k; | |
| 2145 + int p,q;*/ | |
| 2146 + int m; | |
| 2147 + int two_m; | |
| 2148 + int two_m_plus_one; | |
| 2149 + | |
| 2150 +/* sample_t tmp_a_i; | |
| 2151 + sample_t tmp_a_r; | |
| 2152 + sample_t tmp_b_i; | |
| 2153 + sample_t tmp_b_r;*/ | |
| 2154 + | |
| 2155 + sample_t *data_ptr; | |
| 2156 + sample_t *delay_ptr; | |
| 2157 + sample_t *window_ptr; | |
| 2158 + | |
| 2159 + /* 512 IMDCT with source and dest data in 'data' */ | |
| 2160 + /* see the C version (imdct_do_512()); it's almost identical, just in C */ | |
| 2161 + | |
| 2162 + /* Pre IFFT complex multiply plus IFFT cmplx conjugate */ | |
| 2163 + /* Bit reversed shuffling */ | |
| 2164 + asm volatile( | |
| 2165 + "xorl %%esi, %%esi \n\t" | |
| 2166 + "leal "MANGLE(bit_reverse_512)", %%eax \n\t" | |
| 2167 + "movl $1008, %%edi \n\t" | |
| 2168 + "pushl %%ebp \n\t" //use ebp without telling gcc | |
| 2169 + ".balign 16 \n\t" | |
| 2170 + "1: \n\t" | |
| 2171 + "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI | |
| 2172 + "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI | |
| 2173 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi | |
| 2174 + "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi | |
| 2175 + "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR | |
| 2176 + "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t" | |
| 2177 + "mulps %%xmm0, %%xmm2 \n\t" | |
| 2178 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI | |
| 2179 + "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | |
| 2180 + "subps %%xmm0, %%xmm2 \n\t" | |
| 2181 + "movzbl (%%eax), %%edx \n\t" | |
| 2182 + "movzbl 1(%%eax), %%ebp \n\t" | |
| 2183 + "movlps %%xmm2, (%1, %%edx,8) \n\t" | |
| 2184 + "movhps %%xmm2, (%1, %%ebp,8) \n\t" | |
| 2185 + "addl $16, %%esi \n\t" | |
| 2186 + "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap | |
| 2187 + "subl $16, %%edi \n\t" | |
| 2188 + " jnc 1b \n\t" | |
| 2189 + "popl %%ebp \n\t"//no we didnt touch ebp *g* | |
| 2190 + :: "b" (data), "c" (buf) | |
| 2191 + : "%esi", "%edi", "%eax", "%edx" | |
| 2192 + ); | |
| 2193 + | |
| 2194 + | |
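/* MANGLE() comes from mangle.h (included at the top of this file) and
   expands a C identifier to its assembler-level spelling, e.g. prefixing
   an underscore where the platform requires it (a rough description; see
   mangle.h for the real definition). */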
| 2195 + /* FFT Merge */ | |
| 2196 +/* unoptimized variant | |
| 2197 + for (m=1; m < 7; m++) { | |
| 2198 + if(m) | |
| 2199 + two_m = (1 << m); | |
| 2200 + else | |
| 2201 + two_m = 1; | |
| 2202 + | |
| 2203 + two_m_plus_one = (1 << (m+1)); | |
| 2204 + | |
| 2205 + for(i = 0; i < 128; i += two_m_plus_one) { | |
| 2206 + for(k = 0; k < two_m; k++) { | |
| 2207 + p = k + i; | |
| 2208 + q = p + two_m; | |
| 2209 + tmp_a_r = buf[p].real; | |
| 2210 + tmp_a_i = buf[p].imag; | |
| 2211 + tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag; | |
| 2212 + tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag; | |
| 2213 + buf[p].real = tmp_a_r + tmp_b_r; | |
| 2214 + buf[p].imag = tmp_a_i + tmp_b_i; | |
| 2215 + buf[q].real = tmp_a_r - tmp_b_r; | |
| 2216 + buf[q].imag = tmp_a_i - tmp_b_i; | |
| 2217 + } | |
| 2218 + } | |
| 2219 + } | |
| 2220 +*/ | |
| 2221 + | |
| 2222 + /* 1. iteration */ | |
| 2223 + // Note w[0][0]={1,0} | |
| 2224 + asm volatile( | |
| 2225 + "xorps %%xmm1, %%xmm1 \n\t" | |
| 2226 + "xorps %%xmm2, %%xmm2 \n\t" | |
| 2227 + "movl %0, %%esi \n\t" | |
| 2228 + ".balign 16 \n\t" | |
| 2229 + "1: \n\t" | |
| 2230 + "movlps (%%esi), %%xmm0 \n\t" //buf[p] | |
| 2231 + "movlps 8(%%esi), %%xmm1\n\t" //buf[q] | |
| 2232 + "movhps (%%esi), %%xmm0 \n\t" //buf[p] | |
| 2233 + "movhps 8(%%esi), %%xmm2\n\t" //buf[q] | |
| 2234 + "addps %%xmm1, %%xmm0 \n\t" | |
| 2235 + "subps %%xmm2, %%xmm0 \n\t" | |
| 2236 + "movaps %%xmm0, (%%esi) \n\t" | |
| 2237 + "addl $16, %%esi \n\t" | |
| 2238 + "cmpl %1, %%esi \n\t" | |
| 2239 + " jb 1b \n\t" | |
| 2240 + :: "g" (buf), "r" (buf + 128) | |
| 2241 + : "%esi" | |
| 2242 + ); | |
| 2243 + | |
| 2244 + /* 2. iteration */ | |
| 2245 + // Note w[1]={{1,0}, {0,-1}} | |
| 2246 + asm volatile( | |
| 2247 + "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1 | |
| 2248 + "movl %0, %%esi \n\t" | |
| 2249 + ".balign 16 \n\t" | |
| 2250 + "1: \n\t" | |
| 2251 + "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3 | |
| 2252 + "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3 | |
| 2253 + "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3 | |
| 2254 + "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
| 2255 + "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1 | |
| 2256 + "addps %%xmm2, %%xmm0 \n\t" | |
| 2257 + "subps %%xmm2, %%xmm1 \n\t" | |
| 2258 + "movaps %%xmm0, (%%esi) \n\t" | |
| 2259 + "movaps %%xmm1, 16(%%esi) \n\t" | |
| 2260 + "addl $32, %%esi \n\t" | |
| 2261 + "cmpl %1, %%esi \n\t" | |
| 2262 + " jb 1b \n\t" | |
| 2263 + :: "g" (buf), "r" (buf + 128) | |
| 2264 + : "%esi" | |
| 2265 + ); | |
| 2266 + | |
| 2267 + /* 3. iteration */ | |
| 2268 +/* | |
| 2269 + Note sseW2+0={1,1,sqrt(2),sqrt(2)) | |
| 2270 + Note sseW2+16={0,0,sqrt(2),-sqrt(2)) | |
| 2271 + Note sseW2+32={0,0,-sqrt(2),-sqrt(2)) | |
| 2272 + Note sseW2+48={1,-1,sqrt(2),-sqrt(2)) | |
| 2273 +*/ | |
| 2274 + asm volatile( | |
| 2275 + "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t" | |
| 2276 + "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t" | |
| 2277 + "xorps %%xmm5, %%xmm5 \n\t" | |
| 2278 + "xorps %%xmm2, %%xmm2 \n\t" | |
| 2279 + "movl %0, %%esi \n\t" | |
| 2280 + ".balign 16 \n\t" | |
| 2281 + "1: \n\t" | |
| 2282 + "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5 | |
| 2283 + "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7 | |
| 2284 + "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5 | |
| 2285 + "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7 | |
| 2286 + "mulps %%xmm2, %%xmm4 \n\t" | |
| 2287 + "mulps %%xmm3, %%xmm5 \n\t" | |
| 2288 + "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5 | |
| 2289 + "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7 | |
| 2290 + "mulps %%xmm6, %%xmm3 \n\t" | |
| 2291 + "mulps %%xmm7, %%xmm2 \n\t" | |
| 2292 + "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1 | |
| 2293 + "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3 | |
| 2294 + "addps %%xmm4, %%xmm2 \n\t" | |
| 2295 + "addps %%xmm5, %%xmm3 \n\t" | |
| 2296 + "movaps %%xmm2, %%xmm4 \n\t" | |
| 2297 + "movaps %%xmm3, %%xmm5 \n\t" | |
| 2298 + "addps %%xmm0, %%xmm2 \n\t" | |
| 2299 + "addps %%xmm1, %%xmm3 \n\t" | |
| 2300 + "subps %%xmm4, %%xmm0 \n\t" | |
| 2301 + "subps %%xmm5, %%xmm1 \n\t" | |
| 2302 + "movaps %%xmm2, (%%esi) \n\t" | |
| 2303 + "movaps %%xmm3, 16(%%esi) \n\t" | |
| 2304 + "movaps %%xmm0, 32(%%esi) \n\t" | |
| 2305 + "movaps %%xmm1, 48(%%esi) \n\t" | |
| 2306 + "addl $64, %%esi \n\t" | |
| 2307 + "cmpl %1, %%esi \n\t" | |
| 2308 + " jb 1b \n\t" | |
| 2309 + :: "g" (buf), "r" (buf + 128) | |
| 2310 + : "%esi" | |
| 2311 + ); | |
| 2312 + | |
| 2313 + /* 4-7. iterations */ | |
| 2314 + for (m=3; m < 7; m++) { | |
| 2315 + two_m = (1 << m); | |
| 2316 + two_m_plus_one = two_m<<1; | |
| 2317 + asm volatile( | |
| 2318 + "movl %0, %%esi \n\t" | |
| 2319 + ".balign 16 \n\t" | |
| 2320 + "1: \n\t" | |
| 2321 + "xorl %%edi, %%edi \n\t" // k | |
| 2322 + "leal (%%esi, %3), %%edx \n\t" | |
| 2323 + "2: \n\t" | |
| 2324 + "movaps (%%edx, %%edi), %%xmm1 \n\t" | |
| 2325 + "movaps (%4, %%edi, 2), %%xmm2 \n\t" | |
| 2326 + "mulps %%xmm1, %%xmm2 \n\t" | |
| 2327 + "shufps $0xB1, %%xmm1, %%xmm1 \n\t" | |
| 2328 + "mulps 16(%4, %%edi, 2), %%xmm1 \n\t" | |
| 2329 + "movaps (%%esi, %%edi), %%xmm0 \n\t" | |
| 2330 + "addps %%xmm2, %%xmm1 \n\t" | |
| 2331 + "movaps %%xmm1, %%xmm2 \n\t" | |
| 2332 + "addps %%xmm0, %%xmm1 \n\t" | |
| 2333 + "subps %%xmm2, %%xmm0 \n\t" | |
| 2334 + "movaps %%xmm1, (%%esi, %%edi) \n\t" | |
| 2335 + "movaps %%xmm0, (%%edx, %%edi) \n\t" | |
| 2336 + "addl $16, %%edi \n\t" | |
| 2337 + "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0 | |
| 2338 + " jb 2b \n\t" | |
| 2339 + "addl %2, %%esi \n\t" | |
| 2340 + "cmpl %1, %%esi \n\t" | |
| 2341 + " jb 1b \n\t" | |
| 2342 + :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3), | |
| 2343 + "r" (sseW[m]) | |
| 2344 + : "%esi", "%edi", "%edx" | |
| 2345 + ); | |
| 2346 + } | |
| 2347 + | |
| 2348 + /* Post IFFT complex multiply plus IFFT complex conjugate*/ | |
| 2349 + asm volatile( | |
| 2350 + "movl $-1024, %%esi \n\t" | |
| 2351 + ".balign 16 \n\t" | |
| 2352 + "1: \n\t" | |
| 2353 + "movaps (%0, %%esi), %%xmm0 \n\t" | |
| 2354 + "movaps (%0, %%esi), %%xmm1 \n\t" | |
| 2355 + "shufps $0xB1, %%xmm0, %%xmm0 \n\t" | |
| 2356 + "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t" | |
| 2357 + "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t" | |
| 2358 + "addps %%xmm1, %%xmm0 \n\t" | |
| 2359 + "movaps %%xmm0, (%0, %%esi) \n\t" | |
| 2360 + "addl $16, %%esi \n\t" | |
| 2361 + " jnz 1b \n\t" | |
| 2362 + :: "r" (buf+128) | |
| 2363 + : "%esi" | |
| 2364 + ); | |
| 2365 + | |
| 2366 + | |
| 2367 + data_ptr = data; | |
| 2368 + delay_ptr = delay; | |
| 2369 + window_ptr = imdct_window; | |
| 2370 + | |
| 2371 + /* Window and convert to real valued signal */ | |
| 2372 + asm volatile( | |
| 2373 + "xorl %%edi, %%edi \n\t" // 0 | |
| 2374 + "xorl %%esi, %%esi \n\t" // 0 | |
| 2375 + "movss %3, %%xmm2 \n\t" // bias | |
| 2376 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
| 2377 + ".balign 16 \n\t" | |
| 2378 + "1: \n\t" | |
| 2379 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
| 2380 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
| 2381 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
| 2382 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
| 2383 + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
| 2384 + "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
| 2385 + "addps (%2, %%esi), %%xmm0 \n\t" | |
| 2386 + "addps %%xmm2, %%xmm0 \n\t" | |
| 2387 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 2388 + "addl $16, %%esi \n\t" | |
| 2389 + "subl $16, %%edi \n\t" | |
| 2390 + "cmpl $512, %%esi \n\t" | |
| 2391 + " jb 1b \n\t" | |
| 2392 + :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
| 2393 + : "%esi", "%edi" | |
| 2394 + ); | |
| 2395 + data_ptr+=128; | |
| 2396 + delay_ptr+=128; | |
| 2397 +// window_ptr+=128; | |
| 2398 + | |
| 2399 + asm volatile( | |
| 2400 + "movl $1024, %%edi \n\t" // 512 | |
| 2401 + "xorl %%esi, %%esi \n\t" // 0 | |
| 2402 + "movss %3, %%xmm2 \n\t" // bias | |
| 2403 + "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ... | |
| 2404 + ".balign 16 \n\t" | |
| 2405 + "1: \n\t" | |
| 2406 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
| 2407 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
| 2408 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
| 2409 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
| 2410 + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
| 2411 + "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
| 2412 + "addps (%2, %%esi), %%xmm0 \n\t" | |
| 2413 + "addps %%xmm2, %%xmm0 \n\t" | |
| 2414 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 2415 + "addl $16, %%esi \n\t" | |
| 2416 + "subl $16, %%edi \n\t" | |
| 2417 + "cmpl $512, %%esi \n\t" | |
| 2418 + " jb 1b \n\t" | |
| 2419 + :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias) | |
| 2420 + : "%esi", "%edi" | |
| 2421 + ); | |
| 2422 + data_ptr+=128; | |
| 2423 +// window_ptr+=128; | |
| 2424 + | |
| 2425 + /* The trailing edge of the window goes into the delay line */ | |
| 2426 + delay_ptr = delay; | |
| 2427 + | |
| 2428 + asm volatile( | |
| 2429 + "xorl %%edi, %%edi \n\t" // 0 | |
| 2430 + "xorl %%esi, %%esi \n\t" // 0 | |
| 2431 + ".balign 16 \n\t" | |
| 2432 + "1: \n\t" | |
| 2433 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A | |
| 2434 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C | |
| 2435 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C | |
| 2436 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A | |
| 2437 + "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A | |
| 2438 + "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
| 2439 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 2440 + "addl $16, %%esi \n\t" | |
| 2441 + "subl $16, %%edi \n\t" | |
| 2442 + "cmpl $512, %%esi \n\t" | |
| 2443 + " jb 1b \n\t" | |
| 2444 + :: "r" (buf+64), "r" (delay_ptr) | |
| 2445 + : "%esi", "%edi" | |
| 2446 + ); | |
| 2447 + delay_ptr+=128; | |
| 2448 +// window_ptr-=128; | |
| 2449 + | |
| 2450 + asm volatile( | |
| 2451 + "movl $1024, %%edi \n\t" // 1024 | |
| 2452 + "xorl %%esi, %%esi \n\t" // 0 | |
| 2453 + ".balign 16 \n\t" | |
| 2454 + "1: \n\t" | |
| 2455 + "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ? | |
| 2456 + "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ? | |
| 2457 + "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ? | |
| 2458 + "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ? | |
| 2459 + "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A | |
| 2460 + "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t" | |
| 2461 + "movaps %%xmm0, (%1, %%esi) \n\t" | |
| 2462 + "addl $16, %%esi \n\t" | |
| 2463 + "subl $16, %%edi \n\t" | |
| 2464 + "cmpl $512, %%esi \n\t" | |
| 2465 + " jb 1b \n\t" | |
| 2466 + :: "r" (buf), "r" (delay_ptr) | |
| 2467 + : "%esi", "%edi" | |
| 2468 + ); | |
| 2469 +} | |
| 2470 +#endif // ARCH_X86 | |
| 2471 + | |
| 2472 void | |
| 2473 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias) | |
| 2474 { | |
| 2475 @@ -379,13 +1233,19 @@ | |
| 2476 { | |
| 2477 int i, j, k; | |
| 2478 | |
| 2479 - fprintf (stderr, "No accelerated IMDCT transform found\n"); | |
| 2480 - | |
| 2481 /* Twiddle factors to turn IFFT into IMDCT */ | |
| 2482 for (i = 0; i < 128; i++) { | |
| 2483 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1)); | |
| 2484 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1)); | |
| 2485 } | |
| 2486 +#ifdef ARCH_X86 | |
| 2487 + for (i = 0; i < 128; i++) { | |
| 2488 + sseSinCos1c[2*i+0]= xcos1[i]; | |
| 2489 + sseSinCos1c[2*i+1]= -xcos1[i]; | |
| 2490 + sseSinCos1d[2*i+0]= xsin1[i]; | |
| 2491 + sseSinCos1d[2*i+1]= xsin1[i]; | |
| 2492 + } | |
| 2493 +#endif | |
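/* The sign interleaving (c,-c and s,s) bakes the final negation of the
   imaginary component into the tables, so the pre-IFFT asm in
   imdct_do_512_sse gets both output components from one
   mulps/mulps/subps sequence. */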
| 2494 | |
| 2495 /* More twiddle factors to turn IFFT into IMDCT */ | |
| 2496 for (i = 0; i < 64; i++) { | |
| 2497 @@ -400,7 +1260,334 @@ | |
| 2498 w[i][k].imag = sin (-M_PI * k / j); | |
| 2499 } | |
| 2500 } | |
| 2501 +#ifdef ARCH_X86 | |
| 2502 + for (i = 1; i < 7; i++) { | |
| 2503 + j = 1 << i; | |
| 2504 + for (k = 0; k < j; k+=2) { | |
| 2505 + | |
| 2506 + sseW[i][4*k + 0] = w[i][k+0].real; | |
| 2507 + sseW[i][4*k + 1] = w[i][k+0].real; | |
| 2508 + sseW[i][4*k + 2] = w[i][k+1].real; | |
| 2509 + sseW[i][4*k + 3] = w[i][k+1].real; | |
| 2510 + | |
| 2511 + sseW[i][4*k + 4] = -w[i][k+0].imag; | |
| 2512 + sseW[i][4*k + 5] = w[i][k+0].imag; | |
| 2513 + sseW[i][4*k + 6] = -w[i][k+1].imag; | |
| 2514 + sseW[i][4*k + 7] = w[i][k+1].imag; | |
| 2515 + | |
| 2516 + // we multiply more or less uninitialized numbers, so we need to use exactly 0.0 | |
| 2517 + if(k==0) | |
| 2518 + { | |
| 2519 +// sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0; | |
| 2520 + sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0; | |
| 2521 + } | |
| 2522 + | |
| 2523 + if(2*k == j) | |
| 2524 + { | |
| 2525 + sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0; | |
| 2526 +// sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0); | |
| 2527 + } | |
| 2528 + } | |
| 2529 + } | |
| 2530 + | |
| 2531 + for(i=0; i<128; i++) | |
| 2532 + { | |
| 2533 + sseWindow[2*i+0]= -imdct_window[2*i+0]; | |
| 2534 + sseWindow[2*i+1]= imdct_window[2*i+1]; | |
| 2535 + } | |
| 2536 + | |
| 2537 + for(i=0; i<64; i++) | |
| 2538 + { | |
| 2539 + sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1]; | |
| 2540 + sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0]; | |
| 2541 + sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1]; | |
| 2542 + sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0]; | |
| 2543 + } | |
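/* sseWindow folds the sign flips and the reversed (*--window_ptr) walks
   of the C windowing loops into one flat 512-entry table, so each of the
   four windowing asm loops in imdct_do_512_sse is a plain forward mulps
   pass. */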
| 2544 +#endif // ARCH_X86 | |
| 2545 + | |
| 2546 imdct_512 = imdct_do_512; | |
| 2547 +#ifdef ARCH_X86 | |
| 2548 + if(mm_accel & MM_ACCEL_X86_SSE) | |
| 2549 + { | |
| 2550 + fprintf (stderr, "Using SSE optimized IMDCT transform\n"); | |
| 2551 + imdct_512 = imdct_do_512_sse; | |
| 2552 + } | |
| 2553 + else | |
| 2554 + if(mm_accel & MM_ACCEL_X86_3DNOWEXT) | |
| 2555 + { | |
| 2556 + fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n"); | |
| 2557 + imdct_512 = imdct_do_512_3dnowex; | |
| 2558 + } | |
| 2559 + else | |
| 2560 + if(mm_accel & MM_ACCEL_X86_3DNOW) | |
| 2561 + { | |
| 2562 + fprintf (stderr, "Using 3DNow optimized IMDCT transform\n"); | |
| 2563 + imdct_512 = imdct_do_512_3dnow; | |
| 2564 + } | |
| 2565 + else | |
| 2566 +#endif // ARCH_X86 | |
| 2567 +#ifdef HAVE_ALTIVEC | |
| 2568 + if (mm_accel & MM_ACCEL_PPC_ALTIVEC) | |
| 2569 + { | |
| 2570 + fprintf(stderr, "Using AltiVec optimized IMDCT transform\n"); | |
| 2571 + imdct_512 = imdct_do_512_altivec; | |
| 2572 + } | |
| 2573 + else | |
| 2574 +#endif | |
| 2575 + fprintf (stderr, "No accelerated IMDCT transform found\n"); | |
| 2576 imdct_256 = imdct_do_256; | |
| 2577 } | |
| 2578 } | |
| 2579 + | |
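/* Caller-side view (sketch): a52_init() in parse.c, as patched below,
   passes its mm_accel argument here, so the selection above reduces to

       uint32_t accel = mm_accel();
       a52_state_t *state = a52_init(accel);  /* picks the IMDCT variant */
*/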
| 2580 +static void fft_asmb(int k, complex_t *x, complex_t *wTB, | |
| 2581 + const complex_t *d, const complex_t *d_3) | |
| 2582 +{ | |
| 2583 + register complex_t *x2k, *x3k, *x4k, *wB; | |
| 2584 + register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
| 2585 + | |
| 2586 + x2k = x + 2 * k; | |
| 2587 + x3k = x2k + 2 * k; | |
| 2588 + x4k = x3k + 2 * k; | |
| 2589 + wB = wTB + 2 * k; | |
| 2590 + | |
| 2591 + TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]); | |
| 2592 + TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]); | |
| 2593 + | |
| 2594 + --k; | |
| 2595 + for(;;) { | |
| 2596 + TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]); | |
| 2597 + TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]); | |
| 2598 + if (!--k) break; | |
| 2599 + x += 2; | |
| 2600 + x2k += 2; | |
| 2601 + x3k += 2; | |
| 2602 + x4k += 2; | |
| 2603 + d += 2; | |
| 2604 + d_3 += 2; | |
| 2605 + wTB += 2; | |
| 2606 + wB += 2; | |
| 2607 + } | |
| 2608 + | |
| 2609 +} | |
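/* TRANSZERO/TRANS/TRANSHALF_16 are combine macros pulled in from srfftp.h
   above; each expands to one radix-4 butterfly over its four complex
   operands (a sketch-level reading -- see srfftp.h for the definitions). */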
| 2610 + | |
| 2611 +static void fft_asmb16(complex_t *x, complex_t *wTB) | |
| 2612 +{ | |
| 2613 + register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i; | |
| 2614 + int k = 2; | |
| 2615 + | |
| 2616 + /* transform x[0], x[8], x[4], x[12] */ | |
| 2617 + TRANSZERO(x[0],x[4],x[8],x[12]); | |
| 2618 + | |
| 2619 + /* transform x[1], x[9], x[5], x[13] */ | |
| 2620 + TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]); | |
| 2621 + | |
| 2622 + /* transform x[2], x[10], x[6], x[14] */ | |
| 2623 + TRANSHALF_16(x[2],x[6],x[10],x[14]); | |
| 2624 + | |
| 2625 + /* transform x[3], x[11], x[7], x[15] */ | |
| 2626 + TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]); | |
| 2627 + | |
| 2628 +} | |
| 2629 + | |
| 2630 +static void fft_4(complex_t *x) | |
| 2631 +{ | |
| 2632 + /* delta_p = 1 here */ | |
| 2633 + /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4} | |
| 2634 + */ | |
| 2635 + | |
| 2636 + register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i; | |
| 2637 + | |
| 2638 + yt_r = x[0].real; | |
| 2639 + yb_r = yt_r - x[2].real; | |
| 2640 + yt_r += x[2].real; | |
| 2641 + | |
| 2642 + u_r = x[1].real; | |
| 2643 + vi_i = x[3].real - u_r; | |
| 2644 + u_r += x[3].real; | |
| 2645 + | |
| 2646 + u_i = x[1].imag; | |
| 2647 + vi_r = u_i - x[3].imag; | |
| 2648 + u_i += x[3].imag; | |
| 2649 + | |
| 2650 + yt_i = yt_r; | |
| 2651 + yt_i += u_r; | |
| 2652 + x[0].real = yt_i; | |
| 2653 + yt_r -= u_r; | |
| 2654 + x[2].real = yt_r; | |
| 2655 + yt_i = yb_r; | |
| 2656 + yt_i += vi_r; | |
| 2657 + x[1].real = yt_i; | |
| 2658 + yb_r -= vi_r; | |
| 2659 + x[3].real = yb_r; | |
| 2660 + | |
| 2661 + yt_i = x[0].imag; | |
| 2662 + yb_i = yt_i - x[2].imag; | |
| 2663 + yt_i += x[2].imag; | |
| 2664 + | |
| 2665 + yt_r = yt_i; | |
| 2666 + yt_r += u_i; | |
| 2667 + x[0].imag = yt_r; | |
| 2668 + yt_i -= u_i; | |
| 2669 + x[2].imag = yt_i; | |
| 2670 + yt_r = yb_i; | |
| 2671 + yt_r += vi_i; | |
| 2672 + x[1].imag = yt_r; | |
| 2673 + yb_i -= vi_i; | |
| 2674 + x[3].imag = yb_i; | |
| 2675 +} | |
| 2676 + | |
| 2677 + | |
| 2678 +static void fft_8(complex_t *x) | |
| 2679 +{ | |
| 2680 + /* delta_p = diag{1, sqrt(i)} here */ | |
| 2681 + /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8} | |
| 2682 + */ | |
| 2683 + register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i; | |
| 2684 + | |
| 2685 + wT1_r = x[1].real; | |
| 2686 + wT1_i = x[1].imag; | |
| 2687 + wB1_r = x[3].real; | |
| 2688 + wB1_i = x[3].imag; | |
| 2689 + | |
| 2690 + x[1] = x[2]; | |
| 2691 + x[2] = x[4]; | |
| 2692 + x[3] = x[6]; | |
| 2693 + fft_4(&x[0]); | |
| 2694 + | |
| 2695 + | |
| 2696 + /* x[0] x[4] */ | |
| 2697 + wT2_r = x[5].real; | |
| 2698 + wT2_r += x[7].real; | |
| 2699 + wT2_r += wT1_r; | |
| 2700 + wT2_r += wB1_r; | |
| 2701 + wT2_i = wT2_r; | |
| 2702 + wT2_r += x[0].real; | |
| 2703 + wT2_i = x[0].real - wT2_i; | |
| 2704 + x[0].real = wT2_r; | |
| 2705 + x[4].real = wT2_i; | |
| 2706 + | |
| 2707 + wT2_i = x[5].imag; | |
| 2708 + wT2_i += x[7].imag; | |
| 2709 + wT2_i += wT1_i; | |
| 2710 + wT2_i += wB1_i; | |
| 2711 + wT2_r = wT2_i; | |
| 2712 + wT2_r += x[0].imag; | |
| 2713 + wT2_i = x[0].imag - wT2_i; | |
| 2714 + x[0].imag = wT2_r; | |
| 2715 + x[4].imag = wT2_i; | |
| 2716 + | |
| 2717 + /* x[2] x[6] */ | |
| 2718 + wT2_r = x[5].imag; | |
| 2719 + wT2_r -= x[7].imag; | |
| 2720 + wT2_r += wT1_i; | |
| 2721 + wT2_r -= wB1_i; | |
| 2722 + wT2_i = wT2_r; | |
| 2723 + wT2_r += x[2].real; | |
| 2724 + wT2_i = x[2].real - wT2_i; | |
| 2725 + x[2].real = wT2_r; | |
| 2726 + x[6].real = wT2_i; | |
| 2727 + | |
| 2728 + wT2_i = x[5].real; | |
| 2729 + wT2_i -= x[7].real; | |
| 2730 + wT2_i += wT1_r; | |
| 2731 + wT2_i -= wB1_r; | |
| 2732 + wT2_r = wT2_i; | |
| 2733 + wT2_r += x[2].imag; | |
| 2734 + wT2_i = x[2].imag - wT2_i; | |
| 2735 + x[2].imag = wT2_i; | |
| 2736 + x[6].imag = wT2_r; | |
| 2737 + | |
| 2738 + | |
| 2739 + /* x[1] x[5] */ | |
| 2740 + wT2_r = wT1_r; | |
| 2741 + wT2_r += wB1_i; | |
| 2742 + wT2_r -= x[5].real; | |
| 2743 + wT2_r -= x[7].imag; | |
| 2744 + wT2_i = wT1_i; | |
| 2745 + wT2_i -= wB1_r; | |
| 2746 + wT2_i -= x[5].imag; | |
| 2747 + wT2_i += x[7].real; | |
| 2748 + | |
| 2749 + wB2_r = wT2_r; | |
| 2750 + wB2_r += wT2_i; | |
| 2751 + wT2_i -= wT2_r; | |
| 2752 + wB2_r *= HSQRT2; | |
| 2753 + wT2_i *= HSQRT2; | |
| 2754 + wT2_r = wB2_r; | |
| 2755 + wB2_r += x[1].real; | |
| 2756 + wT2_r = x[1].real - wT2_r; | |
| 2757 + | |
| 2758 + wB2_i = x[5].real; | |
| 2759 + x[1].real = wB2_r; | |
| 2760 + x[5].real = wT2_r; | |
| 2761 + | |
| 2762 + wT2_r = wT2_i; | |
| 2763 + wT2_r += x[1].imag; | |
| 2764 + wT2_i = x[1].imag - wT2_i; | |
| 2765 + wB2_r = x[5].imag; | |
| 2766 + x[1].imag = wT2_r; | |
| 2767 + x[5].imag = wT2_i; | |
| 2768 + | |
| 2769 + /* x[3] x[7] */ | |
| 2770 + wT1_r -= wB1_i; | |
| 2771 + wT1_i += wB1_r; | |
| 2772 + wB1_r = wB2_i - x[7].imag; | |
| 2773 + wB1_i = wB2_r + x[7].real; | |
| 2774 + wT1_r -= wB1_r; | |
| 2775 + wT1_i -= wB1_i; | |
| 2776 + wB1_r = wT1_r + wT1_i; | |
| 2777 + wB1_r *= HSQRT2; | |
| 2778 + wT1_i -= wT1_r; | |
| 2779 + wT1_i *= HSQRT2; | |
| 2780 + wB2_r = x[3].real; | |
| 2781 + wB2_i = wB2_r + wT1_i; | |
| 2782 + wB2_r -= wT1_i; | |
| 2783 + x[3].real = wB2_i; | |
| 2784 + x[7].real = wB2_r; | |
| 2785 + wB2_i = x[3].imag; | |
| 2786 + wB2_r = wB2_i + wB1_r; | |
| 2787 + wB2_i -= wB1_r; | |
| 2788 + x[3].imag = wB2_i; | |
| 2789 + x[7].imag = wB2_r; | |
| 2790 +} | |
| 2791 + | |
| 2792 + | |
| 2793 +static void fft_128p(complex_t *a) | |
| 2794 +{ | |
| 2795 + fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]); | |
| 2796 + fft_asmb16(&a[0], &a[8]); | |
| 2797 + | |
| 2798 + fft_8(&a[16]); fft_8(&a[24]); | |
| 2799 + fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]); | |
| 2800 + | |
| 2801 + fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]); | |
| 2802 + fft_asmb16(&a[32], &a[40]); | |
| 2803 + | |
| 2804 + fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]); | |
| 2805 + fft_asmb16(&a[48], &a[56]); | |
| 2806 + | |
| 2807 + fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]); | |
| 2808 + | |
| 2809 + fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]); | |
| 2810 + /* fft_16(&a[64]); */ | |
| 2811 + fft_asmb16(&a[64], &a[72]); | |
| 2812 + | |
| 2813 + fft_8(&a[80]); fft_8(&a[88]); | |
| 2814 + | |
| 2815 + /* fft_32(&a[64]); */ | |
| 2816 + fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]); | |
| 2817 + | |
| 2818 + fft_8(&a[96]); fft_4(&a[104]); fft_4(&a[108]); | |
| 2819 + /* fft_16(&a[96]); */ | |
| 2820 + fft_asmb16(&a[96], &a[104]); | |
| 2821 + | |
| 2822 + fft_8(&a[112]); fft_8(&a[120]); | |
| 2823 + /* fft_32(&a[96]); */ | |
| 2824 + fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]); | |
| 2825 + | |
| 2826 + /* fft_128(&a[0]); */ | |
| 2827 + fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]); | |
| 2828 +} | |
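/* fft_128p builds the 128-point transform bottom-up in split-radix style:
   8- and 4-point blocks are merged into 16-point pieces by fft_asmb16,
   then into 32-, 64- and finally 128-point stages by fft_asmb with the
   delta32/delta64/delta128 twiddle tables. */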
| 2829 + | |
| 2830 + | |
| 2831 + | |
| 2832 --- liba52/imdct_mlib.c 2005-03-22 19:59:35.000000000 +0100 | |
| 2833 +++ imdct_mlib.c 2004-03-19 01:15:51.000000000 +0100 | |
| 2834 @@ -23,11 +29,11 @@ | |
| 2835 | |
| 2836 #ifdef LIBA52_MLIB | |
| 2837 | |
| 2838 -#include <inttypes.h> | |
| 2839 -#include <string.h> | |
| 2840 #include <mlib_types.h> | |
| 2841 #include <mlib_status.h> | |
| 2842 #include <mlib_signal.h> | |
| 2843 +#include <string.h> | |
| 2844 +#include <inttypes.h> | |
| 2845 | |
| 2846 #include "a52.h" | |
| 2847 #include "a52_internal.h" | |
| 2848 @@ -42,7 +48,7 @@ | |
| 2849 sample_t *data_ptr; | |
| 2850 sample_t *delay_ptr; | |
| 2851 sample_t *window_ptr; | |
| 2852 - sample_t tmp[256] __attribute__ ((__aligned__ (16))); | |
| 2853 + sample_t tmp[256] __attribute__((aligned(16))); | |
| 2854 int i; | |
| 2855 | |
| 2856 memcpy(tmp, data, 256 * sizeof(sample_t)); | |
| 2857 @@ -91,7 +97,7 @@ | |
| 2858 sample_t *data_ptr; | |
| 2859 sample_t *delay_ptr; | |
| 2860 sample_t *window_ptr; | |
| 2861 - sample_t tmp[256] __attribute__ ((__aligned__ (16))); | |
| 2862 + sample_t tmp[256] __attribute__((aligned(16))); | |
| 2863 int i; | |
| 2864 | |
| 2865 memcpy(tmp, data, 256 * sizeof(sample_t)); | |
| 2866 --- include/mm_accel.h 2005-03-22 19:58:53.000000000 +0100 | |
| 2867 +++ mm_accel.h 2004-03-19 01:15:52.000000000 +0100 | |
| 2868 @@ -19,12 +25,22 @@ | |
| 2869 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 2870 */ | |
| 2871 | |
| 2872 +#ifndef MM_ACCEL_H | |
| 2873 +#define MM_ACCEL_H | |
| 2874 + | |
| 2875 /* generic accelerations */ | |
| 2876 #define MM_ACCEL_MLIB 0x00000001 | |
| 2877 | |
| 2878 /* x86 accelerations */ | |
| 2879 #define MM_ACCEL_X86_MMX 0x80000000 | |
| 2880 #define MM_ACCEL_X86_3DNOW 0x40000000 | |
| 2881 +#define MM_ACCEL_X86_3DNOWEXT 0x08000000 | |
| 2882 #define MM_ACCEL_X86_MMXEXT 0x20000000 | |
| 2883 +#define MM_ACCEL_X86_SSE 0x10000000 | |
| 2884 + | |
| 2885 +/* PPC accelerations */ | |
| 2886 +#define MM_ACCEL_PPC_ALTIVEC 0x00010000 | |
| 2887 | |
| 2888 uint32_t mm_accel (void); | |
| 2889 + | |
| 2890 +#endif /* MM_ACCEL_H */ | |
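The MM_ACCEL_* values are disjoint bits, so callers test them with plain
bitwise ops against the mask returned by mm_accel(). A sketch:

    uint32_t accel = mm_accel();
    if (accel & MM_ACCEL_X86_SSE)
        /* take the SSE code path */;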
| 2891 --- liba52/parse.c 2005-03-22 19:59:35.000000000 +0100 | |
| 2892 +++ parse.c 2004-04-01 15:41:29.000000000 +0200 | |
| 2893 @@ -21,21 +27,19 @@ | |
| 2894 | |
| 2895 #include "config.h" | |
| 2896 | |
| 2897 -#include <inttypes.h> | |
| 2898 #include <stdlib.h> | |
| 2899 #include <string.h> | |
| 2900 +#include <inttypes.h> | |
| 2901 | |
| 2902 #include "a52.h" | |
| 2903 #include "a52_internal.h" | |
| 2904 #include "bitstream.h" | |
| 2905 #include "tables.h" | |
| 2906 +#include "mm_accel.h" | |
| 2907 | |
| 2908 #ifdef HAVE_MEMALIGN | |
| 2909 /* some systems have memalign() but no declaration for it */ | |
| 2910 void * memalign (size_t align, size_t size); | |
| 2911 -#else | |
| 2912 -/* assume malloc alignment is sufficient */ | |
| 2913 -#define memalign(align,size) malloc (size) | |
| 2914 #endif | |
| 2915 | |
| 2916 typedef struct { | |
| 2917 @@ -54,12 +58,28 @@ | |
| 2918 sample_t * samples; | |
| 2919 int i; | |
| 2920 | |
| 2921 - imdct_init (mm_accel); | |
| 2922 - | |
| 2923 samples = memalign (16, 256 * 12 * sizeof (sample_t)); | |
| 2924 +#if defined(__MINGW32__) && defined(HAVE_SSE) | |
| 2925 + for(i=0;i<10;i++){ | |
| 2926 + if((int)samples%16){ | |
| 2927 + sample_t* samplestmp=malloc(256 * 12 * sizeof (sample_t)); | |
| 2928 + free(samples); | |
| 2929 + samples = samplestmp; | |
| 2930 + } | |
| 2931 + else break; | |
| 2932 + } | |
| 2933 +#endif | |
| 2934 + if(((int)samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ | |
| 2935 + mm_accel &=~MM_ACCEL_X86_SSE; | |
| 2936 + printf("liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n"); | |
| 2937 + } | |
| 2938 + | |
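/* Not part of the patch: a deterministic alternative to the retry loop
   above is to over-allocate and round up (names hypothetical; assumes
   <stdint.h> and that the raw pointer is kept around for free()):

       void *raw = malloc(size + 15);
       sample_t *aligned =
           (sample_t *)(((uintptr_t)raw + 15) & ~(uintptr_t)15);
*/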
| 2939 if (samples == NULL) | |
| 2940 - return NULL; | |
| 2941 - | |
| 2942 + return NULL; | |
| 2943 + | |
| 2944 + imdct_init (mm_accel); | |
| 2945 + downmix_accel_init(mm_accel); | |
| 2946 + | |
| 2947 for (i = 0; i < 256 * 12; i++) | |
| 2948 samples[i] = 0; | |
| 2949 | |
| 2950 @@ -124,7 +144,7 @@ | |
| 2951 state->acmod = acmod = buf[6] >> 5; | |
| 2952 | |
| 2953 bitstream_set_ptr (buf + 6); | |
| 2954 - bitstream_get (3); /* skip acmod we already parsed */ | |
| 2955 + bitstream_skip (3); /* skip acmod we already parsed */ | |
| 2956 | |
| 2957 if ((acmod == 2) && (bitstream_get (2) == 2)) /* dsurmod */ | |
| 2958 acmod = A52_DOLBY; | |
| 2959 @@ -144,7 +164,7 @@ | |
| 2960 if (state->lfeon && (*flags & A52_LFE)) | |
| 2961 state->output |= A52_LFE; | |
| 2962 *flags = state->output; | |
| 2963 - // the 2* compensates for differences in imdct | |
| 2964 + /* the 2* compensates for differences in imdct */ | |
| 2965 state->dynrng = state->level = 2 * *level; | |
| 2966 state->bias = bias; | |
| 2967 state->dynrnge = 1; | |
| 2968 @@ -152,28 +172,28 @@ | |
| 2969 | |
| 2970 chaninfo = !acmod; | |
| 2971 do { | |
| 2972 - bitstream_get (5); /* dialnorm */ | |
| 2973 + bitstream_skip (5); /* dialnorm */ | |
| 2974 if (bitstream_get (1)) /* compre */ | |
| 2975 - bitstream_get (8); /* compr */ | |
| 2976 + bitstream_skip (8); /* compr */ | |
| 2977 if (bitstream_get (1)) /* langcode */ | |
| 2978 - bitstream_get (8); /* langcod */ | |
| 2979 + bitstream_skip (8); /* langcod */ | |
| 2980 if (bitstream_get (1)) /* audprodie */ | |
| 2981 - bitstream_get (7); /* mixlevel + roomtyp */ | |
| 2982 + bitstream_skip (7); /* mixlevel + roomtyp */ | |
| 2983 } while (chaninfo--); | |
| 2984 | |
| 2985 - bitstream_get (2); /* copyrightb + origbs */ | |
| 2986 + bitstream_skip (2); /* copyrightb + origbs */ | |
| 2987 | |
| 2988 if (bitstream_get (1)) /* timecod1e */ | |
| 2989 - bitstream_get (14); /* timecod1 */ | |
| 2990 + bitstream_skip (14); /* timecod1 */ | |
| 2991 if (bitstream_get (1)) /* timecod2e */ | |
| 2992 - bitstream_get (14); /* timecod2 */ | |
| 2993 + bitstream_skip (14); /* timecod2 */ | |
| 2994 | |
| 2995 if (bitstream_get (1)) { /* addbsie */ | |
| 2996 int addbsil; | |
| 2997 | |
| 2998 addbsil = bitstream_get (6); | |
| 2999 do { | |
| 3000 - bitstream_get (8); /* addbsi */ | |
| 3001 + bitstream_skip (8); /* addbsi */ | |
| 3002 } while (addbsil--); | |
| 3003 } | |
| 3004 | |
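/* bitstream_get(n) computes and returns the n bits read;
   bitstream_skip(n), introduced elsewhere in this patch, only advances
   the read position -- which is all these discarded header fields need. */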
| 3005 @@ -647,7 +667,7 @@ | |
| 3006 if (parse_exponents (chexpstr[i], nchgrps, state->fbw_exp[i][0], | |
| 3007 state->fbw_exp[i] + 1)) | |
| 3008 return 1; | |
| 3009 - bitstream_get (2); /* gainrng */ | |
| 3010 + bitstream_skip (2); /* gainrng */ | |
| 3011 } | |
| 3012 if (lfeexpstr != EXP_REUSE) { | |
| 3013 do_bit_alloc |= 32; | |
| 3014 @@ -729,7 +749,7 @@ | |
| 3015 if (bitstream_get (1)) { /* skiple */ | |
| 3016 i = bitstream_get (9); /* skipl */ | |
| 3017 while (i--) | |
| 3018 - bitstream_get (8); | |
| 3019 + bitstream_skip (8); | |
| 3020 } | |
| 3021 | |
| 3022 if (state->output & A52_LFE) | |
| 3023 |
