Mercurial > libavcodec.hg
annotate x86/fft_3dn2.c @ 12530:63edd10ad4bc libavcodec tip
Try to fix crashes introduced by r25218
r25218 made assumptions about the existence of past reference frames that
weren't necessarily true.
| author | darkshikari |
|---|---|
| date | Tue, 28 Sep 2010 09:06:22 +0000 |
| parents | b64b8e5a2d3a |
| children |
| rev | line source |
|---|---|
| 8430 | 1 /* |
| 2 * FFT/MDCT transform with Extended 3DNow! optimizations | |
| 3 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt | |
| 4 * | |
| 5 * This file is part of FFmpeg. | |
| 6 * | |
| 7 * FFmpeg is free software; you can redistribute it and/or | |
| 8 * modify it under the terms of the GNU Lesser General Public | |
| 9 * License as published by the Free Software Foundation; either | |
| 10 * version 2.1 of the License, or (at your option) any later version. | |
| 11 * | |
| 12 * FFmpeg is distributed in the hope that it will be useful, | |
| 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 15 * Lesser General Public License for more details. | |
| 16 * | |
| 17 * You should have received a copy of the GNU Lesser General Public | |
| 18 * License along with FFmpeg; if not, write to the Free Software | |
| 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 20 */ | |
| 21 | |
| 22 #include "libavutil/x86_cpu.h" | |
| 23 #include "libavcodec/dsputil.h" | |
|
10175
5cf49858179a
Move per-arch fft init bits into the corresponding subdirs
mru
parents:
8430
diff
changeset
|
24 #include "fft.h" |
| 8430 | 25 |
| 11369 | 26 DECLARE_ALIGNED(8, static const int, m1m1)[2] = { 1<<31, 1<<31 }; |
| 8430 | 27 |
/*
 * PSWAPD(src,dst) expands to asm text that places src, with its two 32-bit
 * halves exchanged, into dst.
 *
 * Without EMULATE_3DNOWEXT the native "pswapd" instruction is emitted.
 * With EMULATE_3DNOWEXT the swap is built from movq/psrlq/punpckldq, and the
 * public *_3dn2 names in this file are remapped to their *_3dn counterparts.
 */
#ifndef EMULATE_3DNOWEXT
#define PSWAPD(src,dst) "pswapd "#src","#dst"\n"
#else
#define PSWAPD(src,dst)\
    "movq "#src","#dst"\n"\
    "psrlq $32,"#dst"\n"\
    "punpckldq "#src","#dst"\n"
#define ff_fft_calc_3dn2 ff_fft_calc_3dn
#define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
#define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
#define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
#define ff_imdct_half_3dn2 ff_imdct_half_3dn
#endif
| 41 | |
| 42 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits); | |
| 43 void ff_fft_dispatch_interleave_3dn2(FFTComplex *z, int nbits); | |
| 44 | |
| 45 void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z) | |
| 46 { | |
| 47 int n = 1<<s->nbits; | |
| 48 int i; | |
| 49 ff_fft_dispatch_interleave_3dn2(z, s->nbits); | |
| 50 __asm__ volatile("femms"); | |
| 51 if(n <= 8) | |
| 52 for(i=0; i<n; i+=2) | |
| 53 FFSWAP(FFTSample, z[i].im, z[i+1].re); | |
| 54 } | |
| 55 | |
| 10199 | 56 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) |
| 8430 | 57 { |
| 58 x86_reg j, k; | |
|
12405
b64b8e5a2d3a
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits".
alexc
parents:
11369
diff
changeset
|
59 long n = s->mdct_size; |
| 8430 | 60 long n2 = n >> 1; |
| 61 long n4 = n >> 2; | |
| 62 long n8 = n >> 3; | |
| 10199 | 63 const uint16_t *revtab = s->revtab; |
| 8430 | 64 const FFTSample *tcos = s->tcos; |
| 65 const FFTSample *tsin = s->tsin; | |
| 66 const FFTSample *in1, *in2; | |
| 67 FFTComplex *z = (FFTComplex *)output; | |
| 68 | |
| 69 /* pre rotation */ | |
| 70 in1 = input; | |
| 71 in2 = input + n2 - 1; | |
| 72 #ifdef EMULATE_3DNOWEXT | |
| 73 __asm__ volatile("movd %0, %%mm7" ::"r"(1<<31)); | |
| 74 #endif | |
| 75 for(k = 0; k < n4; k++) { | |
| 76 // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it | |
| 77 __asm__ volatile( | |
| 78 "movd %0, %%mm0 \n" | |
| 79 "movd %2, %%mm1 \n" | |
| 80 "punpckldq %1, %%mm0 \n" | |
| 81 "punpckldq %3, %%mm1 \n" | |
| 82 "movq %%mm0, %%mm2 \n" | |
| 83 PSWAPD( %%mm1, %%mm3 ) | |
| 84 "pfmul %%mm1, %%mm0 \n" | |
| 85 "pfmul %%mm3, %%mm2 \n" | |
| 86 #ifdef EMULATE_3DNOWEXT | |
| 87 "movq %%mm0, %%mm1 \n" | |
| 88 "punpckhdq %%mm2, %%mm0 \n" | |
| 89 "punpckldq %%mm2, %%mm1 \n" | |
| 90 "pxor %%mm7, %%mm0 \n" | |
| 91 "pfadd %%mm1, %%mm0 \n" | |
| 92 #else | |
| 93 "pfpnacc %%mm2, %%mm0 \n" | |
| 94 #endif | |
| 95 ::"m"(in2[-2*k]), "m"(in1[2*k]), | |
| 96 "m"(tcos[k]), "m"(tsin[k]) | |
| 97 ); | |
| 98 __asm__ volatile( | |
| 99 "movq %%mm0, %0 \n\t" | |
| 100 :"=m"(z[revtab[k]]) | |
| 101 ); | |
| 102 } | |
| 103 | |
| 10199 | 104 ff_fft_dispatch_3dn2(z, s->nbits); |
| 8430 | 105 |
| 106 #define CMUL(j,mm0,mm1)\ | |
| 107 "movq (%2,"#j",2), %%mm6 \n"\ | |
| 108 "movq 8(%2,"#j",2), "#mm0"\n"\ | |
| 109 "movq %%mm6, "#mm1"\n"\ | |
| 110 "movq "#mm0",%%mm7 \n"\ | |
| 111 "pfmul (%3,"#j"), %%mm6 \n"\ | |
| 112 "pfmul (%4,"#j"), "#mm0"\n"\ | |
| 113 "pfmul (%4,"#j"), "#mm1"\n"\ | |
| 114 "pfmul (%3,"#j"), %%mm7 \n"\ | |
| 115 "pfsub %%mm6, "#mm0"\n"\ | |
| 116 "pfadd %%mm7, "#mm1"\n" | |
| 117 | |
| 118 /* post rotation */ | |
| 119 j = -n2; | |
| 120 k = n2-8; | |
| 121 __asm__ volatile( | |
| 122 "1: \n" | |
| 123 CMUL(%0, %%mm0, %%mm1) | |
| 124 CMUL(%1, %%mm2, %%mm3) | |
| 125 "movd %%mm0, (%2,%0,2) \n" | |
| 126 "movd %%mm1,12(%2,%1,2) \n" | |
| 127 "movd %%mm2, (%2,%1,2) \n" | |
| 128 "movd %%mm3,12(%2,%0,2) \n" | |
| 129 "psrlq $32, %%mm0 \n" | |
| 130 "psrlq $32, %%mm1 \n" | |
| 131 "psrlq $32, %%mm2 \n" | |
| 132 "psrlq $32, %%mm3 \n" | |
| 133 "movd %%mm0, 8(%2,%0,2) \n" | |
| 134 "movd %%mm1, 4(%2,%1,2) \n" | |
| 135 "movd %%mm2, 8(%2,%1,2) \n" | |
| 136 "movd %%mm3, 4(%2,%0,2) \n" | |
| 137 "sub $8, %1 \n" | |
| 138 "add $8, %0 \n" | |
| 139 "jl 1b \n" | |
| 140 :"+r"(j), "+r"(k) | |
| 141 :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8) | |
| 142 :"memory" | |
| 143 ); | |
| 144 __asm__ volatile("femms"); | |
| 145 } | |
| 146 | |
| 10199 | 147 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input) |
| 8430 | 148 { |
| 149 x86_reg j, k; | |
|
12405
b64b8e5a2d3a
imdct/x86: Use "s->mdct_size" instead of "1 << s->mdct_bits".
alexc
parents:
11369
diff
changeset
|
150 long n = s->mdct_size; |
| 8430 | 151 long n4 = n >> 2; |
| 152 | |
| 153 ff_imdct_half_3dn2(s, output+n4, input); | |
| 154 | |
| 155 j = -n; | |
| 156 k = n-8; | |
| 157 __asm__ volatile( | |
| 158 "movq %4, %%mm7 \n" | |
| 159 "1: \n" | |
| 160 PSWAPD((%2,%1), %%mm0) | |
| 161 PSWAPD((%3,%0), %%mm1) | |
| 162 "pxor %%mm7, %%mm0 \n" | |
| 163 "movq %%mm1, (%3,%1) \n" | |
| 164 "movq %%mm0, (%2,%0) \n" | |
| 165 "sub $8, %1 \n" | |
| 166 "add $8, %0 \n" | |
| 167 "jl 1b \n" | |
| 168 :"+r"(j), "+r"(k) | |
| 169 :"r"(output+n4), "r"(output+n4*3), | |
| 170 "m"(*m1m1) | |
| 171 ); | |
| 172 __asm__ volatile("femms"); | |
| 173 } | |
| 174 |
