Mercurial > libavcodec.hg
annotate i386/fft_3dn2.c @ 3559:c02459cd0d31 libavcodec
slightly faster ff_imdct_calc_3dn2() on amd64. (gcc added a bunch of useless movsxd)
| author | lorenm |
|---|---|
| date | Tue, 08 Aug 2006 21:47:11 +0000 |
| parents | 5ea82888103e |
| children | f1a16d793fc5 |
| rev | line source |
|---|---|
| 3175 | 1 /* |
| 2 * FFT/MDCT transform with Extended 3DNow! optimizations | |
| 3555 | 3 * Copyright (c) 2006 Zuxy MENG Jie, Loren Merritt |
| 3175 | 4 * Based on fft_sse.c copyright (c) 2002 Fabrice Bellard. |
| 5 * | |
| 6 * This library is free software; you can redistribute it and/or | |
| 7 * modify it under the terms of the GNU Lesser General Public | |
| 8 * License as published by the Free Software Foundation; either | |
| 9 * version 2 of the License, or (at your option) any later version. | |
| 10 * | |
| 11 * This library is distributed in the hope that it will be useful, | |
| 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 14 * Lesser General Public License for more details. | |
| 15 * | |
| 16 * You should have received a copy of the GNU Lesser General Public | |
| 17 * License along with this library; if not, write to the Free Software | |
| 18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 19 */ | |
| 20 #include "../dsputil.h" | |
| 21 #include <math.h> | |
| 22 | |
| 23 #ifdef HAVE_MM3DNOW | |
| 24 | |
| 25 #include <mm3dnow.h> | |
| 26 | |
/* 64-bit sign-bit masks, applied with pxor to negate one float lane of an
 * __m64 pair: p1m1 flips the sign of the high (second) lane, m1p1 the low
 * (first) lane.  Which mask is used selects forward vs. inverse FFT rotation.
 * Declared unsigned because left-shifting a 1 into the sign bit of a signed
 * int (1 << 31) is undefined behavior in C; 1U << 31 is well-defined. */
static const unsigned int p1m1[2] __attribute__((aligned(8))) =
    { 0, 1U << 31 };

static const unsigned int m1p1[2] __attribute__((aligned(8))) =
    { 1U << 31, 0 };
| 32 | |
/**
 * Compute an in-place complex FFT of z (2^s->nbits points) using
 * 3DNowExt intrinsics, one complex value (two floats) per __m64.
 * Direction is selected by s->inverse, which picks the sign mask applied
 * to the "multiply by -i / +i" step of the radix-4 first stage.
 * Twiddle factors for the later passes are read from s->exptab1
 * (presumably precomputed by the generic FFT init — confirm in fft.c).
 */
void ff_fft_calc_3dn2(FFTContext *s, FFTComplex *z)
{
    int ln = s->nbits;
    int j, np, np2;
    int nblocks, nloops;
    register FFTComplex *p, *q;
    FFTComplex *cptr, *cptr1;
    int k;

    np = 1 << ln;                     /* total number of complex points */
    /* FEMMS is not a must here but recommended by AMD */
    _m_femms();

    /* passes 0 and 1, fused into one radix-4 stage over groups of 4 points */
    {
        __m64 *r, a0, a1, b0, b1, c;

        r = (__m64 *)&z[0];
        /* c is the sign mask that turns pswapd into a multiply by -i
         * (forward) or +i (inverse) */
        if (s->inverse)
            c = *(__m64 *)m1p1;
        else
            c = *(__m64 *)p1m1;

        j = (np >> 2);                /* np/4 radix-4 butterflies */
        do {
            /* pass 0 butterfly on the first pair */
            a0 = _m_pfadd(r[0], r[1]);
            a1 = _m_pfsub(r[0], r[1]);

            /* pass 0 butterfly on the second pair */
            b0 = _m_pfadd(r[2], r[3]);
            b1 = _m_pfsub(r[2], r[3]);

            /* multiply third by -i: swap re/im, then flip one sign via c */
            b1 = _m_pswapd(b1);
            b1 = _m_pxor(b1, c);

            /* pass 1 butterflies combining the two pairs */
            r[0] = _m_pfadd(a0, b0);
            r[1] = _m_pfadd(a1, b1);
            r[2] = _m_pfsub(a0, b0);
            r[3] = _m_pfsub(a1, b1);
            r += 4;
        } while (--j != 0);
    }
    /* pass 2 .. ln-1 */

    nblocks = np >> 3;
    nloops = 1 << 2;
    np2 = np >> 1;

    cptr1 = s->exptab1;
    do {
        p = z;
        q = z + nloops;               /* q runs half a block ahead of p */
        j = nblocks;
        do {
            cptr = cptr1;
            k = nloops >> 1;          /* two complex values per iteration */
            do {
                __m64 a0, a1, b0, b1, c0, c1, t10, t11, t20, t21;

                a0 = *(__m64 *)&p[0];
                a1 = *(__m64 *)&p[1];
                b0 = *(__m64 *)&q[0];
                b1 = *(__m64 *)&q[1];

                /* complex mul: b *= twiddle factor from cptr */
                c0 = *(__m64 *)&cptr[0];
                c1 = *(__m64 *)&cptr[1];
                /* cre*re cim*im */
                t10 = _m_pfmul(c0, b0);
                t11 = _m_pfmul(c1, b1);
                /* no need to access cptr[2] & cptr[3] */
                c0 = _m_pswapd(c0);
                c1 = _m_pswapd(c1);
                /* cim*re cre*im */
                t20 = _m_pfmul(c0, b0);
                t21 = _m_pfmul(c1, b1);

                /* pfpnacc: low = sub of t1 lanes, high = add of t2 lanes ->
                 * cre*re-cim*im  cim*re+cre*im */
                b0 = _m_pfpnacc(t10, t20);
                b1 = _m_pfpnacc(t11, t21);

                /* butterfly: p += b, q = p - b */
                *(__m64 *)&p[0] = _m_pfadd(a0, b0);
                *(__m64 *)&p[1] = _m_pfadd(a1, b1);
                *(__m64 *)&q[0] = _m_pfsub(a0, b0);
                *(__m64 *)&q[1] = _m_pfsub(a1, b1);

                p += 2;
                q += 2;
                cptr += 4;
            } while (--k);

            /* skip the half-block q already covered */
            p += nloops;
            q += nloops;
        } while (--j);
        cptr1 += nloops * 2;
        /* each pass: half as many blocks, each twice as long */
        nblocks = nblocks >> 1;
        nloops = nloops << 1;
    } while (nblocks != 0);
    /* leave the FPU in a clean state for subsequent x87 code */
    _m_femms();
}
| 135 | |
| 136 #endif | |
| 3555 | 137 |
| 138 void ff_imdct_calc_3dn2(MDCTContext *s, FFTSample *output, | |
| 139 const FFTSample *input, FFTSample *tmp) | |
| 140 { | |
|
3559
c02459cd0d31
slightly faster ff_imdct_calc_3dn2() on amd64. (gcc added a bunch of useless movsxd)
lorenm
parents:
3555
diff
changeset
|
141 long k, n8, n4, n2, n; |
| 3555 | 142 const uint16_t *revtab = s->fft.revtab; |
| 143 const FFTSample *tcos = s->tcos; | |
| 144 const FFTSample *tsin = s->tsin; | |
| 145 const FFTSample *in1, *in2; | |
| 146 FFTComplex *z = (FFTComplex *)tmp; | |
| 147 | |
| 148 n = 1 << s->nbits; | |
| 149 n2 = n >> 1; | |
| 150 n4 = n >> 2; | |
| 151 n8 = n >> 3; | |
| 152 | |
| 153 /* pre rotation */ | |
| 154 in1 = input; | |
| 155 in2 = input + n2 - 1; | |
| 156 for(k = 0; k < n4; k++) { | |
| 157 asm volatile( | |
| 158 "movd %1, %%mm0 \n\t" | |
| 159 "movd %3, %%mm1 \n\t" | |
| 160 "punpckldq %2, %%mm0 \n\t" | |
| 161 "punpckldq %4, %%mm1 \n\t" | |
| 162 "movq %%mm0, %%mm2 \n\t" | |
| 163 "pfmul %%mm1, %%mm0 \n\t" | |
| 164 "pswapd %%mm1, %%mm1 \n\t" | |
| 165 "pfmul %%mm1, %%mm2 \n\t" | |
| 166 "pfpnacc %%mm2, %%mm0 \n\t" | |
| 167 "movq %%mm0, %0 \n\t" | |
| 168 :"=m"(z[revtab[k]]) | |
| 169 :"m"(in2[-2*k]), "m"(in1[2*k]), | |
| 170 "m"(tcos[k]), "m"(tsin[k]) | |
| 171 ); | |
| 172 } | |
| 173 | |
| 174 ff_fft_calc(&s->fft, z); | |
| 175 | |
| 176 /* post rotation + reordering */ | |
| 177 for(k = 0; k < n4; k++) { | |
| 178 asm volatile( | |
| 179 "movq %0, %%mm0 \n\t" | |
| 180 "movd %1, %%mm1 \n\t" | |
| 181 "punpckldq %2, %%mm1 \n\t" | |
| 182 "movq %%mm0, %%mm2 \n\t" | |
| 183 "pfmul %%mm1, %%mm0 \n\t" | |
| 184 "pswapd %%mm1, %%mm1 \n\t" | |
| 185 "pfmul %%mm1, %%mm2 \n\t" | |
| 186 "pfpnacc %%mm2, %%mm0 \n\t" | |
| 187 "movq %%mm0, %0 \n\t" | |
| 188 :"+m"(z[k]) | |
| 189 :"m"(tcos[k]), "m"(tsin[k]) | |
| 190 ); | |
| 191 } | |
| 192 | |
| 193 asm volatile("movd %0, %%mm7" ::"r"(1<<31)); | |
| 194 for(k = 0; k < n8; k++) { | |
| 195 asm volatile( | |
| 196 "movq %4, %%mm0 \n\t" | |
| 197 "pswapd %5, %%mm1 \n\t" | |
| 198 "movq %%mm0, %%mm2 \n\t" | |
| 199 "pxor %%mm7, %%mm2 \n\t" | |
| 200 "punpckldq %%mm1, %%mm2 \n\t" | |
| 201 "pswapd %%mm2, %%mm3 \n\t" | |
| 202 "punpckhdq %%mm1, %%mm0 \n\t" | |
| 203 "pswapd %%mm0, %%mm4 \n\t" | |
| 204 "pxor %%mm7, %%mm0 \n\t" | |
| 205 "pxor %%mm7, %%mm4 \n\t" | |
| 206 "movq %%mm0, %0 \n\t" // { -z[n8+k].im, z[n8-1-k].re } | |
| 207 "movq %%mm4, %1 \n\t" // { -z[n8-1-k].re, z[n8+k].im } | |
| 208 "movq %%mm2, %2 \n\t" // { -z[n8+k].re, z[n8-1-k].im } | |
| 209 "movq %%mm3, %3 \n\t" // { z[n8-1-k].im, -z[n8+k].re } | |
| 210 :"=m"(output[2*k]), "=m"(output[n2-2-2*k]), | |
| 211 "=m"(output[n2+2*k]), "=m"(output[n-2-2*k]) | |
| 212 :"m"(z[n8+k]), "m"(z[n8-1-k]) | |
| 213 :"memory" | |
| 214 ); | |
| 215 } | |
| 216 asm volatile("emms"); | |
| 217 } |
