Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 2071:41d30bae5019 libavcodec
attempt to create some separation in the FLAC system with respect to
demuxer and decoder layers by enabling the FLAC decoder to decode data
without needing the entire file, from start to finish
| author | melanson |
|---|---|
| date | Thu, 10 Jun 2004 04:13:43 +0000 |
| parents | 703b80c99891 |
| children | 185f3b18ec1f |
| rev | line source |
|---|---|
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
| 223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
| 1109 | 19 /** |
| 20 * @file postprocess_template.c | |
| 21 * mmx/mmx2/3dnow postprocess code. | |
| 22 */ | |
| 23 | |
| 24 | |
| 169 | 25 #undef PAVGB |
| 26 #undef PMINUB | |
| 27 #undef PMAXUB | |
| 104 | 28 |
| 29 #ifdef HAVE_MMX2 | |
| 30 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
| 31 #elif defined (HAVE_3DNOW) | |
| 32 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
| 33 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
34 |
| 134 | 35 #ifdef HAVE_MMX2 |
| 36 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
| 37 #elif defined (HAVE_MMX) | |
| 38 #define PMINUB(b,a,t) \ | |
| 39 "movq " #a ", " #t " \n\t"\ | |
| 40 "psubusb " #b ", " #t " \n\t"\ | |
| 41 "psubb " #t ", " #a " \n\t" | |
| 42 #endif | |
| 43 | |
| 44 #ifdef HAVE_MMX2 | |
| 45 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
| 46 #elif defined (HAVE_MMX) | |
| 47 #define PMAXUB(a,b) \ | |
| 48 "psubusb " #a ", " #b " \n\t"\ | |
| 49 "paddb " #a ", " #b " \n\t" | |
| 50 #endif | |
| 51 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
52 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
| 787 | 53 #ifdef HAVE_MMX |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
54 /** |
| 111 | 55 * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
56 */ |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
57 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
58 int numEq= 0, dcOk; |
| 111 | 59 src+= stride*4; // src points to the beginning of the 8x8 Block |
| 119 | 60 asm volatile( |
| 1331 | 61 "movq %0, %%mm7 \n\t" |
| 62 "movq %1, %%mm6 \n\t" | |
| 63 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
| 64 ); | |
| 65 | |
| 66 asm volatile( | |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
67 "leal (%2, %3), %%eax \n\t" |
| 119 | 68 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 69 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
| 791 | 70 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
71 "movq (%2), %%mm0 \n\t" |
| 119 | 72 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
73 "movq %%mm0, %%mm3 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
74 "movq %%mm0, %%mm4 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
75 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
76 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
77 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 "paddb %%mm7, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
79 "pcmpgtb %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
80 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
81 "movq (%%eax,%3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
82 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
83 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
84 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
85 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
86 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
87 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
88 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
89 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
90 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
91 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
92 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
93 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
94 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
95 "paddb %%mm2, %%mm0 \n\t" |
| 787 | 96 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
97 "leal (%%eax, %3, 4), %%eax \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
98 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
99 "movq (%2, %3, 4), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
100 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
101 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
103 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
104 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
105 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
106 |
| 787 | 107 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
108 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
109 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
110 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
111 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
112 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
113 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
115 "movq (%%eax, %3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
116 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
117 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
118 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
119 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
120 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
121 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
122 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
123 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
124 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
125 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
126 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
127 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
129 "paddb %%mm2, %%mm0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
130 "psubusb %%mm3, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
131 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
132 " \n\t" |
| 167 | 133 #ifdef HAVE_MMX2 |
| 134 "pxor %%mm7, %%mm7 \n\t" | |
| 135 "psadbw %%mm7, %%mm0 \n\t" | |
| 136 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
137 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
138 "psrlw $8, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
139 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 "psrlq $16, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
143 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 "psrlq $32, %%mm0 \n\t" |
| 167 | 145 "paddb %%mm1, %%mm0 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
146 #endif |
| 1331 | 147 "movq %4, %%mm7 \n\t" // QP,..., QP |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
148 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
149 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
150 "packssdw %%mm4, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
151 "movd %%mm0, %0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
152 "movd %%mm4, %1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
153 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
154 : "=r" (numEq), "=r" (dcOk) |
| 1331 | 155 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 787 | 156 : "%eax" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
157 ); |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
158 |
| 167 | 159 numEq= (-numEq) &0xFF; |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
160 if(numEq > c->ppMode.flatnessThreshold){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
161 if(dcOk) return 0; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
162 else return 1; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
163 }else{ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
164 return 2; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
165 } |
| 787 | 166 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
167 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
168 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
169 /** |
| 111 | 170 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 107 | 171 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
172 */ |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
173 #ifndef HAVE_ALTIVEC |
| 787 | 174 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
175 { |
| 96 | 176 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 177 src+= stride*3; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
178 asm volatile( //"movv %0 %1 %2\n\t" |
| 787 | 179 "movq %2, %%mm0 \n\t" // QP,..., QP |
| 180 "pxor %%mm4, %%mm4 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
181 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
182 "movq (%0), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
183 "movq (%0, %1), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
184 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
185 "movq %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
186 "psubusb %%mm6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
187 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
188 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
189 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 190 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
192 "pand %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
193 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
195 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
196 "movq (%0, %1, 8), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
197 "leal (%0, %1, 4), %%eax \n\t" |
| 787 | 198 "leal (%0, %1, 8), %%ecx \n\t" |
| 199 "subl %1, %%ecx \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
201 "movq (%0, %1, 8), %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
202 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
203 "movq %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 "psubusb %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
207 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 208 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 "pand %%mm2, %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
211 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
212 "por %%mm2, %%mm7 \n\t" // Last Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
214 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
215 // 1 2 3 4 5 6 7 8 |
| 787 | 216 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
217 // 6 4 2 2 1 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
218 // 6 4 4 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
219 // 6 8 2 |
| 111 | 220 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
221 "movq (%0, %1), %%mm0 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
222 "movq %%mm0, %%mm1 \n\t" // 1 |
| 96 | 223 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 224 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
227 "movq %%mm2, %%mm5 \n\t" // 1 |
| 96 | 228 PAVGB((%%eax), %%mm2) // 11 /2 |
| 229 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
230 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
231 "movq (%0), %%mm4 \n\t" // 1 |
| 96 | 232 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
| 233 PAVGB(%%mm0, %%mm3) //642211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
234 "movq %%mm3, (%0) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
235 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
236 "movq %%mm1, %%mm0 \n\t" // 1 |
| 96 | 237 PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
238 "movq %%mm4, %%mm3 \n\t" // 1 |
| 96 | 239 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
| 240 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
| 241 PAVGB((%%eax), %%mm5) // 211 /4 | |
| 242 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
| 243 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 "movq %%mm3, (%0,%1) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
245 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
| 96 | 246 PAVGB(%%mm4, %%mm6) //11 /2 |
| 787 | 247 "movq (%%ecx), %%mm0 \n\t" // 1 |
| 96 | 248 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
249 "movq %%mm0, %%mm3 \n\t" // 11/2 |
| 96 | 250 PAVGB(%%mm1, %%mm0) // 2 11/4 |
| 251 PAVGB(%%mm6, %%mm0) //222 11/8 | |
| 252 PAVGB(%%mm2, %%mm0) //22242211/16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
253 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
254 "movq %%mm0, (%0, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
255 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
256 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 787 | 257 PAVGB((%%ecx), %%mm0) // 11 /2 |
| 96 | 258 PAVGB(%%mm0, %%mm6) //11 11 /4 |
| 259 PAVGB(%%mm1, %%mm4) // 11 /2 | |
| 260 PAVGB(%%mm2, %%mm1) // 11 /2 | |
| 261 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
| 262 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
263 "movq (%%eax), %%mm5 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
264 "movq %%mm6, (%%eax) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
265 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
266 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
| 96 | 267 PAVGB(%%mm7, %%mm6) // 11 /2 |
| 268 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
| 269 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
| 270 PAVGB(%%mm5, %%mm2) // 11 /2 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
271 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
| 96 | 272 PAVGB(%%mm4, %%mm2) // 112 /4 |
| 273 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
274 "movq %%mm6, (%0, %1, 4) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
275 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
| 96 | 276 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
| 277 PAVGB(%%mm4, %%mm5) // 11 /2 | |
| 278 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
279 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 96 | 280 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 281 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
282 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
283 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 787 | 284 PAVGB((%%ecx), %%mm2) // 112 4 /8 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
285 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 96 | 286 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 287 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
| 288 PAVGB(%%mm2, %%mm6) // 1122424 /16 | |
| 787 | 289 "movq %%mm6, (%%ecx) \n\t" // X |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
290 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 96 | 291 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 292 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
293 |
| 96 | 294 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 295 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
296 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 140 | 297 "subl %1, %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
298 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
299 : |
| 787 | 300 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 301 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
302 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
303 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
304 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
306 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
307 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
308 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
309 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
310 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
311 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
312 const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
313 int x; |
| 111 | 314 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
315 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
316 { |
| 787 | 317 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
| 318 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
319 |
| 2038 | 320 int sums[10]; |
| 321 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; | |
| 322 sums[1] = sums[0] - first + src[l4]; | |
| 323 sums[2] = sums[1] - first + src[l5]; | |
| 324 sums[3] = sums[2] - first + src[l6]; | |
| 325 sums[4] = sums[3] - first + src[l7]; | |
| 326 sums[5] = sums[4] - src[l1] + src[l8]; | |
| 327 sums[6] = sums[5] - src[l2] + last; | |
| 328 sums[7] = sums[6] - src[l3] + last; | |
| 329 sums[8] = sums[7] - src[l4] + last; | |
| 330 sums[9] = sums[8] - src[l5] + last; | |
| 331 | |
| 332 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; | |
| 333 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; | |
| 334 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; | |
| 335 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; | |
| 336 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; | |
| 337 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; | |
| 338 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | |
| 339 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
340 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
341 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
342 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
343 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
344 } |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
345 #endif //HAVE_ALTIVEC |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
346 |
| 787 | 347 #if 0 |
| 96 | 348 /** |
| 349 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 350 * values are correctly clipped (MMX2) | |
| 351 * values are wraparound (C) | |
| 352 * conclusion: it's fast, but introduces ugly horizontal patterns if there is a continuous gradient | |
| 353 0 8 16 24 | |
| 354 x = 8 | |
| 355 x/2 = 4 | |
| 356 x/8 = 1 | |
| 357 1 12 12 23 | |
| 358 */ | |
| 169 | 359 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
| 96 | 360 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
361 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 362 src+= stride*3; |
| 96 | 363 // FIXME rounding |
| 364 asm volatile( | |
| 365 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 210 | 366 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 96 | 367 "leal (%0, %1), %%eax \n\t" |
| 787 | 368 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 96 | 369 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 370 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
| 210 | 371 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 96 | 372 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
| 210 | 373 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
| 96 | 374 "psrlw $2, %%mm0 \n\t" |
| 210 | 375 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
| 96 | 376 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
| 377 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 787 | 378 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 96 | 379 "movq %%mm2, %%mm4 \n\t" // line 4 |
| 380 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 381 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
382 PAVGB(%%mm3, %%mm5) |
| 96 | 383 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
| 384 "psubusb %%mm3, %%mm4 \n\t" | |
| 385 "psubusb %%mm2, %%mm3 \n\t" | |
| 386 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 387 "psubusb %%mm0, %%mm4 \n\t" | |
| 388 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 389 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 390 | |
| 391 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 392 "paddb %%mm5, %%mm2 \n\t" | |
| 393 // "psubb %%mm6, %%mm2 \n\t" | |
| 394 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 395 | |
| 787 | 396 "movq (%%ecx), %%mm2 \n\t" |
| 96 | 397 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
| 398 "psubb %%mm5, %%mm2 \n\t" | |
| 399 // "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 400 "movq %%mm2, (%%ecx) \n\t" |
| 96 | 401 |
| 402 "paddb %%mm6, %%mm5 \n\t" | |
| 403 "psrlw $2, %%mm5 \n\t" | |
| 210 | 404 "pand "MANGLE(b3F)", %%mm5 \n\t" |
| 405 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
| 96 | 406 |
| 407 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 408 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 409 "paddsb %%mm5, %%mm2 \n\t" | |
| 410 "psubb %%mm6, %%mm2 \n\t" | |
| 411 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 412 | |
| 787 | 413 "movq (%%ecx, %1), %%mm2 \n\t" |
| 96 | 414 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
| 415 "psubsb %%mm5, %%mm2 \n\t" | |
| 416 "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 417 "movq %%mm2, (%%ecx, %1) \n\t" |
| 96 | 418 |
| 419 : | |
| 420 : "r" (src), "r" (stride) | |
| 787 | 421 : "%eax", "%ecx" |
| 96 | 422 ); |
| 423 #else | |
| 424 const int l1= stride; | |
| 425 const int l2= stride + l1; | |
| 426 const int l3= stride + l2; | |
| 427 const int l4= stride + l3; | |
| 428 const int l5= stride + l4; | |
| 429 const int l6= stride + l5; | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
430 // const int l7= stride + l6; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
431 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
432 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
433 int x; |
| 141 | 434 const int QP15= QP + (QP>>2); |
| 111 | 435 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
436 for(x=0; x<BLOCK_SIZE; x++) |
| 96 | 437 { |
| 141 | 438 const int v = (src[x+l5] - src[x+l4]); |
| 439 if(ABS(v) < QP15) | |
| 96 | 440 { |
| 141 | 441 src[x+l3] +=v>>3; |
| 442 src[x+l4] +=v>>1; | |
| 443 src[x+l5] -=v>>1; | |
| 444 src[x+l6] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
445 |
| 96 | 446 } |
| 447 } | |
| 448 | |
| 449 #endif | |
| 450 } | |
| 787 | 451 #endif |
| 96 | 452 |
| 453 /** | |
| 454 * Experimental Filter 1 | |
| 99 | 455 * will not damage linear gradients |
| 456 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
457 * can only smooth blocks at the expected locations (it can't smooth them if they did move) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
458 * MMX2 version does correct clipping, C version doesn't |
| 96 | 459 */ |
| 787 | 460 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
| 96 | 461 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
462 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 463 src+= stride*3; |
| 464 | |
| 96 | 465 asm volatile( |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
466 "pxor %%mm7, %%mm7 \n\t" // 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
467 "leal (%0, %1), %%eax \n\t" |
| 787 | 468 "leal (%%eax, %1, 4), %%ecx \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
469 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 470 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
471 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
472 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
473 "movq %%mm1, %%mm2 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
474 "psubusb %%mm0, %%mm1 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
475 "psubusb %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
476 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
| 787 | 477 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 478 "movq (%%ecx, %1), %%mm4 \n\t" // line 6 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
479 "movq %%mm3, %%mm5 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
480 "psubusb %%mm4, %%mm3 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
481 "psubusb %%mm5, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
482 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
483 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
484 "movq %%mm2, %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
485 "psubusb %%mm5, %%mm2 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
486 "movq %%mm2, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
487 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
488 "psubusb %%mm1, %%mm5 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
489 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
490 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
491 "movq %%mm4, %%mm3 \n\t" // d |
| 787 | 492 "movq %2, %%mm0 \n\t" |
| 334 | 493 "paddusb %%mm0, %%mm0 \n\t" |
| 494 "psubusb %%mm0, %%mm4 \n\t" | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
495 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 210 | 496 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
497 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
498 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
499 PAVGB(%%mm7, %%mm3) // d/2 |
| 99 | 500 "movq %%mm3, %%mm1 \n\t" // d/2 |
| 501 PAVGB(%%mm7, %%mm3) // d/4 | |
| 502 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
503 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
504 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
505 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
506 "psubusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
507 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
508 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
509 |
| 787 | 510 "movq (%%ecx), %%mm0 \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
511 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
512 "paddusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
513 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 514 "movq %%mm0, (%%ecx) \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
515 |
| 99 | 516 PAVGB(%%mm7, %%mm1) // d/4 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
517 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
518 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
519 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 99 | 520 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
521 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
522 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
523 |
| 787 | 524 "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
525 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 99 | 526 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
527 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 528 "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
529 |
| 99 | 530 PAVGB(%%mm7, %%mm1) // d/8 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
531 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
532 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
533 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
| 99 | 534 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
535 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
536 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
537 |
| 787 | 538 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
539 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
| 99 | 540 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
541 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 542 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
| 96 | 543 |
| 544 : | |
| 787 | 545 : "r" (src), "r" (stride), "m" (co->pQPb) |
| 546 : "%eax", "%ecx" | |
| 96 | 547 ); |
| 548 #else | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
549 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
550 const int l1= stride; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
551 const int l2= stride + l1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
552 const int l3= stride + l2; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
553 const int l4= stride + l3; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
554 const int l5= stride + l4; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
555 const int l6= stride + l5; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
556 const int l7= stride + l6; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
557 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
558 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
559 int x; |
| 111 | 560 |
| 561 src+= stride*3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
562 for(x=0; x<BLOCK_SIZE; x++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
563 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
564 int a= src[l3] - src[l4]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
565 int b= src[l4] - src[l5]; |
| 99 | 566 int c= src[l5] - src[l6]; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
567 |
| 141 | 568 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
| 569 d= MAX(d, 0); | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
570 |
| 787 | 571 if(d < co->QP*2) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
572 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
573 int v = d * SIGN(-b); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
574 |
| 141 | 575 src[l2] +=v>>3; |
| 576 src[l3] +=v>>2; | |
| 577 src[l4] +=(3*v)>>3; | |
| 578 src[l5] -=(3*v)>>3; | |
| 579 src[l6] -=v>>2; | |
| 580 src[l7] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
581 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
582 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
583 src++; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
584 } |
| 96 | 585 #endif |
| 586 } | |
| 587 | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
588 #ifndef HAVE_ALTIVEC |
| 787 | 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
590 { |
| 163 | 591 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 592 /* | |
| 593 uint8_t tmp[16]; | |
| 594 const int l1= stride; | |
| 595 const int l2= stride + l1; | |
| 596 const int l3= stride + l2; | |
| 597 const int l4= (int)tmp - (int)src - stride*3; | |
| 598 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
| 599 const int l6= stride*3 + l3; | |
| 600 const int l7= stride + l6; | |
| 601 const int l8= stride + l7; | |
| 602 | |
| 603 memcpy(tmp, src+stride*7, 8); | |
| 604 memcpy(tmp+8, src+stride*8, 8); | |
| 605 */ | |
| 111 | 606 src+= stride*4; |
| 163 | 607 asm volatile( |
| 608 | |
| 609 #if 0 //sligtly more accurate and slightly slower | |
| 610 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 611 "leal (%0, %1), %%eax \n\t" | |
| 787 | 612 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 613 // 0 1 2 3 4 5 6 7 |
| 787 | 614 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 615 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 616 |
| 617 | |
| 618 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
| 619 "movq (%0), %%mm1 \n\t" // l0 | |
| 620 "movq %%mm0, %%mm2 \n\t" // l2 | |
| 621 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
| 622 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
| 623 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
| 624 | |
| 625 "movq (%%eax), %%mm1 \n\t" // l1 | |
| 626 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |
| 627 "movq %%mm1, %%mm4 \n\t" // l1 | |
| 628 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
| 629 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
| 630 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
| 631 | |
| 632 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
| 633 "psubusb %%mm1, %%mm0 \n\t" | |
| 634 "psubusb %%mm4, %%mm1 \n\t" | |
| 635 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
| 636 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
| 637 | |
| 638 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 639 "movq %%mm0, %%mm4 \n\t" // l4 | |
| 640 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
| 641 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
| 642 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
| 643 | |
| 787 | 644 "movq (%%ecx), %%mm2 \n\t" // l5 |
| 163 | 645 "movq %%mm3, %%mm5 \n\t" // l3 |
| 646 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
| 647 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
| 648 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
| 649 | |
| 650 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
| 651 "psubusb %%mm3, %%mm0 \n\t" | |
| 652 "psubusb %%mm6, %%mm3 \n\t" | |
| 653 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
| 654 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
| 655 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
| 656 | |
| 787 | 657 "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
| 163 | 658 "movq %%mm6, %%mm5 \n\t" // l6 |
| 659 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
| 660 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
| 661 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
| 662 | |
| 787 | 663 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
| 163 | 664 "movq %%mm2, %%mm4 \n\t" // l5 |
| 665 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
| 666 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
| 667 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
| 668 | |
| 669 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
| 670 "psubusb %%mm2, %%mm6 \n\t" | |
| 671 "psubusb %%mm4, %%mm2 \n\t" | |
| 672 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
| 673 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
| 674 | |
| 675 | |
| 676 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
| 787 | 677 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
| 210 | 678 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 679 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
| 680 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
| 681 "pand %%mm4, %%mm3 \n\t" | |
| 682 | |
| 683 "movq %%mm3, %%mm1 \n\t" | |
| 210 | 684 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 685 PAVGB(%%mm7, %%mm3) |
| 686 PAVGB(%%mm7, %%mm3) | |
| 687 "paddusb %%mm1, %%mm3 \n\t" | |
| 210 | 688 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 689 |
| 690 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |
| 691 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
| 692 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
| 693 "psubusb %%mm6, %%mm5 \n\t" | |
| 694 "psubusb %%mm4, %%mm6 \n\t" | |
| 695 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
| 696 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
| 697 "pxor %%mm6, %%mm0 \n\t" | |
| 698 "pand %%mm0, %%mm3 \n\t" | |
| 699 PMINUB(%%mm5, %%mm3, %%mm0) | |
| 700 | |
| 210 | 701 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 702 PAVGB(%%mm7, %%mm3) |
| 703 | |
| 704 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 705 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 706 "pxor %%mm6, %%mm0 \n\t" | |
| 707 "pxor %%mm6, %%mm2 \n\t" | |
| 708 "psubb %%mm3, %%mm0 \n\t" | |
| 709 "paddb %%mm3, %%mm2 \n\t" | |
| 710 "pxor %%mm6, %%mm0 \n\t" | |
| 711 "pxor %%mm6, %%mm2 \n\t" | |
| 712 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 713 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 714 #endif | |
| 715 | |
| 716 "leal (%0, %1), %%eax \n\t" | |
| 717 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
| 718 // 0 1 2 3 4 5 6 7 | |
| 787 | 719 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 720 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 721 |
| 722 | |
| 723 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |
| 724 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 725 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
| 726 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
| 727 // mm1=-l3-1, mm0=128-q | |
| 728 | |
| 729 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |
| 730 "movq (%%eax, %1), %%mm3 \n\t" // l2 | |
| 731 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
| 732 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
| 210 | 733 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
| 787 | 734 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 735 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
| 736 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
| 737 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
| 738 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
| 739 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
| 740 | |
| 741 "movq (%%eax), %%mm2 \n\t" // l1 | |
| 742 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
| 743 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
| 744 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
| 210 | 745 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
| 163 | 746 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
| 747 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
| 748 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
| 749 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
| 750 | |
| 787 | 751 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
| 752 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 | |
| 163 | 753 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
| 754 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
| 210 | 755 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
| 163 | 756 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
| 757 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
| 758 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
| 759 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
| 760 | |
| 210 | 761 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
| 762 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
| 163 | 763 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
| 764 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
| 765 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
| 766 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
| 767 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
| 768 | |
| 769 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
| 770 | |
| 210 | 771 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
| 787 | 772 "movq %2, %%mm2 \n\t" // QP |
| 163 | 773 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
| 774 "psubb %%mm6, %%mm2 \n\t" | |
| 775 | |
| 776 "movq %%mm4, %%mm1 \n\t" | |
| 777 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
| 778 "pxor %%mm1, %%mm4 \n\t" | |
| 779 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
| 780 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
| 781 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
| 782 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
| 783 | |
| 784 "movq %%mm4, %%mm3 \n\t" // d | |
| 210 | 785 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 786 PAVGB(%%mm7, %%mm4) // d/32 |
| 787 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
| 788 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
| 789 "pand %%mm2, %%mm4 \n\t" | |
| 790 | |
| 210 | 791 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
| 163 | 792 "psubb %%mm0, %%mm5 \n\t" // q |
| 793 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
| 794 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
| 795 "pxor %%mm7, %%mm5 \n\t" | |
| 796 | |
| 797 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
| 798 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
| 799 | |
| 800 "pand %%mm7, %%mm4 \n\t" | |
| 801 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 802 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 803 "pxor %%mm1, %%mm0 \n\t" | |
| 804 "pxor %%mm1, %%mm2 \n\t" | |
| 805 "paddb %%mm4, %%mm0 \n\t" | |
| 806 "psubb %%mm4, %%mm2 \n\t" | |
| 807 "pxor %%mm1, %%mm0 \n\t" | |
| 808 "pxor %%mm1, %%mm2 \n\t" | |
| 809 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 810 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 811 | |
| 812 : | |
| 787 | 813 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 814 : "%eax", "%ecx" | |
| 163 | 815 ); |
| 816 | |
| 817 /* | |
| 818 { | |
| 819 int x; | |
| 820 src-= stride; | |
| 821 for(x=0; x<BLOCK_SIZE; x++) | |
| 822 { | |
| 823 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
| 824 if(ABS(middleEnergy)< 8*QP) | |
| 825 { | |
| 826 const int q=(src[l4] - src[l5])/2; | |
| 827 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
| 828 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
| 829 | |
| 830 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 831 d= MAX(d, 0); | |
| 832 | |
| 833 d= (5*d + 32) >> 6; | |
| 834 d*= SIGN(-middleEnergy); | |
| 835 | |
| 836 if(q>0) | |
| 837 { | |
| 838 d= d<0 ? 0 : d; | |
| 839 d= d>q ? q : d; | |
| 840 } | |
| 841 else | |
| 842 { | |
| 843 d= d>0 ? 0 : d; | |
| 844 d= d<q ? q : d; | |
| 845 } | |
| 846 | |
| 847 src[l4]-= d; | |
| 848 src[l5]+= d; | |
| 849 } | |
| 850 src++; | |
| 851 } | |
| 852 src-=8; | |
| 853 for(x=0; x<8; x++) | |
| 854 { | |
| 855 int y; | |
| 856 for(y=4; y<6; y++) | |
| 857 { | |
| 858 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
| 859 int ad= ABS(d); | |
| 860 static int max=0; | |
| 861 static int sum=0; | |
| 862 static int num=0; | |
| 863 static int bias=0; | |
| 864 | |
| 865 if(max<ad) max=ad; | |
| 866 sum+= ad>3 ? 1 : 0; | |
| 867 if(ad>3) | |
| 868 { | |
| 869 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
| 870 } | |
| 871 if(y==4) bias+=d; | |
| 872 num++; | |
| 873 if(num%1000000 == 0) | |
| 874 { | |
| 875 printf(" %d %d %d %d\n", num, sum, max, bias); | |
| 876 } | |
| 877 } | |
| 878 } | |
| 879 } | |
| 880 */ | |
| 881 #elif defined (HAVE_MMX) | |
| 882 src+= stride*4; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
883 asm volatile( |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
884 "pxor %%mm7, %%mm7 \n\t" |
| 787 | 885 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
| 886 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
887 // 0 1 2 3 4 5 6 7 |
| 787 | 888 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
| 889 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
890 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
891 "movq (%0), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
892 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
893 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
894 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
895 |
| 810 | 896 "movq (%0, %1), %%mm2 \n\t" |
| 897 "leal (%0, %1, 2), %%eax \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
898 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
899 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
900 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
901 |
| 810 | 902 "movq (%%eax), %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
903 "movq %%mm4, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
904 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
905 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
906 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
907 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
908 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
909 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
910 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
911 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
912 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
913 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
914 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
915 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
916 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
917 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
918 |
| 810 | 919 "movq (%%eax, %1), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
920 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
921 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
922 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
923 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
924 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
925 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
926 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
927 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 787 | 928 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 929 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
930 |
| 810 | 931 "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
932 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
933 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
934 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
935 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
936 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
937 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
| 787 | 938 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
| 939 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
940 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
941 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
942 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
943 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
944 |
| 810 | 945 "leal (%%eax, %1), %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
946 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
947 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
948 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
949 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
950 //50 opcodes so far |
| 810 | 951 "movq (%0, %1, 2), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
952 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
953 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
956 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
959 |
| 810 | 960 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
961 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
| 810 | 963 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
967 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
970 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
973 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 |
| 810 | 977 "movq (%0, %1, 4), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
980 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
981 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
982 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
983 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
984 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
985 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
986 |
| 787 | 987 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 988 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 140 | 989 |
| 990 #ifdef HAVE_MMX2 | |
| 991 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 992 "psubw %%mm0, %%mm6 \n\t" | |
| 993 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 994 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 995 "psubw %%mm1, %%mm6 \n\t" | |
| 996 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 997 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 998 "psubw %%mm2, %%mm6 \n\t" | |
| 999 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 1000 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 1001 "psubw %%mm3, %%mm6 \n\t" | |
| 1002 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 1003 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1004 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1005 "pcmpgtw %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "pxor %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1009 "pcmpgtw %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1010 "pxor %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1011 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1012 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1013 "pcmpgtw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1014 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1015 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1016 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1017 "pcmpgtw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1018 "pxor %%mm6, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1019 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 140 | 1020 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1021 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1022 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1023 "pminsw %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1024 "pminsw %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1025 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 "movq %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "psubusw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 "psubw %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 "movq %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "psubusw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 "psubw %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1034 "movd %2, %%mm2 \n\t" // QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1035 "punpcklbw %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
1036 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1042 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 // 100 opcodes |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 "psllw $3, %%mm2 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 "movq %%mm2, %%mm3 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 "pcmpgtw %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1048 "pcmpgtw %%mm5, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1051 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1052 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1053 "psubusw %%mm0, %%mm4 \n\t" // ld |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1054 "psubusw %%mm1, %%mm5 \n\t" // hd |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1056 |
| 211 | 1057 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1058 "pmullw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1059 "pmullw %%mm2, %%mm5 \n\t" |
| 211 | 1060 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1061 "paddw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1062 "paddw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1063 "psrlw $6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1064 "psrlw $6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1065 |
| 787 | 1066 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
| 1067 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1068 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1069 "pxor %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1070 "pxor %%mm3, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1072 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1073 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 "pxor %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "pxor %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1078 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1079 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1082 "pxor %%mm7, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1085 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 "pminsw %%mm0, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1088 "pminsw %%mm1, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1089 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 "movq %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "psubusw %%mm0, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "psubw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 "movq %%mm5, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "psubusw %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "psubw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1096 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1098 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1099 "psubw %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1100 "psubw %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1101 "packsswb %%mm5, %%mm4 \n\t" |
| 810 | 1102 "movq (%0), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1103 "paddb %%mm4, %%mm0 \n\t" |
| 810 | 1104 "movq %%mm0, (%0) \n\t" |
| 1105 "movq (%0, %1), %%mm0 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 "psubb %%mm4, %%mm0 \n\t" |
| 810 | 1107 "movq %%mm0, (%0, %1) \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 |
| 810 | 1109 : "+r" (src) |
| 1110 : "r" (stride), "m" (c->pQPb) | |
| 1111 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1114 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1115 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1116 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1117 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1118 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1119 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1122 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1123 int x; |
| 111 | 1124 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1125 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1126 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1127 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
| 787 | 1128 if(ABS(middleEnergy) < 8*c->QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1129 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 const int q=(src[l4] - src[l5])/2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1131 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1132 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1133 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 d= MAX(d, 0); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 d= (5*d + 32) >> 6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 d*= SIGN(-middleEnergy); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 if(q>0) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1141 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1142 d= d<0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 d= d>q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1145 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1146 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1147 d= d>0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 d= d<q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1150 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 src[l4]-= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1152 src[l5]+= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1153 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1154 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1155 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1156 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1157 } |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1158 #endif //HAVE_ALTIVEC |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1159 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1160 #ifndef HAVE_ALTIVEC |
| 787 | 1161 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1162 { |
| 132 | 1163 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1164 asm volatile( |
| 787 | 1165 "pxor %%mm6, %%mm6 \n\t" |
| 1166 "pcmpeqb %%mm7, %%mm7 \n\t" | |
| 1167 "movq %2, %%mm0 \n\t" | |
| 1168 "punpcklbw %%mm6, %%mm0 \n\t" | |
| 1169 "psrlw $1, %%mm0 \n\t" | |
| 1170 "psubw %%mm7, %%mm0 \n\t" | |
| 1171 "packuswb %%mm0, %%mm0 \n\t" | |
| 1172 "movq %%mm0, %3 \n\t" | |
| 130 | 1173 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1174 "leal (%0, %1), %%eax \n\t" |
| 787 | 1175 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1176 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1177 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1178 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1179 |
| 169 | 1180 #undef FIND_MIN_MAX |
| 132 | 1181 #ifdef HAVE_MMX2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1182 #define FIND_MIN_MAX(addr)\ |
| 130 | 1183 "movq " #addr ", %%mm0 \n\t"\ |
| 167 | 1184 "pminub %%mm0, %%mm7 \n\t"\ |
| 1185 "pmaxub %%mm0, %%mm6 \n\t" | |
| 132 | 1186 #else |
| 1187 #define FIND_MIN_MAX(addr)\ | |
| 1188 "movq " #addr ", %%mm0 \n\t"\ | |
| 167 | 1189 "movq %%mm7, %%mm1 \n\t"\ |
| 1190 "psubusb %%mm0, %%mm6 \n\t"\ | |
| 1191 "paddb %%mm0, %%mm6 \n\t"\ | |
| 132 | 1192 "psubusb %%mm0, %%mm1 \n\t"\ |
| 167 | 1193 "psubb %%mm1, %%mm7 \n\t" |
| 132 | 1194 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1195 |
| 130 | 1196 FIND_MIN_MAX((%%eax)) |
| 1197 FIND_MIN_MAX((%%eax, %1)) | |
| 1198 FIND_MIN_MAX((%%eax, %1, 2)) | |
| 1199 FIND_MIN_MAX((%0, %1, 4)) | |
| 787 | 1200 FIND_MIN_MAX((%%edx)) |
| 1201 FIND_MIN_MAX((%%edx, %1)) | |
| 1202 FIND_MIN_MAX((%%edx, %1, 2)) | |
| 130 | 1203 FIND_MIN_MAX((%0, %1, 8)) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1204 |
| 167 | 1205 "movq %%mm7, %%mm4 \n\t" |
| 1206 "psrlq $8, %%mm7 \n\t" | |
| 1207 #ifdef HAVE_MMX2 | |
| 1208 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1209 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1210 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1211 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1212 "pminub %%mm4, %%mm7 \n\t" | |
| 1213 #else | |
| 1214 "movq %%mm7, %%mm1 \n\t" | |
| 1215 "psubusb %%mm4, %%mm1 \n\t" | |
| 1216 "psubb %%mm1, %%mm7 \n\t" | |
| 1217 "movq %%mm7, %%mm4 \n\t" | |
| 1218 "psrlq $16, %%mm7 \n\t" | |
| 1219 "movq %%mm7, %%mm1 \n\t" | |
| 1220 "psubusb %%mm4, %%mm1 \n\t" | |
| 1221 "psubb %%mm1, %%mm7 \n\t" | |
| 1222 "movq %%mm7, %%mm4 \n\t" | |
| 1223 "psrlq $32, %%mm7 \n\t" | |
| 1224 "movq %%mm7, %%mm1 \n\t" | |
| 1225 "psubusb %%mm4, %%mm1 \n\t" | |
| 1226 "psubb %%mm1, %%mm7 \n\t" | |
| 1227 #endif | |
| 1228 | |
| 1229 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1230 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1231 "psrlq $8, %%mm6 \n\t" |
| 132 | 1232 #ifdef HAVE_MMX2 |
| 167 | 1233 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1234 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 167 | 1235 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1236 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 167 | 1237 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1238 #else |
| 167 | 1239 "psubusb %%mm4, %%mm6 \n\t" |
| 1240 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1241 "movq %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1242 "psrlq $16, %%mm6 \n\t" |
| 167 | 1243 "psubusb %%mm4, %%mm6 \n\t" |
| 1244 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1245 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1246 "psrlq $32, %%mm6 \n\t" |
| 167 | 1247 "psubusb %%mm4, %%mm6 \n\t" |
| 1248 "paddb %%mm4, %%mm6 \n\t" | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1249 #endif |
| 167 | 1250 "movq %%mm6, %%mm0 \n\t" // max |
| 1251 "psubb %%mm7, %%mm6 \n\t" // max - min | |
| 1252 "movd %%mm6, %%ecx \n\t" | |
| 210 | 1253 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
| 167 | 1254 " jb 1f \n\t" |
| 787 | 1255 "leal -24(%%esp), %%ecx \n\t" |
| 1256 "andl $0xFFFFFFF8, %%ecx \n\t" | |
| 167 | 1257 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1258 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1259 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1260 "punpcklbw %%mm7, %%mm7 \n\t" |
| 787 | 1261 "movq %%mm7, (%%ecx) \n\t" |
| 130 | 1262 |
| 1263 "movq (%0), %%mm0 \n\t" // L10 | |
| 1264 "movq %%mm0, %%mm1 \n\t" // L10 | |
| 1265 "movq %%mm0, %%mm2 \n\t" // L10 | |
| 1266 "psllq $8, %%mm1 \n\t" | |
| 1267 "psrlq $8, %%mm2 \n\t" | |
| 1268 "movd -4(%0), %%mm3 \n\t" | |
| 1269 "movd 8(%0), %%mm4 \n\t" | |
| 1270 "psrlq $24, %%mm3 \n\t" | |
| 1271 "psllq $56, %%mm4 \n\t" | |
| 1272 "por %%mm3, %%mm1 \n\t" // L00 | |
| 1273 "por %%mm4, %%mm2 \n\t" // L20 | |
| 1274 "movq %%mm1, %%mm3 \n\t" // L00 | |
| 1275 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
| 1276 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
| 1277 "psubusb %%mm7, %%mm0 \n\t" | |
| 1278 "psubusb %%mm7, %%mm2 \n\t" | |
| 1279 "psubusb %%mm7, %%mm3 \n\t" | |
| 210 | 1280 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
| 1281 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
| 1282 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
| 130 | 1283 "paddb %%mm2, %%mm0 \n\t" |
| 1284 "paddb %%mm3, %%mm0 \n\t" | |
| 1285 | |
| 1286 "movq (%%eax), %%mm2 \n\t" // L11 | |
| 1287 "movq %%mm2, %%mm3 \n\t" // L11 | |
| 1288 "movq %%mm2, %%mm4 \n\t" // L11 | |
| 1289 "psllq $8, %%mm3 \n\t" | |
| 1290 "psrlq $8, %%mm4 \n\t" | |
| 1291 "movd -4(%%eax), %%mm5 \n\t" | |
| 1292 "movd 8(%%eax), %%mm6 \n\t" | |
| 1293 "psrlq $24, %%mm5 \n\t" | |
| 1294 "psllq $56, %%mm6 \n\t" | |
| 1295 "por %%mm5, %%mm3 \n\t" // L01 | |
| 1296 "por %%mm6, %%mm4 \n\t" // L21 | |
| 1297 "movq %%mm3, %%mm5 \n\t" // L01 | |
| 1298 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
| 1299 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
| 1300 "psubusb %%mm7, %%mm2 \n\t" | |
| 1301 "psubusb %%mm7, %%mm4 \n\t" | |
| 1302 "psubusb %%mm7, %%mm5 \n\t" | |
| 210 | 1303 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
| 1304 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
| 1305 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
| 130 | 1306 "paddb %%mm4, %%mm2 \n\t" |
| 1307 "paddb %%mm5, %%mm2 \n\t" | |
| 1308 // 0, 2, 3, 1 | |
| 1309 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
| 1310 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
| 1311 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
| 1312 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
| 1313 "psllq $8, " #lx " \n\t"\ | |
| 1314 "psrlq $8, " #t0 " \n\t"\ | |
| 1315 "movd -4" #src ", " #t1 " \n\t"\ | |
| 1316 "psrlq $24, " #t1 " \n\t"\ | |
| 1317 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
| 1318 "movd 8" #src ", " #t1 " \n\t"\ | |
| 1319 "psllq $56, " #t1 " \n\t"\ | |
| 1320 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
| 1321 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
| 1322 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
| 1323 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
| 135 | 1324 PAVGB(lx, pplx) \ |
| 787 | 1325 "movq " #lx ", 8(%%ecx) \n\t"\ |
| 1326 "movq (%%ecx), " #lx " \n\t"\ | |
| 140 | 1327 "psubusb " #lx ", " #t1 " \n\t"\ |
| 1328 "psubusb " #lx ", " #t0 " \n\t"\ | |
| 1329 "psubusb " #lx ", " #sx " \n\t"\ | |
| 210 | 1330 "movq "MANGLE(b00)", " #lx " \n\t"\ |
| 140 | 1331 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
| 1332 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
| 1333 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
| 130 | 1334 "paddb " #t1 ", " #t0 " \n\t"\ |
| 1335 "paddb " #t0 ", " #sx " \n\t"\ | |
| 1336 \ | |
| 1337 PAVGB(plx, pplx) /* filtered */\ | |
| 1338 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
| 134 | 1339 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
| 787 | 1340 "psubusb %3, " #t0 " \n\t"\ |
| 1341 "paddusb %3, " #t1 " \n\t"\ | |
| 134 | 1342 PMAXUB(t0, pplx)\ |
| 1343 PMINUB(t1, pplx, t0)\ | |
| 130 | 1344 "paddb " #sx ", " #ppsx " \n\t"\ |
| 1345 "paddb " #psx ", " #ppsx " \n\t"\ | |
| 210 | 1346 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
| 1347 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
| 140 | 1348 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
| 134 | 1349 "pand " #ppsx ", " #pplx " \n\t"\ |
| 130 | 1350 "pandn " #dst ", " #ppsx " \n\t"\ |
| 140 | 1351 "por " #pplx ", " #ppsx " \n\t"\ |
| 135 | 1352 "movq " #ppsx ", " #dst " \n\t"\ |
| 787 | 1353 "movq 8(%%ecx), " #lx " \n\t" |
| 134 | 1354 |
| 130 | 1355 /* |
| 1356 0000000 | |
| 1357 1111111 | |
| 1358 | |
| 1359 1111110 | |
| 1360 1111101 | |
| 1361 1111100 | |
| 1362 1111011 | |
| 1363 1111010 | |
| 1364 1111001 | |
| 1365 | |
| 1366 1111000 | |
| 1367 1110111 | |
| 1368 | |
| 1369 */ | |
| 1370 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
| 1371 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1372 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1373 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 787 | 1374 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
| 1375 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1376 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1377 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1378 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1379 |
| 167 | 1380 "1: \n\t" |
| 787 | 1381 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
| 1382 : "%eax", "%edx", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1383 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1384 #else |
| 134 | 1385 int y; |
| 1386 int min=255; | |
| 1387 int max=0; | |
| 1388 int avg; | |
| 1389 uint8_t *p; | |
| 1390 int s[10]; | |
| 787 | 1391 const int QP2= c->QP/2 + 1; |
| 134 | 1392 |
| 1393 for(y=1; y<9; y++) | |
| 1394 { | |
| 1395 int x; | |
| 1396 p= src + stride*y; | |
| 1397 for(x=1; x<9; x++) | |
| 1398 { | |
| 1399 p++; | |
| 1400 if(*p > max) max= *p; | |
| 1401 if(*p < min) min= *p; | |
| 1402 } | |
| 1403 } | |
| 787 | 1404 avg= (min + max + 1)>>1; |
| 134 | 1405 |
| 167 | 1406 if(max - min <deringThreshold) return; |
| 1407 | |
| 134 | 1408 for(y=0; y<10; y++) |
| 1409 { | |
| 1410 int t = 0; | |
| 787 | 1411 |
| 1412 if(src[stride*y + 0] > avg) t+= 1; | |
| 1413 if(src[stride*y + 1] > avg) t+= 2; | |
| 1414 if(src[stride*y + 2] > avg) t+= 4; | |
| 1415 if(src[stride*y + 3] > avg) t+= 8; | |
| 1416 if(src[stride*y + 4] > avg) t+= 16; | |
| 1417 if(src[stride*y + 5] > avg) t+= 32; | |
| 1418 if(src[stride*y + 6] > avg) t+= 64; | |
| 1419 if(src[stride*y + 7] > avg) t+= 128; | |
| 1420 if(src[stride*y + 8] > avg) t+= 256; | |
| 1421 if(src[stride*y + 9] > avg) t+= 512; | |
| 1422 | |
| 134 | 1423 t |= (~t)<<16; |
| 1424 t &= (t<<1) & (t>>1); | |
| 1425 s[y] = t; | |
| 1426 } | |
| 787 | 1427 |
| 1428 for(y=1; y<9; y++) | |
| 1429 { | |
| 1430 int t = s[y-1] & s[y] & s[y+1]; | |
| 1431 t|= t>>16; | |
| 1432 s[y-1]= t; | |
| 1433 } | |
| 134 | 1434 |
| 1435 for(y=1; y<9; y++) | |
| 1436 { | |
| 1437 int x; | |
| 787 | 1438 int t = s[y-1]; |
| 134 | 1439 |
| 1440 p= src + stride*y; | |
| 1441 for(x=1; x<9; x++) | |
| 1442 { | |
| 1443 p++; | |
| 1444 if(t & (1<<x)) | |
| 1445 { | |
| 1446 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
| 1447 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
| 1448 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
| 1449 f= (f + 8)>>4; | |
| 1450 | |
| 167 | 1451 #ifdef DEBUG_DERING_THRESHOLD |
| 1452 asm volatile("emms\n\t":); | |
| 1453 { | |
| 1454 static long long numPixels=0; | |
| 1455 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1456 // if((max-min)<20 || (max-min)*QP<200) | |
| 1457 // if((max-min)*QP < 500) | |
| 1458 // if(max-min<QP/2) | |
| 1459 if(max-min < 20) | |
| 1460 { | |
| 1461 static int numSkiped=0; | |
| 1462 static int errorSum=0; | |
| 1463 static int worstQP=0; | |
| 1464 static int worstRange=0; | |
| 1465 static int worstDiff=0; | |
| 1466 int diff= (f - *p); | |
| 1467 int absDiff= ABS(diff); | |
| 1468 int error= diff*diff; | |
| 1469 | |
| 1470 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1471 | |
| 1472 numSkiped++; | |
| 1473 if(absDiff > worstDiff) | |
| 1474 { | |
| 1475 worstDiff= absDiff; | |
| 1476 worstQP= QP; | |
| 1477 worstRange= max-min; | |
| 1478 } | |
| 1479 errorSum+= error; | |
| 1480 | |
| 1481 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1482 { | |
| 1483 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1484 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1485 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1486 worstDiff, (float)numSkiped/numPixels); | |
| 1487 } | |
| 1488 } | |
| 1489 } | |
| 1490 #endif | |
| 787 | 1491 if (*p + QP2 < f) *p= *p + QP2; |
| 1492 else if(*p - QP2 > f) *p= *p - QP2; | |
| 134 | 1493 else *p=f; |
| 1494 } | |
| 1495 } | |
| 1496 } | |
| 167 | 1497 #ifdef DEBUG_DERING_THRESHOLD |
| 1498 if(max-min < 20) | |
| 1499 { | |
| 1500 for(y=1; y<9; y++) | |
| 1501 { | |
| 1502 int x; | |
| 1503 int t = 0; | |
| 1504 p= src + stride*y; | |
| 1505 for(x=1; x<9; x++) | |
| 1506 { | |
| 1507 p++; | |
| 1508 *p = MIN(*p + 20, 255); | |
| 1509 } | |
| 1510 } | |
| 1511 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1512 } | |
| 1513 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1514 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1515 } |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
1516 #endif //HAVE_ALTIVEC |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1517 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
/**
 * Deinterlaces the given block by linearly interpolating every second line.
 * Will be called for every 8x8 block and can read & write from line 4-15;
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too;
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced.
 *
 * Relative to src, lines 5, 7, 9 and 11 are each replaced by the average
 * (rounded up) of the line directly above and the line directly below;
 * 8 pixels per line are processed.
 *
 * @param src    first pixel of line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ecx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ecx	ecx+%1	ecx+2%1	%0+8%1	ecx+4%1

		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2) \n\t"
		"movq (%%ecx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ecx) \n\t"
		"movq (%0, %1, 8), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ecx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		: "%eax", "%ecx"
	);
#else
	int half;
	src+= 4*stride;

	// the 8 pixel wide block is handled as two groups of 4 packed bytes
	for(half=0; half<2; half++, src+= 4){
		int line;
		// odd lines (relative to the advanced src) are rebuilt from their
		// even neighbours, which are never written
		for(line=1; line<8; line+=2){
			const uint32_t above= *(uint32_t*)&src[stride*(line-1)];
			const uint32_t below= *(uint32_t*)&src[stride*(line+1)];
			// per byte (above + below + 1)>>1 without carries between bytes
			*(uint32_t*)&src[stride*line]= (above|below) - (((above^below)&0xFEFEFEFEUL)>>1);
		}
	}
#endif
}
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1569 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1570 /** |
| 1109 | 1571 * Deinterlaces the given block by cubic interpolating every second line. |
| 142 | 1572 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1573 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1574 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1575 * this filter will read lines 3-15 and write 7-13 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1576 */ |
| 169 | 1577 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1578 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1579 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1580 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1581 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1582 "leal (%0, %1), %%eax \n\t" |
| 787 | 1583 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1584 "leal (%%edx, %1, 4), %%ecx \n\t" | |
| 111 | 1585 "addl %1, %%ecx \n\t" |
| 1586 "pxor %%mm7, %%mm7 \n\t" | |
| 1587 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 787 | 1588 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1589 |
| 111 | 1590 #define DEINT_CUBIC(a,b,c,d,e)\ |
| 1591 "movq " #a ", %%mm0 \n\t"\ | |
| 1592 "movq " #b ", %%mm1 \n\t"\ | |
| 1593 "movq " #d ", %%mm2 \n\t"\ | |
| 1594 "movq " #e ", %%mm3 \n\t"\ | |
| 1595 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
| 1596 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ | |
| 1597 "movq %%mm0, %%mm2 \n\t"\ | |
| 1598 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1599 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
| 1600 "movq %%mm1, %%mm3 \n\t"\ | |
| 1601 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1602 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1603 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
| 1604 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
| 1605 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
| 1606 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
| 1607 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
| 1608 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
| 1609 "packuswb %%mm3, %%mm1 \n\t"\ | |
| 1610 "movq %%mm1, " #c " \n\t" | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1611 |
| 787 | 1612 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
| 1613 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) | |
| 1614 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) | |
| 1615 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1616 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1617 : : "r" (src), "r" (stride) |
| 787 | 1618 : "%eax", "%edx", "ecx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1619 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1620 #else |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1621 int x; |
| 142 | 1622 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1623 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1624 { |
| 1157 | 1625 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
| 1626 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
| 1627 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
| 1628 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1629 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1630 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1631 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1632 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1633 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1634 /** |
| 1109 | 1635 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
| 142 | 1636 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1637 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1638 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 787 | 1639 * this filter will read lines 4-13 and write 5-11 |
| 1640 */ | |
| 1641 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
| 1642 { | |
| 1643 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1644 src+= stride*4; | |
| 1645 asm volatile( | |
| 1646 "leal (%0, %1), %%eax \n\t" | |
| 1647 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1648 "pxor %%mm7, %%mm7 \n\t" | |
| 1649 "movq (%2), %%mm0 \n\t" | |
| 1650 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1651 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1652 | |
| 1653 #define DEINT_FF(a,b,c,d)\ | |
| 1654 "movq " #a ", %%mm1 \n\t"\ | |
| 1655 "movq " #b ", %%mm2 \n\t"\ | |
| 1656 "movq " #c ", %%mm3 \n\t"\ | |
| 1657 "movq " #d ", %%mm4 \n\t"\ | |
| 1658 PAVGB(%%mm3, %%mm1) \ | |
| 1659 PAVGB(%%mm4, %%mm0) \ | |
| 1660 "movq %%mm0, %%mm3 \n\t"\ | |
| 1661 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1662 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1663 "movq %%mm1, %%mm4 \n\t"\ | |
| 1664 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1665 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
| 1666 "psllw $2, %%mm1 \n\t"\ | |
| 1667 "psllw $2, %%mm4 \n\t"\ | |
| 1668 "psubw %%mm0, %%mm1 \n\t"\ | |
| 1669 "psubw %%mm3, %%mm4 \n\t"\ | |
| 1670 "movq %%mm2, %%mm5 \n\t"\ | |
| 1671 "movq %%mm2, %%mm0 \n\t"\ | |
| 1672 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1673 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1674 "paddw %%mm2, %%mm1 \n\t"\ | |
| 1675 "paddw %%mm5, %%mm4 \n\t"\ | |
| 1676 "psraw $2, %%mm1 \n\t"\ | |
| 1677 "psraw $2, %%mm4 \n\t"\ | |
| 1678 "packuswb %%mm4, %%mm1 \n\t"\ | |
| 1679 "movq %%mm1, " #b " \n\t"\ | |
| 1680 | |
| 1681 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) | |
| 1682 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) | |
| 1683 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) | |
| 1684 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) | |
| 1685 | |
| 1686 "movq %%mm0, (%2) \n\t" | |
| 1687 : : "r" (src), "r" (stride), "r"(tmp) | |
| 1688 : "%eax", "%edx" | |
| 1689 ); | |
| 1690 #else | |
| 1691 int x; | |
| 1692 src+= stride*4; | |
| 1693 for(x=0; x<8; x++) | |
| 1694 { | |
| 1695 int t1= tmp[x]; | |
| 1696 int t2= src[stride*1]; | |
| 1697 | |
| 1157 | 1698 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); |
| 787 | 1699 t1= src[stride*4]; |
| 1157 | 1700 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); |
| 787 | 1701 t2= src[stride*6]; |
| 1157 | 1702 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); |
| 787 | 1703 t1= src[stride*8]; |
| 1157 | 1704 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
| 787 | 1705 tmp[x]= t1; |
| 1706 | |
| 1707 src++; | |
| 1708 } | |
| 1709 #endif | |
| 1710 } | |
| 1711 | |
| 1712 /** | |
| 1157 | 1713 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
| 1714 * will be called for every 8x8 block and can read & write from line 4-15 | |
| 1715 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1716 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1717 * this filter will read lines 4-13 and write 4-11 | |
| 1718 */ | |
| 1719 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
| 1720 { | |
| 1721 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1722 src+= stride*4; | |
| 1723 asm volatile( | |
| 1724 "leal (%0, %1), %%eax \n\t" | |
| 1725 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1726 "pxor %%mm7, %%mm7 \n\t" | |
| 1727 "movq (%2), %%mm0 \n\t" | |
| 1728 "movq (%3), %%mm1 \n\t" | |
| 1729 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1730 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1731 | |
| 1732 #define DEINT_L5(t1,t2,a,b,c)\ | |
| 1733 "movq " #a ", %%mm2 \n\t"\ | |
| 1734 "movq " #b ", %%mm3 \n\t"\ | |
| 1735 "movq " #c ", %%mm4 \n\t"\ | |
| 1736 PAVGB(t2, %%mm3) \ | |
| 1737 PAVGB(t1, %%mm4) \ | |
| 1738 "movq %%mm2, %%mm5 \n\t"\ | |
| 1739 "movq %%mm2, " #t1 " \n\t"\ | |
| 1740 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1741 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1742 "movq %%mm2, %%mm6 \n\t"\ | |
| 1743 "paddw %%mm2, %%mm2 \n\t"\ | |
| 1744 "paddw %%mm6, %%mm2 \n\t"\ | |
| 1745 "movq %%mm5, %%mm6 \n\t"\ | |
| 1746 "paddw %%mm5, %%mm5 \n\t"\ | |
| 1747 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1748 "movq %%mm3, %%mm6 \n\t"\ | |
| 1749 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
| 1750 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1751 "paddw %%mm3, %%mm3 \n\t"\ | |
| 1752 "paddw %%mm6, %%mm6 \n\t"\ | |
| 1753 "paddw %%mm3, %%mm2 \n\t"\ | |
| 1754 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1755 "movq %%mm4, %%mm6 \n\t"\ | |
| 1756 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
| 1757 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1758 "psubw %%mm4, %%mm2 \n\t"\ | |
| 1759 "psubw %%mm6, %%mm5 \n\t"\ | |
| 1760 "psraw $2, %%mm2 \n\t"\ | |
| 1761 "psraw $2, %%mm5 \n\t"\ | |
| 1762 "packuswb %%mm5, %%mm2 \n\t"\ | |
| 1763 "movq %%mm2, " #a " \n\t"\ | |
| 1764 | |
| 1765 DEINT_L5(%%mm0, %%mm1, (%0) , (%%eax) , (%%eax, %1) ) | |
| 1766 DEINT_L5(%%mm1, %%mm0, (%%eax) , (%%eax, %1) , (%%eax, %1, 2)) | |
| 1767 DEINT_L5(%%mm0, %%mm1, (%%eax, %1) , (%%eax, %1, 2), (%0, %1, 4) ) | |
| 1768 DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4) , (%%edx) ) | |
| 1769 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%edx) , (%%edx, %1) ) | |
| 1770 DEINT_L5(%%mm1, %%mm0, (%%edx) , (%%edx, %1) , (%%edx, %1, 2)) | |
| 1771 DEINT_L5(%%mm0, %%mm1, (%%edx, %1) , (%%edx, %1, 2), (%0, %1, 8) ) | |
| 1772 DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8) , (%%edx, %1, 4)) | |
| 1773 | |
| 1774 "movq %%mm0, (%2) \n\t" | |
| 1775 "movq %%mm1, (%3) \n\t" | |
| 1776 : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2) | |
| 1777 : "%eax", "%edx" | |
| 1778 ); | |
| 1779 #else | |
| 1780 int x; | |
| 1781 src+= stride*4; | |
| 1782 for(x=0; x<8; x++) | |
| 1783 { | |
| 1784 int t1= tmp[x]; | |
| 1785 int t2= tmp2[x]; | |
| 1786 int t3= src[0]; | |
| 1787 | |
| 1788 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
| 1789 t1= src[stride*1]; | |
| 1790 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
| 1791 t2= src[stride*2]; | |
| 1792 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
| 1793 t3= src[stride*3]; | |
| 1794 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
| 1795 t1= src[stride*4]; | |
| 1796 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
| 1797 t2= src[stride*5]; | |
| 1798 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
| 1799 t3= src[stride*6]; | |
| 1800 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
| 1801 t1= src[stride*7]; | |
| 1802 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
| 1803 | |
| 1804 tmp[x]= t3; | |
| 1805 tmp2[x]= t1; | |
| 1806 | |
| 1807 src++; | |
| 1808 } | |
| 1809 #endif | |
| 1810 } | |
| 1811 | |
| 1812 /** | |
| 1109 | 1813 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
| 787 | 1814 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1815 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1816 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 142 | 1817 * this filter will read lines 4-13 and write 4-11 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1818 */ |
| 1581 | 1819 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1820 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1821 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1822 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1823 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1824 "leal (%0, %1), %%eax \n\t" |
| 787 | 1825 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1826 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1827 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1828 |
| 1581 | 1829 "movq (%2), %%mm0 \n\t" // L0 |
| 1830 "movq (%%eax), %%mm1 \n\t" // L2 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1831 PAVGB(%%mm1, %%mm0) // L0+L2 |
| 1581 | 1832 "movq (%0), %%mm2 \n\t" // L1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1833 PAVGB(%%mm2, %%mm0) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1834 "movq %%mm0, (%0) \n\t" |
| 1581 | 1835 "movq (%%eax, %1), %%mm0 \n\t" // L3 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1836 PAVGB(%%mm0, %%mm2) // L1+L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1837 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1838 "movq %%mm2, (%%eax) \n\t" |
| 1581 | 1839 "movq (%%eax, %1, 2), %%mm2 \n\t" // L4 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1840 PAVGB(%%mm2, %%mm1) // L2+L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1841 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1842 "movq %%mm1, (%%eax, %1) \n\t" |
| 1581 | 1843 "movq (%0, %1, 4), %%mm1 \n\t" // L5 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1844 PAVGB(%%mm1, %%mm0) // L3+L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1845 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1846 "movq %%mm0, (%%eax, %1, 2) \n\t" |
| 1581 | 1847 "movq (%%edx), %%mm0 \n\t" // L6 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1848 PAVGB(%%mm0, %%mm2) // L4+L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1849 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1850 "movq %%mm2, (%0, %1, 4) \n\t" |
| 1581 | 1851 "movq (%%edx, %1), %%mm2 \n\t" // L7 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1852 PAVGB(%%mm2, %%mm1) // L5+L7 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1853 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
| 787 | 1854 "movq %%mm1, (%%edx) \n\t" |
| 1581 | 1855 "movq (%%edx, %1, 2), %%mm1 \n\t" // L8 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1856 PAVGB(%%mm1, %%mm0) // L6+L8 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1857 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
| 787 | 1858 "movq %%mm0, (%%edx, %1) \n\t" |
| 1581 | 1859 "movq (%0, %1, 8), %%mm0 \n\t" // L9 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1860 PAVGB(%%mm0, %%mm2) // L7+L9 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1861 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
| 787 | 1862 "movq %%mm2, (%%edx, %1, 2) \n\t" |
| 1581 | 1863 "movq %%mm1, (%2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1864 |
| 1581 | 1865 : : "r" (src), "r" (stride), "r" (tmp) |
| 787 | 1866 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1867 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1868 #else |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1869 int a, b, c, x; |
| 142 | 1870 src+= 4*stride; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1871 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1872 for(x=0; x<2; x++){ |
| 1581 | 1873 a= *(uint32_t*)&tmp[stride*0]; |
| 1874 b= *(uint32_t*)&src[stride*0]; | |
| 1875 c= *(uint32_t*)&src[stride*1]; | |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1876 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1877 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1878 |
| 1581 | 1879 a= *(uint32_t*)&src[stride*2]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1880 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1881 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1882 |
| 1581 | 1883 b= *(uint32_t*)&src[stride*3]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1884 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1885 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1886 |
| 1581 | 1887 c= *(uint32_t*)&src[stride*4]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1888 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1889 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1890 |
| 1581 | 1891 a= *(uint32_t*)&src[stride*5]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1892 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1893 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1894 |
| 1581 | 1895 b= *(uint32_t*)&src[stride*6]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1896 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1897 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1898 |
| 1581 | 1899 c= *(uint32_t*)&src[stride*7]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1900 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1901 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1902 |
| 1581 | 1903 a= *(uint32_t*)&src[stride*8]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1904 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1905 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1906 |
| 1581 | 1907 *(uint32_t*)&tmp[stride*0]= c; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1908 src += 4; |
| 1581 | 1909 tmp += 4; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1910 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1911 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1912 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1913 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1914 /** |
| 1109 | 1915 * Deinterlaces the given block by applying a median filter to every second line. |
| 142 | 1916 * will be called for every 8x8 block and can read & write from line 4-15, |
| 1917 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1918 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1919 */ |
| 169 | 1920 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1921 { |
| 107 | 1922 #ifdef HAVE_MMX |
| 142 | 1923 src+= 4*stride; |
| 107 | 1924 #ifdef HAVE_MMX2 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1925 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1926 "leal (%0, %1), %%eax \n\t" |
| 787 | 1927 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1928 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1929 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1930 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1931 "movq (%0), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1932 "movq (%%eax, %1), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1933 "movq (%%eax), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1934 "movq %%mm0, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1935 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1936 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1937 "pmaxub %%mm2, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1938 "pminub %%mm1, %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1939 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1940 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1941 "movq (%0, %1, 4), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1942 "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1943 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1944 "pmaxub %%mm1, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1945 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1946 "pmaxub %%mm0, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1947 "pminub %%mm1, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1948 "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1949 |
| 787 | 1950 "movq (%%edx), %%mm2 \n\t" // |
| 1951 "movq (%%edx, %1), %%mm1 \n\t" // | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1952 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1953 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1954 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1955 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1956 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1957 "movq %%mm2, (%%edx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1958 |
| 787 | 1959 "movq (%%edx, %1, 2), %%mm2 \n\t" // |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1960 "movq (%0, %1, 8), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1961 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1962 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1963 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1964 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1965 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1966 "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1967 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1968 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1969 : : "r" (src), "r" (stride) |
| 787 | 1970 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1971 ); |
| 107 | 1972 |
| 1973 #else // MMX without MMX2 | |
| 1974 asm volatile( | |
| 1975 "leal (%0, %1), %%eax \n\t" | |
| 787 | 1976 "leal (%%eax, %1, 4), %%edx \n\t" |
| 107 | 1977 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1978 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
| 107 | 1979 "pxor %%mm7, %%mm7 \n\t" |
| 1980 | |
| 1981 #define MEDIAN(a,b,c)\ | |
| 1982 "movq " #a ", %%mm0 \n\t"\ | |
| 1983 "movq " #b ", %%mm2 \n\t"\ | |
| 1984 "movq " #c ", %%mm1 \n\t"\ | |
| 1985 "movq %%mm0, %%mm3 \n\t"\ | |
| 1986 "movq %%mm1, %%mm4 \n\t"\ | |
| 1987 "movq %%mm2, %%mm5 \n\t"\ | |
| 1988 "psubusb %%mm1, %%mm3 \n\t"\ | |
| 1989 "psubusb %%mm2, %%mm4 \n\t"\ | |
| 1990 "psubusb %%mm0, %%mm5 \n\t"\ | |
| 1991 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
| 1992 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
| 1993 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
| 1994 "movq %%mm3, %%mm6 \n\t"\ | |
| 1995 "pxor %%mm4, %%mm3 \n\t"\ | |
| 1996 "pxor %%mm5, %%mm4 \n\t"\ | |
| 1997 "pxor %%mm6, %%mm5 \n\t"\ | |
| 1998 "por %%mm3, %%mm1 \n\t"\ | |
| 1999 "por %%mm4, %%mm2 \n\t"\ | |
| 2000 "por %%mm5, %%mm0 \n\t"\ | |
| 2001 "pand %%mm2, %%mm0 \n\t"\ | |
| 2002 "pand %%mm1, %%mm0 \n\t"\ | |
| 2003 "movq %%mm0, " #b " \n\t" | |
| 2004 | |
| 2005 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
| 2006 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
| 787 | 2007 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
| 2008 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) | |
| 107 | 2009 |
| 2010 : : "r" (src), "r" (stride) | |
| 787 | 2011 : "%eax", "%edx" |
| 107 | 2012 ); |
| 2013 #endif // MMX | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2014 #else |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2015 int x, y; |
| 142 | 2016 src+= 4*stride; |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2017 // FIXME - there should be a way to do a few columns in parallel like w/mmx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2018 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2019 { |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2020 uint8_t *colsrc = src; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2021 for (y=0; y<4; y++) |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2022 { |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2023 int a, b, c, d, e, f; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2024 a = colsrc[0 ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2025 b = colsrc[stride ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2026 c = colsrc[stride*2]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2027 d = (a-b)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2028 e = (b-c)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2029 f = (c-a)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2030 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2031 colsrc += stride*2; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2032 } |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2033 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2034 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2035 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2036 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2037 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2038 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2039 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2040 * transposes and shifts the given 8x8 block into dst1 and dst2 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2041 */ |
| 169 | 2042 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2043 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2044 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2045 "leal (%0, %1), %%eax \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2046 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2047 // %0 eax eax+%1 eax+2%1 %0+4%1 eax+4%1 eax+5%1 eax+6%1 %0+8%1 eax+8%1 (relative to the initial eax; eax is advanced by 4*%1 below, edx is not used here) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2048 "movq (%0), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2049 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2050 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2051 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2052 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2053 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2054 "movq (%%eax, %1), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2055 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2056 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2057 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2058 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2059 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2060 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2061 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2062 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2063 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2064 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2065 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2066 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2067 "movd %%mm0, 128(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2068 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2069 "movd %%mm0, 144(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2070 "movd %%mm3, 160(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2071 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2072 "movd %%mm3, 176(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2073 "movd %%mm3, 48(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2074 "movd %%mm2, 192(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2075 "movd %%mm2, 64(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2076 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2077 "movd %%mm2, 80(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2078 "movd %%mm1, 96(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2079 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2080 "movd %%mm1, 112(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2081 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2082 "leal (%%eax, %1, 4), %%eax \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2083 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2084 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2085 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2086 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2087 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2088 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2089 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2090 "movq (%%eax, %1), %%mm1 \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2091 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2092 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2093 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2094 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2095 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2096 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2097 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2098 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2099 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2100 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2101 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2102 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2103 "movd %%mm0, 132(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2104 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2105 "movd %%mm0, 148(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2106 "movd %%mm3, 164(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2107 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2108 "movd %%mm3, 180(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2109 "movd %%mm3, 52(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2110 "movd %%mm2, 196(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2111 "movd %%mm2, 68(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2112 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2113 "movd %%mm2, 84(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2114 "movd %%mm1, 100(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2115 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2116 "movd %%mm1, 116(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2117 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2118 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2119 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2120 : "%eax" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2121 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2122 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2123 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2124 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2125 * transposes the given 8x8 block |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2126 */ |
| 169 | 2127 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2128 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2129 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2130 "leal (%0, %1), %%eax \n\t" |
| 787 | 2131 "leal (%%eax, %1, 4), %%edx \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2132 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2133 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2134 "movq (%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2135 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2136 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2137 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2138 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2139 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2140 "movq 32(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2141 "movq 48(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2142 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2143 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2144 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2145 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2146 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2147 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2148 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2149 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2150 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2151 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2152 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2153 "movd %%mm0, (%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2154 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2155 "movd %%mm0, (%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2156 "movd %%mm3, (%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2157 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2158 "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2159 "movd %%mm2, (%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2160 "psrlq $32, %%mm2 \n\t" |
| 787 | 2161 "movd %%mm2, (%%edx) \n\t" |
| 2162 "movd %%mm1, (%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2163 "psrlq $32, %%mm1 \n\t" |
| 787 | 2164 "movd %%mm1, (%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2165 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2166 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2167 "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2168 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2169 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2170 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2171 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2172 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2173 "movq 96(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2174 "movq 112(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2175 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2176 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2177 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2178 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2179 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2180 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2181 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2182 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2183 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2184 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2185 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2186 "movd %%mm0, 4(%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2187 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2188 "movd %%mm0, 4(%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2189 "movd %%mm3, 4(%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2190 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2191 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2192 "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2193 "psrlq $32, %%mm2 \n\t" |
| 787 | 2194 "movd %%mm2, 4(%%edx) \n\t" |
| 2195 "movd %%mm1, 4(%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2196 "psrlq $32, %%mm1 \n\t" |
| 787 | 2197 "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2198 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2199 :: "r" (dst), "r" (dstStride), "r" (src) |
| 787 | 2200 : "%eax", "%edx" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2201 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2202 } |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2203 #endif |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2204 //static int test=0; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2205 |
| 2041 | 2206 #ifndef HAVE_ALTIVEC |
| 943 | 2207 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 158 | 2208 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
| 156 | 2209 { |
| 787 | 2210 // to save a register (FIXME do this outside of the loops) |
| 2211 tempBluredPast[127]= maxNoise[0]; | |
| 2212 tempBluredPast[128]= maxNoise[1]; | |
| 2213 tempBluredPast[129]= maxNoise[2]; | |
| 2214 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2215 #define FAST_L2_DIFF |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2216 //#define L1_DIFF //you should change the thresholds too if you try that one |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2217 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2218 asm volatile( |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2219 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
| 787 | 2220 "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
| 2221 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2222 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2223 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2224 //FIXME reorder? |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2225 #ifdef L1_DIFF //needs mmx2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2226 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2227 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2228 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2229 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2230 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2231 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2232 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2233 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2234 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2235 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2236 "paddw %%mm1, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2237 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
| 787 | 2238 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2239 "paddw %%mm2, %%mm0 \n\t" |
| 787 | 2240 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2241 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2242 "paddw %%mm3, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2243 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2244 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2245 "paddw %%mm4, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 "paddw %%mm5, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 "paddw %%mm7, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2249 "paddw %%mm6, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2250 #elif defined (FAST_L2_DIFF) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2251 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 210 | 2252 "movq "MANGLE(b80)", %%mm6 \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2253 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2254 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2257 "pxor %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2258 PAVGB(%%mm2, %%mm5)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2259 "paddb %%mm6, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2260 "movq %%mm5, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2261 "psllw $8, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2262 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2263 "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2264 "paddd %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2265 "psrld $14, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2266 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2267 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2268 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2269 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2270 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2271 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2272 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2273 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2274 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2275 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2276 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2277 #else |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2278 "pxor %%mm7, %%mm7 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2279 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2280 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2281 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2282 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2283 "movq %%mm5, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2284 "movq %%mm2, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2285 "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2286 "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2287 "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2288 "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2289 "psubw %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2290 "psubw %%mm3, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2291 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2292 "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2293 "paddd %%mm1, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2294 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2295 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2296 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2297 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2298 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2300 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2301 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2302 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2303 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2304 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2305 #endif |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2306 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2307 "movq %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2308 "psrlq $32, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2309 "paddd %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2310 "movd %%mm4, %%ecx \n\t" |
| 158 | 2311 "shll $2, %%ecx \n\t" |
| 787 | 2312 "movl %3, %%edx \n\t" |
| 2313 "addl -4(%%edx), %%ecx \n\t" | |
| 2314 "addl 4(%%edx), %%ecx \n\t" | |
| 2315 "addl -1024(%%edx), %%ecx \n\t" | |
| 158 | 2316 "addl $4, %%ecx \n\t" |
| 787 | 2317 "addl 1024(%%edx), %%ecx \n\t" |
| 158 | 2318 "shrl $3, %%ecx \n\t" |
| 787 | 2319 "movl %%ecx, (%%edx) \n\t" |
| 158 | 2320 |
| 210 | 2321 // "movl %3, %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2322 // "movl %%ecx, test \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2323 // "jmp 4f \n\t" |
| 787 | 2324 "cmpl 512(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2325 " jb 2f \n\t" |
| 787 | 2326 "cmpl 516(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2327 " jb 1f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2328 |
| 787 | 2329 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2330 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2332 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2333 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2334 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2335 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
| 787 | 2336 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2337 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2338 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2339 "movq %%mm0, (%1) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2340 "movq %%mm1, (%1, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2341 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2342 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2343 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
| 787 | 2344 "movq %%mm5, (%1, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2345 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2346 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2347 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2348 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2349 "1: \n\t" |
| 787 | 2350 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2351 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2352 "movq (%0), %%mm0 \n\t" // L0 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2353 PAVGB((%1), %%mm0) // L0 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2354 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2355 PAVGB((%1, %2), %%mm1) // L1 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2356 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2357 PAVGB((%1, %2, 2), %%mm2) // L2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2358 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2359 PAVGB((%1, %%eax), %%mm3) // L3 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2360 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2361 PAVGB((%1, %2, 4), %%mm4) // L4 |
| 787 | 2362 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
| 2363 PAVGB((%1, %%edx), %%mm5) // L5 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2364 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2365 PAVGB((%1, %%eax, 2), %%mm6) // L6 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2366 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2367 PAVGB((%1, %%ecx), %%mm7) // L7 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2368 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2369 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2370 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2371 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2372 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
| 787 | 2373 "movq %%mm5, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2374 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2375 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2376 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2377 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2378 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2379 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2380 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
| 787 | 2381 "movq %%mm5, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2382 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2383 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2384 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2385 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2386 "2: \n\t" |
| 787 | 2387 "cmpl 508(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2388 " jb 3f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2389 |
| 787 | 2390 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2391 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2392 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2393 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2394 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2395 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2396 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2397 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2398 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2399 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2400 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2401 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2402 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2403 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2404 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2405 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2406 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2407 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2408 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2409 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2410 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2411 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2412 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2413 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2414 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2415 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2416 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2417 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2418 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2419 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2420 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2421 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2422 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2423 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2424 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2425 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2426 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2427 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2428 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2429 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2430 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2431 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2432 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2433 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2434 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2435 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2436 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2437 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2438 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2439 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2440 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2441 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2442 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2443 "3: \n\t" |
| 787 | 2444 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2445 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2446 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2447 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2448 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2449 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2450 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2451 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2452 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2453 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2454 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2455 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2456 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2457 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2458 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2459 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2460 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2461 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2462 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2463 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2464 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2465 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2466 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2467 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2468 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2469 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2470 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2471 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2472 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2473 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2474 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2475 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2476 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2477 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2478 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2479 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2480 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2481 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2482 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2483 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2484 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2485 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2486 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2487 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2488 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2489 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2490 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2491 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2492 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2493 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2494 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2495 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2496 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2497 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2498 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2499 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2500 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2501 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2502 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2503 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2504 "4: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2505 |
| 158 | 2506 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
| 787 | 2507 : "%eax", "%edx", "%ecx", "memory" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2508 ); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2509 //printf("%d\n", test); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2510 #else |
| 788 | 2511 { |
| 156 | 2512 int y; |
| 2513 int d=0; | |
| 2041 | 2514 // int sysd=0; |
| 158 | 2515 int i; |
| 156 | 2516 |
| 2517 for(y=0; y<8; y++) | |
| 2518 { | |
| 2519 int x; | |
| 2520 for(x=0; x<8; x++) | |
| 2521 { | |
| 2522 int ref= tempBlured[ x + y*stride ]; | |
| 2523 int cur= src[ x + y*stride ]; | |
| 2524 int d1=ref - cur; | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2525 // if(x==0 || x==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2526 // if(y==0 || y==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2527 // d+= ABS(d1); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2528 d+= d1*d1; |
| 2041 | 2529 // sysd+= d1; |
| 156 | 2530 } |
| 2531 } | |
| 158 | 2532 i=d; |
| 2533 d= ( | |
| 2534 4*d | |
| 2535 +(*(tempBluredPast-256)) | |
| 2536 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 2537 +(*(tempBluredPast+256)) | |
| 2538 +4)>>3; | |
| 2539 *tempBluredPast=i; | |
| 2540 // ((*tempBluredPast)*3 + d + 2)>>2; | |
| 2541 | |
| 156 | 2542 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
| 2543 /* | |
| 2544 Switch between | |
| 2545 1 0 0 0 0 0 0 (0) | |
| 2546 64 32 16 8 4 2 1 (1) | |
| 2547 64 48 36 27 20 15 11 (33) (approx) | |
| 2548 64 56 49 43 37 33 29 (200) (approx) | |
| 2549 */ | |
| 2550 if(d > maxNoise[1]) | |
| 2551 { | |
| 2552 if(d < maxNoise[2]) | |
| 2553 { | |
| 2554 for(y=0; y<8; y++) | |
| 2555 { | |
| 2556 int x; | |
| 2557 for(x=0; x<8; x++) | |
| 2558 { | |
| 2559 int ref= tempBlured[ x + y*stride ]; | |
| 2560 int cur= src[ x + y*stride ]; | |
| 2561 tempBlured[ x + y*stride ]= | |
| 2562 src[ x + y*stride ]= | |
| 2563 (ref + cur + 1)>>1; | |
| 2564 } | |
| 2565 } | |
| 2566 } | |
| 2567 else | |
| 2568 { | |
| 2569 for(y=0; y<8; y++) | |
| 2570 { | |
| 2571 int x; | |
| 2572 for(x=0; x<8; x++) | |
| 2573 { | |
| 2574 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
| 2575 } | |
| 2576 } | |
| 2577 } | |
| 2578 } | |
| 2579 else | |
| 2580 { | |
| 2581 if(d < maxNoise[0]) | |
| 2582 { | |
| 2583 for(y=0; y<8; y++) | |
| 2584 { | |
| 2585 int x; | |
| 2586 for(x=0; x<8; x++) | |
| 2587 { | |
| 2588 int ref= tempBlured[ x + y*stride ]; | |
| 2589 int cur= src[ x + y*stride ]; | |
| 2590 tempBlured[ x + y*stride ]= | |
| 2591 src[ x + y*stride ]= | |
| 2592 (ref*7 + cur + 4)>>3; | |
| 2593 } | |
| 2594 } | |
| 2595 } | |
| 2596 else | |
| 2597 { | |
| 2598 for(y=0; y<8; y++) | |
| 2599 { | |
| 2600 int x; | |
| 2601 for(x=0; x<8; x++) | |
| 2602 { | |
| 2603 int ref= tempBlured[ x + y*stride ]; | |
| 2604 int cur= src[ x + y*stride ]; | |
| 2605 tempBlured[ x + y*stride ]= | |
| 2606 src[ x + y*stride ]= | |
| 2607 (ref*3 + cur + 2)>>2; | |
| 2608 } | |
| 2609 } | |
| 2610 } | |
| 2611 } | |
| 788 | 2612 } |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2613 #endif |
| 156 | 2614 } |
| 2041 | 2615 #endif //HAVE_ALTIVEC |
| 156 | 2616 |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2617 #ifdef HAVE_MMX |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2618 /** |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2619 * accurate deblock filter |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2620 */ |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2621 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2622 int64_t dc_mask, eq_mask; |
| 2040 | 2623 int64_t sums[10*8*2]; |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2624 src+= step*3; // src points to begin of the 8x8 Block |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2625 //START_TIMER |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2626 asm volatile( |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2627 "movq %0, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2628 "movq %1, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2629 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2630 ); |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2631 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2632 asm volatile( |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2633 "leal (%2, %3), %%eax \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2634 // 0 1 2 3 4 5 6 7 8 9 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2635 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2636 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2637 "movq (%2), %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2638 "movq (%%eax), %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2639 "movq %%mm1, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2640 "movq %%mm1, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2641 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2642 "paddb %%mm7, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2643 "pcmpgtb %%mm6, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2644 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2645 "movq (%%eax,%3), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2646 PMAXUB(%%mm2, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2647 PMINUB(%%mm2, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2648 "psubb %%mm2, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2649 "paddb %%mm7, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2650 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2651 "paddb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2652 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2653 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2654 PMAXUB(%%mm1, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2655 PMINUB(%%mm1, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2656 "psubb %%mm1, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2657 "paddb %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2658 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2659 "paddb %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2660 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2661 "leal (%%eax, %3, 4), %%eax \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2662 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2663 "movq (%2, %3, 4), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2664 PMAXUB(%%mm2, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2665 PMINUB(%%mm2, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2666 "psubb %%mm2, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2667 "paddb %%mm7, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2668 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2669 "paddb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2670 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2671 "movq (%%eax), %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2672 PMAXUB(%%mm1, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2673 PMINUB(%%mm1, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2674 "psubb %%mm1, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2675 "paddb %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2676 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2677 "paddb %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2678 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2679 "movq (%%eax, %3), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2680 PMAXUB(%%mm2, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2681 PMINUB(%%mm2, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2682 "psubb %%mm2, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2683 "paddb %%mm7, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2684 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2685 "paddb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2686 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2687 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2688 PMAXUB(%%mm1, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2689 PMINUB(%%mm1, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2690 "psubb %%mm1, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2691 "paddb %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2692 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2693 "paddb %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2694 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2695 "movq (%2, %3, 8), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2696 PMAXUB(%%mm2, %%mm4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2697 PMINUB(%%mm2, %%mm3, %%mm5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2698 "psubb %%mm2, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2699 "paddb %%mm7, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2700 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2701 "paddb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2702 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2703 "movq (%%eax, %3, 4), %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2704 "psubb %%mm1, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2705 "paddb %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2706 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2707 "paddb %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2708 "psubusb %%mm3, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2709 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2710 "movq %4, %%mm7 \n\t" // QP,..., QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2711 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2712 "pcmpgtb %%mm4, %%mm7 \n\t" // Diff < 2QP -> FF |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2713 "movq %%mm7, %1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2714 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2715 "pxor %%mm6, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2716 "movq %5, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2717 "punpcklbw %%mm7, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2718 "punpcklbw %%mm7, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2719 "punpcklbw %%mm7, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2720 "psubb %%mm0, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2721 "pcmpgtb %%mm7, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2722 "movq %%mm6, %0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2723 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2724 : "=m" (eq_mask), "=m" (dc_mask) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2725 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2726 : "%eax" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2727 ); |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2728 |
| 2040 | 2729 if(dc_mask & eq_mask){ |
| 2730 int offset= -8*step; | |
| 2731 int64_t *temp_sums= sums; | |
| 2732 | |
| 2733 asm volatile( | |
| 2734 "movq %2, %%mm0 \n\t" // QP,..., QP | |
| 2735 "pxor %%mm4, %%mm4 \n\t" | |
| 2736 | |
| 2737 "movq (%0), %%mm6 \n\t" | |
| 2738 "movq (%0, %1), %%mm5 \n\t" | |
| 2739 "movq %%mm5, %%mm1 \n\t" | |
| 2740 "movq %%mm6, %%mm2 \n\t" | |
| 2741 "psubusb %%mm6, %%mm5 \n\t" | |
| 2742 "psubusb %%mm1, %%mm2 \n\t" | |
| 2743 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 2744 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
| 2745 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
| 2746 | |
| 2747 "pxor %%mm6, %%mm1 \n\t" | |
| 2748 "pand %%mm0, %%mm1 \n\t" | |
| 2749 "pxor %%mm1, %%mm6 \n\t" | |
| 2750 // 0:QP 6:First | |
| 2751 | |
| 2752 "movq (%0, %1, 8), %%mm5 \n\t" | |
| 2753 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | |
| 2754 "movq (%0, %1, 8), %%mm7 \n\t" | |
| 2755 "movq %%mm5, %%mm1 \n\t" | |
| 2756 "movq %%mm7, %%mm2 \n\t" | |
| 2757 "psubusb %%mm7, %%mm5 \n\t" | |
| 2758 "psubusb %%mm1, %%mm2 \n\t" | |
| 2759 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 2760 "movq %2, %%mm0 \n\t" // QP,..., QP | |
| 2761 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
| 2762 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
| 2763 | |
| 2764 "pxor %%mm7, %%mm1 \n\t" | |
| 2765 "pand %%mm0, %%mm1 \n\t" | |
| 2766 "pxor %%mm1, %%mm7 \n\t" | |
| 2767 | |
| 2768 "movq %%mm6, %%mm5 \n\t" | |
| 2769 "punpckhbw %%mm4, %%mm6 \n\t" | |
| 2770 "punpcklbw %%mm4, %%mm5 \n\t" | |
| 2771 // 4:0 5/6:First 7:Last | |
| 2772 | |
| 2773 "movq %%mm5, %%mm0 \n\t" | |
| 2774 "movq %%mm6, %%mm1 \n\t" | |
| 2775 "psllw $2, %%mm0 \n\t" | |
| 2776 "psllw $2, %%mm1 \n\t" | |
| 2777 "paddw "MANGLE(w04)", %%mm0 \n\t" | |
| 2778 "paddw "MANGLE(w04)", %%mm1 \n\t" | |
| 2779 | |
| 2780 #define NEXT\ | |
| 2781 "movq (%0), %%mm2 \n\t"\ | |
| 2782 "movq (%0), %%mm3 \n\t"\ | |
| 2783 "addl %1, %0 \n\t"\ | |
| 2784 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
| 2785 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
| 2786 "paddw %%mm2, %%mm0 \n\t"\ | |
| 2787 "paddw %%mm3, %%mm1 \n\t" | |
| 2788 | |
| 2789 #define PREV\ | |
| 2790 "movq (%0), %%mm2 \n\t"\ | |
| 2791 "movq (%0), %%mm3 \n\t"\ | |
| 2792 "addl %1, %0 \n\t"\ | |
| 2793 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
| 2794 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
| 2795 "psubw %%mm2, %%mm0 \n\t"\ | |
| 2796 "psubw %%mm3, %%mm1 \n\t" | |
| 2797 | |
| 2798 | |
| 2799 NEXT //0 | |
| 2800 NEXT //1 | |
| 2801 NEXT //2 | |
| 2802 "movq %%mm0, (%3) \n\t" | |
| 2803 "movq %%mm1, 8(%3) \n\t" | |
| 2804 | |
| 2805 NEXT //3 | |
| 2806 "psubw %%mm5, %%mm0 \n\t" | |
| 2807 "psubw %%mm6, %%mm1 \n\t" | |
| 2808 "movq %%mm0, 16(%3) \n\t" | |
| 2809 "movq %%mm1, 24(%3) \n\t" | |
| 2810 | |
| 2811 NEXT //4 | |
| 2812 "psubw %%mm5, %%mm0 \n\t" | |
| 2813 "psubw %%mm6, %%mm1 \n\t" | |
| 2814 "movq %%mm0, 32(%3) \n\t" | |
| 2815 "movq %%mm1, 40(%3) \n\t" | |
| 2816 | |
| 2817 NEXT //5 | |
| 2818 "psubw %%mm5, %%mm0 \n\t" | |
| 2819 "psubw %%mm6, %%mm1 \n\t" | |
| 2820 "movq %%mm0, 48(%3) \n\t" | |
| 2821 "movq %%mm1, 56(%3) \n\t" | |
| 2822 | |
| 2823 NEXT //6 | |
| 2824 "psubw %%mm5, %%mm0 \n\t" | |
| 2825 "psubw %%mm6, %%mm1 \n\t" | |
| 2826 "movq %%mm0, 64(%3) \n\t" | |
| 2827 "movq %%mm1, 72(%3) \n\t" | |
| 2828 | |
| 2829 "movq %%mm7, %%mm6 \n\t" | |
| 2830 "punpckhbw %%mm4, %%mm7 \n\t" | |
| 2831 "punpcklbw %%mm4, %%mm6 \n\t" | |
| 2832 | |
| 2833 NEXT //7 | |
| 2834 "movl %4, %0 \n\t" | |
| 2835 "addl %1, %0 \n\t" | |
| 2836 PREV //0 | |
| 2837 "movq %%mm0, 80(%3) \n\t" | |
| 2838 "movq %%mm1, 88(%3) \n\t" | |
| 2839 | |
| 2840 PREV //1 | |
| 2841 "paddw %%mm6, %%mm0 \n\t" | |
| 2842 "paddw %%mm7, %%mm1 \n\t" | |
| 2843 "movq %%mm0, 96(%3) \n\t" | |
| 2844 "movq %%mm1, 104(%3) \n\t" | |
| 2845 | |
| 2846 PREV //2 | |
| 2847 "paddw %%mm6, %%mm0 \n\t" | |
| 2848 "paddw %%mm7, %%mm1 \n\t" | |
| 2849 "movq %%mm0, 112(%3) \n\t" | |
| 2850 "movq %%mm1, 120(%3) \n\t" | |
| 2851 | |
| 2852 PREV //3 | |
| 2853 "paddw %%mm6, %%mm0 \n\t" | |
| 2854 "paddw %%mm7, %%mm1 \n\t" | |
| 2855 "movq %%mm0, 128(%3) \n\t" | |
| 2856 "movq %%mm1, 136(%3) \n\t" | |
| 2857 | |
| 2858 PREV //4 | |
| 2859 "paddw %%mm6, %%mm0 \n\t" | |
| 2860 "paddw %%mm7, %%mm1 \n\t" | |
| 2861 "movq %%mm0, 144(%3) \n\t" | |
| 2862 "movq %%mm1, 152(%3) \n\t" | |
| 2863 | |
| 2864 "movl %4, %0 \n\t" //FIXME | |
| 2865 | |
| 2866 : "+&r"(src) | |
| 2867 : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src) | |
| 2868 ); | |
| 2869 | |
| 2870 src+= step; // src points to begin of the 8x8 Block | |
| 2871 | |
| 2872 asm volatile( | |
| 2873 "movq %4, %%mm6 \n\t" | |
| 2874 "pcmpeqb %%mm5, %%mm5 \n\t" | |
| 2875 "pxor %%mm6, %%mm5 \n\t" | |
| 2876 "pxor %%mm7, %%mm7 \n\t" | |
| 2877 | |
| 2878 "1: \n\t" | |
| 2879 "movq (%1), %%mm0 \n\t" | |
| 2880 "movq 8(%1), %%mm1 \n\t" | |
| 2881 "paddw 32(%1), %%mm0 \n\t" | |
| 2882 "paddw 40(%1), %%mm1 \n\t" | |
| 2883 "movq (%0, %3), %%mm2 \n\t" | |
| 2884 "movq %%mm2, %%mm3 \n\t" | |
| 2885 "movq %%mm2, %%mm4 \n\t" | |
| 2886 "punpcklbw %%mm7, %%mm2 \n\t" | |
| 2887 "punpckhbw %%mm7, %%mm3 \n\t" | |
| 2888 "paddw %%mm2, %%mm0 \n\t" | |
| 2889 "paddw %%mm3, %%mm1 \n\t" | |
| 2890 "paddw %%mm2, %%mm0 \n\t" | |
| 2891 "paddw %%mm3, %%mm1 \n\t" | |
| 2892 "psrlw $4, %%mm0 \n\t" | |
| 2893 "psrlw $4, %%mm1 \n\t" | |
| 2894 "packuswb %%mm1, %%mm0 \n\t" | |
| 2895 "pand %%mm6, %%mm0 \n\t" | |
| 2896 "pand %%mm5, %%mm4 \n\t" | |
| 2897 "por %%mm4, %%mm0 \n\t" | |
| 2898 "movq %%mm0, (%0, %3) \n\t" | |
| 2899 "addl $16, %1 \n\t" | |
| 2900 "addl %2, %0 \n\t" | |
| 2901 " js 1b \n\t" | |
| 2902 | |
| 2903 : "+r"(offset), "+r"(temp_sums) | |
| 2904 : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask) | |
| 2905 ); | |
| 2906 }else | |
| 2907 src+= step; // src points to begin of the 8x8 Block | |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2908 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2909 if(eq_mask != -1LL){ |
| 2040 | 2910 uint8_t *temp_src= src; |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2911 asm volatile( |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2912 "pxor %%mm7, %%mm7 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2913 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2914 "andl $0xFFFFFFF8, %%ecx \n\t" // align |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2915 // 0 1 2 3 4 5 6 7 8 9 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2916 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2917 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2918 "movq (%0), %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2919 "movq %%mm0, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2920 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2921 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2922 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2923 "movq (%0, %1), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2924 "leal (%0, %1, 2), %%eax \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2925 "movq %%mm2, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2926 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2927 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2928 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2929 "movq (%%eax), %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2930 "movq %%mm4, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2931 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2932 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2933 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2934 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2935 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2936 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2937 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2938 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2939 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2940 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2941 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2942 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2943 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2944 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2945 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2946 "movq (%%eax, %1), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2947 "movq %%mm2, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2948 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2949 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2950 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2951 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2952 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2953 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2954 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2955 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2956 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2957 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2958 "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2959 "movq %%mm0, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2960 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2961 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2962 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2963 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2964 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2965 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2966 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2967 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2968 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2969 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2970 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2971 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2972 "leal (%%eax, %1), %0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2973 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2974 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2975 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2976 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2977 //50 opcodes so far |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2978 "movq (%0, %1, 2), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2979 "movq %%mm2, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2980 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2981 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2982 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2983 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2984 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2985 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2986 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2987 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2988 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2989 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2990 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2991 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2992 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2993 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2994 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2995 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2996 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2997 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2998 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
2999 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3000 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3001 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3002 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3003 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3004 "movq (%0, %1, 4), %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3005 "movq %%mm2, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3006 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3007 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3008 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3009 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3010 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3011 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3012 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3013 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3014 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3015 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3016 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3017 #ifdef HAVE_MMX2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3018 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3019 "psubw %%mm0, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3020 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3021 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3022 "psubw %%mm1, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3023 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3024 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3025 "psubw %%mm2, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3026 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3027 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3028 "psubw %%mm3, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3029 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3030 #else |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3031 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3032 "pcmpgtw %%mm0, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3033 "pxor %%mm6, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3034 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3035 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3036 "pcmpgtw %%mm1, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3037 "pxor %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3038 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3039 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3040 "pcmpgtw %%mm2, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3041 "pxor %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3042 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3043 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3044 "pcmpgtw %%mm3, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3045 "pxor %%mm6, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3046 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3047 #endif |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3048 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3049 #ifdef HAVE_MMX2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3050 "pminsw %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3051 "pminsw %%mm3, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3052 #else |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3053 "movq %%mm0, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3054 "psubusw %%mm2, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3055 "psubw %%mm6, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3056 "movq %%mm1, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3057 "psubusw %%mm3, %%mm6 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3058 "psubw %%mm6, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3059 #endif |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3060 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3061 "movd %2, %%mm2 \n\t" // QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3062 "punpcklbw %%mm7, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3063 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3064 "movq %%mm7, %%mm6 \n\t" // 0 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3065 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3066 "pxor %%mm6, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3067 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3068 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3069 "pxor %%mm7, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3070 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3071 // 100 opcodes |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3072 "psllw $3, %%mm2 \n\t" // 8QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3073 "movq %%mm2, %%mm3 \n\t" // 8QP |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3074 "pcmpgtw %%mm4, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3075 "pcmpgtw %%mm5, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3076 "pand %%mm2, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3077 "pand %%mm3, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3078 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3079 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3080 "psubusw %%mm0, %%mm4 \n\t" // hd |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3081 "psubusw %%mm1, %%mm5 \n\t" // ld |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3082 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3083 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3084 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3085 "pmullw %%mm2, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3086 "pmullw %%mm2, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3087 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3088 "paddw %%mm2, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3089 "paddw %%mm2, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3090 "psrlw $6, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3091 "psrlw $6, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3092 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3093 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3094 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3095 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3096 "pxor %%mm2, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3097 "pxor %%mm3, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3098 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3099 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3100 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3101 "pxor %%mm2, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3102 "pxor %%mm3, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3103 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3104 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3105 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3106 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3107 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3108 "pxor %%mm6, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3109 "pxor %%mm7, %%mm3 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3110 "pand %%mm2, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3111 "pand %%mm3, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3112 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3113 #ifdef HAVE_MMX2 |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3114 "pminsw %%mm0, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3115 "pminsw %%mm1, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3116 #else |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3117 "movq %%mm4, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3118 "psubusw %%mm0, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3119 "psubw %%mm2, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3120 "movq %%mm5, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3121 "psubusw %%mm1, %%mm2 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3122 "psubw %%mm2, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3123 #endif |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3124 "pxor %%mm6, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3125 "pxor %%mm7, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3126 "psubw %%mm6, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3127 "psubw %%mm7, %%mm5 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3128 "packsswb %%mm5, %%mm4 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3129 "movq %3, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3130 "pandn %%mm4, %%mm1 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3131 "movq (%0), %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3132 "paddb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3133 "movq %%mm0, (%0) \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3134 "movq (%0, %1), %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3135 "psubb %%mm1, %%mm0 \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3136 "movq %%mm0, (%0, %1) \n\t" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3137 |
| 2040 | 3138 : "+r" (temp_src) |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3139 : "r" (step), "m" (c->pQPb), "m"(eq_mask) |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3140 : "%eax", "%ecx" |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3141 ); |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3142 } |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3143 /*if(step==16){ |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3144 STOP_TIMER("step16") |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3145 }else{ |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3146 STOP_TIMER("stepX") |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3147 }*/ |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3148 } |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3149 #endif //HAVE_MMX |
|
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3150 |
| 169 | 3151 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 3152 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
| 96 | 3153 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3154 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3155 * Copies a block from src to dst and fixes the blacklevel |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3156 * levelFix == 0 -> don't touch the brightness & contrast |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3157 */ |
|
634
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3158 #undef SCALED_CPY |
|
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
3159 |
| 169 | 3160 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 787 | 3161 int levelFix, int64_t *packedOffsetAndScale) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3162 { |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3163 #ifndef HAVE_MMX |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3164 int i; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3165 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3166 if(levelFix) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3167 { |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3168 #ifdef HAVE_MMX |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3169 asm volatile( |
| 787 | 3170 "movq (%%eax), %%mm2 \n\t" // packedYOffset |
| 3171 "movq 8(%%eax), %%mm3 \n\t" // packedYScale | |
| 3172 "leal (%2,%4), %%eax \n\t" | |
| 3173 "leal (%3,%5), %%edx \n\t" | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3174 "pxor %%mm4, %%mm4 \n\t" |
| 173 | 3175 #ifdef HAVE_MMX2 |
| 3176 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
| 3177 "movq " #src1 ", %%mm0 \n\t"\ | |
| 3178 "movq " #src1 ", %%mm5 \n\t"\ | |
| 3179 "movq " #src2 ", %%mm1 \n\t"\ | |
| 3180 "movq " #src2 ", %%mm6 \n\t"\ | |
| 3181 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
| 3182 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
| 3183 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
| 3184 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
| 3185 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
| 3186 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 3187 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
| 3188 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
| 3189 "psubw %%mm2, %%mm0 \n\t"\ | |
| 3190 "psubw %%mm2, %%mm5 \n\t"\ | |
| 3191 "psubw %%mm2, %%mm1 \n\t"\ | |
| 3192 "psubw %%mm2, %%mm6 \n\t"\ | |
| 3193 "packuswb %%mm5, %%mm0 \n\t"\ | |
| 3194 "packuswb %%mm6, %%mm1 \n\t"\ | |
| 3195 "movq %%mm0, " #dst1 " \n\t"\ | |
| 3196 "movq %%mm1, " #dst2 " \n\t"\ | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3197 |
| 173 | 3198 #else //HAVE_MMX2 |
| 166 | 3199 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
| 3200 "movq " #src1 ", %%mm0 \n\t"\ | |
| 3201 "movq " #src1 ", %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3202 "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3203 "punpckhbw %%mm4, %%mm5 \n\t"\ |
| 117 | 3204 "psubw %%mm2, %%mm0 \n\t"\ |
| 3205 "psubw %%mm2, %%mm5 \n\t"\ | |
| 166 | 3206 "movq " #src2 ", %%mm1 \n\t"\ |
| 117 | 3207 "psllw $6, %%mm0 \n\t"\ |
| 3208 "psllw $6, %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3209 "pmulhw %%mm3, %%mm0 \n\t"\ |
| 166 | 3210 "movq " #src2 ", %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3211 "pmulhw %%mm3, %%mm5 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3212 "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3213 "punpckhbw %%mm4, %%mm6 \n\t"\ |
| 117 | 3214 "psubw %%mm2, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3215 "psubw %%mm2, %%mm6 \n\t"\ |
| 117 | 3216 "psllw $6, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3217 "psllw $6, %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
3218 "pmulhw %%mm3, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3219 "pmulhw %%mm3, %%mm6 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3220 "packuswb %%mm5, %%mm0 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3221 "packuswb %%mm6, %%mm1 \n\t"\ |
| 166 | 3222 "movq %%mm0, " #dst1 " \n\t"\ |
| 3223 "movq %%mm1, " #dst2 " \n\t"\ | |
| 3224 | |
| 173 | 3225 #endif //!HAVE_MMX2 |
| 3226 | |
| 787 | 3227 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
| 3228 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) | |
| 3229 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) | |
| 3230 "leal (%%eax,%4,4), %%eax \n\t" | |
| 3231 "leal (%%edx,%5,4), %%edx \n\t" | |
| 3232 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) | |
| 166 | 3233 |
| 3234 | |
| 787 | 3235 : "=&a" (packedOffsetAndScale) |
| 3236 : "0" (packedOffsetAndScale), | |
| 3237 "r"(src), | |
| 166 | 3238 "r"(dst), |
| 3239 "r" (srcStride), | |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
3240 "r" (dstStride) |
| 787 | 3241 : "%edx" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3242 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3243 #else |
| 164 | 3244 for(i=0; i<8; i++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3245 memcpy( &(dst[dstStride*i]), |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3246 &(src[srcStride*i]), BLOCK_SIZE); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3247 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3248 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3249 else |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3250 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3251 #ifdef HAVE_MMX |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3252 asm volatile( |
| 166 | 3253 "leal (%0,%2), %%eax \n\t" |
| 787 | 3254 "leal (%1,%3), %%edx \n\t" |
| 166 | 3255 |
| 3256 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
| 3257 "movq " #src1 ", %%mm0 \n\t"\ | |
| 3258 "movq " #src2 ", %%mm1 \n\t"\ | |
| 3259 "movq %%mm0, " #dst1 " \n\t"\ | |
| 3260 "movq %%mm1, " #dst2 " \n\t"\ | |
| 3261 | |
| 3262 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 787 | 3263 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
| 3264 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) | |
| 166 | 3265 "leal (%%eax,%2,4), %%eax \n\t" |
| 787 | 3266 "leal (%%edx,%3,4), %%edx \n\t" |
| 3267 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) | |
| 166 | 3268 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3269 : : "r" (src), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3270 "r" (dst), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3271 "r" (srcStride), |
| 164 | 3272 "r" (dstStride) |
| 787 | 3273 : "%eax", "%edx" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3274 ); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3275 #else |
| 164 | 3276 for(i=0; i<8; i++) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3277 memcpy( &(dst[dstStride*i]), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3278 &(src[srcStride*i]), BLOCK_SIZE); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3279 #endif |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3280 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3281 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3282 |
| 224 | 3283 /**
| 3284 * Duplicates the given 8 src pixels 3 times upward, i.e. into the 3 lines directly above src (stride = byte distance between lines) |
| 3285 */ |
| 3286 static inline void RENAME(duplicate)(uint8_t src[], int stride) |
| 3287 { |
| 3288 #ifdef HAVE_MMX |
| 3289 asm volatile( |
| 3290 "movq (%0), %%mm0 \n\t" /* mm0 = the 8 bytes at src */ |
| 3291 "addl %1, %0 \n\t" /* operand %1 is -stride, so src now points one line up */ |
| 3292 "movq %%mm0, (%0) \n\t" /* store to original src - 1*stride */ |
| 3293 "movq %%mm0, (%0, %1) \n\t" /* store to original src - 2*stride */ |
| 3294 "movq %%mm0, (%0, %1, 2) \n\t" /* store to original src - 3*stride */ |
| 3295 : "+r" (src) |
| 3296 : "r" (-stride) |
| 3297 ); |
| 3298 #else |
| 3299 int i; |
| 3300 uint8_t *p=src; |
| 3301 for(i=0; i<3; i++) /* plain C fallback: same 3 copies as the MMX path */ |
| 3302 { |
| 3303 p-= stride; /* step one line up */ |
| 3304 memcpy(p, src, 8); /* copy the 8 source bytes into that line */ |
| 3305 } |
| 3306 #endif |
| 3307 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3308 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3309 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3310 * Filters array of bytes (Y or U or V values) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3311 */ |
| 169 | 3312 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 3313 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3314 { |
| 787 | 3315 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3316 int x,y; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3317 #ifdef COMPILE_TIME_MODE |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3318 const int mode= COMPILE_TIME_MODE; |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3319 #else |
| 787 | 3320 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
3321 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3322 int black=0, white=255; // blackest black and whitest white in the picture |
| 223 | 3323 int QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3324 |
|
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3325 int copyAhead; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3326 #ifdef HAVE_MMX |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3327 int i; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
3328 #endif |
| 164 | 3329 |
| 957 | 3330 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
| 3331 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | |
| 3332 | |
| 787 | 3333 //FIXME remove |
| 3334 uint64_t * const yHistogram= c.yHistogram; | |
| 3335 uint8_t * const tempSrc= c.tempSrc; | |
| 3336 uint8_t * const tempDst= c.tempDst; | |
|
2031
4225c131a2eb
warning fixes by (Michael Roitzsch <mroi at users dot sourceforge dot net>)
michael
parents:
1724
diff
changeset
|
3337 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
3338 |
| 158 | 3339 #ifdef HAVE_MMX |
| 1724 | 3340 for(i=0; i<57; i++){ |
| 791 | 3341 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
| 3342 int threshold= offset*2 + 1; | |
| 3343 c.mmxDcOffset[i]= 0x7F - offset; | |
| 3344 c.mmxDcThreshold[i]= 0x7F - threshold; | |
| 3345 c.mmxDcOffset[i]*= 0x0101010101010101LL; | |
| 3346 c.mmxDcThreshold[i]*= 0x0101010101010101LL; | |
| 3347 } | |
| 158 | 3348 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3349 |
| 164 | 3350 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
| 787 | 3351 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
| 1157 | 3352 || (mode & FFMPEG_DEINT_FILTER) |
| 3353 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; | |
| 164 | 3354 else if( (mode & V_DEBLOCK) |
| 3355 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
|
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3356 || (mode & MEDIAN_DEINT_FILTER) |
|
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3357 || (mode & V_A_DEBLOCK)) copyAhead=13; |
| 164 | 3358 else if(mode & V_X1_FILTER) copyAhead=11; |
| 787 | 3359 // else if(mode & V_RK1_FILTER) copyAhead=10; |
| 164 | 3360 else if(mode & DERING) copyAhead=9; |
| 3361 else copyAhead=8; | |
| 3362 | |
| 3363 copyAhead-= 8; | |
| 3364 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3365 if(!isColor) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3366 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3367 uint64_t sum= 0; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3368 int i; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3369 uint64_t maxClipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3370 uint64_t clipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3371 double scale; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3372 |
| 787 | 3373 c.frameNum++; |
| 3374 // first frame is fscked so we ignore it | |
| 3375 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3376 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3377 for(i=0; i<256; i++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3378 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3379 sum+= yHistogram[i]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3380 // printf("%d ", yHistogram[i]); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3381 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3382 // printf("\n\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3383 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3384 /* we allways get a completly black picture first */ |
| 793 | 3385 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3386 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3387 clipped= sum; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3388 for(black=255; black>0; black--) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3389 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3390 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3391 clipped-= yHistogram[black]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3392 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3393 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3394 clipped= sum; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3395 for(white=0; white<256; white++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3396 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3397 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3398 clipped-= yHistogram[white]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3399 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3400 |
| 787 | 3401 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
| 173 | 3402 |
| 3403 #ifdef HAVE_MMX2 | |
| 787 | 3404 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
| 3405 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 3406 #else |
| 787 | 3407 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
| 3408 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 3409 #endif |
| 3410 | |
| 787 | 3411 c.packedYOffset|= c.packedYOffset<<32; |
| 3412 c.packedYOffset|= c.packedYOffset<<16; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3413 |
| 787 | 3414 c.packedYScale|= c.packedYScale<<32; |
| 3415 c.packedYScale|= c.packedYScale<<16; | |
| 223 | 3416 |
| 3417 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
| 3418 else QPCorrecture= 256*256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3419 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3420 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3421 { |
| 787 | 3422 c.packedYScale= 0x0100010001000100LL; |
| 3423 c.packedYOffset= 0; | |
| 223 | 3424 QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3425 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3426 |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3427 /* copy & deinterlace first row of blocks */ |
| 142 | 3428 y=-BLOCK_SIZE; |
| 3429 { | |
| 3430 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 224 | 3431 uint8_t *dstBlock= tempDst + dstStride; |
| 142 | 3432 |
| 3433 // From this point on it is guranteed that we can read and write 16 lines downward | |
| 3434 // finish 1 block before the next otherwise we´ll might have a problem | |
| 3435 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | |
| 3436 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 3437 { | |
| 3438 | |
| 3439 #ifdef HAVE_MMX2 | |
| 3440 /* | |
| 3441 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3442 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3443 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3444 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3445 */ | |
| 3446 | |
| 3447 asm( | |
| 3448 "movl %4, %%eax \n\t" | |
| 3449 "shrl $2, %%eax \n\t" | |
| 3450 "andl $6, %%eax \n\t" | |
| 164 | 3451 "addl %5, %%eax \n\t" |
| 787 | 3452 "movl %%eax, %%edx \n\t" |
| 142 | 3453 "imul %1, %%eax \n\t" |
| 787 | 3454 "imul %3, %%edx \n\t" |
| 142 | 3455 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3456 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 3457 "addl %1, %%eax \n\t" |
| 787 | 3458 "addl %3, %%edx \n\t" |
| 142 | 3459 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3460 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 3461 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 3462 "m" (x), "m" (copyAhead) |
| 787 | 3463 : "%eax", "%edx" |
| 142 | 3464 ); |
| 3465 | |
| 3466 #elif defined(HAVE_3DNOW) | |
| 3467 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 3468 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3469 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3470 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3471 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3472 */ | |
| 3473 #endif | |
| 3474 | |
| 224 | 3475 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
| 787 | 3476 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
| 224 | 3477 |
| 3478 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
| 142 | 3479 |
| 3480 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 169 | 3481 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 142 | 3482 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 1581 | 3483 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
| 142 | 3484 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 3485 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 142 | 3486 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 3487 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 3488 else if(mode & FFMPEG_DEINT_FILTER) |
| 3489 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 3490 else if(mode & LOWPASS5_DEINT_FILTER) |
| 3491 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 142 | 3492 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 3493 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
| 142 | 3494 */ |
| 3495 dstBlock+=8; | |
| 3496 srcBlock+=8; | |
| 3497 } | |
| 941 | 3498 if(width==dstStride) |
| 3499 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); | |
| 3500 else | |
| 3501 { | |
| 943 | 3502 int i; |
| 941 | 3503 for(i=0; i<copyAhead; i++) |
| 3504 { | |
| 3505 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
| 3506 } | |
| 3507 } | |
| 142 | 3508 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3509 |
| 787 | 3510 //printf("\n"); |
| 111 | 3511 for(y=0; y<height; y+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3512 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3513 //1% speedup if these are here instead of the inner loop |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3514 uint8_t *srcBlock= &(src[y*srcStride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3515 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 169 | 3516 #ifdef HAVE_MMX |
| 787 | 3517 uint8_t *tempBlock1= c.tempBlocks; |
| 3518 uint8_t *tempBlock2= c.tempBlocks + 8; | |
| 169 | 3519 #endif |
| 957 | 3520 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
| 1196 | 3521 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*QPStride]; |
| 156 | 3522 int QP=0; |
| 130 | 3523 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 3524 if not than use a temporary buffer */ | |
| 111 | 3525 if(y+15 >= height) |
| 3526 { | |
| 156 | 3527 int i; |
| 164 | 3528 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
| 111 | 3529 blockcopy to dst later */ |
| 164 | 3530 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
| 3531 srcStride*MAX(height-y-copyAhead, 0) ); | |
| 3532 | |
| 3533 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ | |
| 3534 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
| 156 | 3535 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
| 3536 | |
| 164 | 3537 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
| 3538 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
| 3539 | |
| 3540 /* duplicate last line of dst to fill the void upto line (copyAhead) */ | |
| 3541 for(i=height-y+1; i<=copyAhead; i++) | |
| 156 | 3542 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
| 3543 | |
| 130 | 3544 dstBlock= tempDst + dstStride; |
| 111 | 3545 srcBlock= tempSrc; |
| 3546 } | |
| 787 | 3547 //printf("\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3548 |
| 112 | 3549 // From this point on it is guranteed that we can read and write 16 lines downward |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3550 // finish 1 block before the next otherwise we´ll might have a problem |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3551 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3552 for(x=0; x<width; x+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3553 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3554 const int stride= dstStride; |
| 169 | 3555 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3556 uint8_t *tmpXchg; |
| 169 | 3557 #endif |
| 791 | 3558 if(isColor) |
| 121 | 3559 { |
| 957 | 3560 QP= QPptr[x>>qpHShift]; |
| 3561 c.nonBQP= nonBQPptr[x>>qpHShift]; | |
| 791 | 3562 } |
| 3563 else | |
| 3564 { | |
| 3565 QP= QPptr[x>>4]; | |
| 223 | 3566 QP= (QP* QPCorrecture + 256*128)>>16; |
| 791 | 3567 c.nonBQP= nonBQPptr[x>>4]; |
| 3568 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3569 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
| 121 | 3570 } |
| 787 | 3571 c.QP= QP; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3572 #ifdef HAVE_MMX |
| 111 | 3573 asm volatile( |
| 787 | 3574 "movd %1, %%mm7 \n\t" |
| 111 | 3575 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 3576 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 3577 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 787 | 3578 "movq %%mm7, %0 \n\t" |
| 3579 : "=m" (c.pQPb) | |
| 3580 : "r" (QP) | |
| 111 | 3581 ); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3582 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3583 |
| 96 | 3584 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3585 #ifdef HAVE_MMX2 |
| 126 | 3586 /* |
| 3587 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3588 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3589 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3590 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3591 */ | |
| 3592 | |
| 3593 asm( | |
| 3594 "movl %4, %%eax \n\t" | |
| 3595 "shrl $2, %%eax \n\t" | |
| 3596 "andl $6, %%eax \n\t" | |
| 164 | 3597 "addl %5, %%eax \n\t" |
| 787 | 3598 "movl %%eax, %%edx \n\t" |
| 126 | 3599 "imul %1, %%eax \n\t" |
| 787 | 3600 "imul %3, %%edx \n\t" |
| 126 | 3601 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3602 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3603 "addl %1, %%eax \n\t" |
| 787 | 3604 "addl %3, %%edx \n\t" |
| 126 | 3605 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3606 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3607 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 3608 "m" (x), "m" (copyAhead) |
| 787 | 3609 : "%eax", "%edx" |
| 126 | 3610 ); |
| 3611 | |
| 96 | 3612 #elif defined(HAVE_3DNOW) |
| 3613 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
| 111 | 3614 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 3615 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3616 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3617 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 96 | 3618 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3619 #endif |
| 111 | 3620 |
| 169 | 3621 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
| 787 | 3622 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3623 |
| 111 | 3624 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 169 | 3625 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 111 | 3626 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 1581 | 3627 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
| 111 | 3628 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 3629 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 111 | 3630 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 3631 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 3632 else if(mode & FFMPEG_DEINT_FILTER) |
| 3633 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 3634 else if(mode & LOWPASS5_DEINT_FILTER) |
| 3635 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 111 | 3636 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 3637 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
3638 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3639 |
| 111 | 3640 /* only deblock if we have 2 blocks */ |
| 3641 if(y + 8 < height) | |
| 3642 { | |
| 787 | 3643 if(mode & V_X1_FILTER) |
| 3644 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3645 else if(mode & V_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3646 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3647 const int t= RENAME(vertClassify)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3648 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3649 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3650 RENAME(doVertLowPass)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3651 else if(t==2) |
| 787 | 3652 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
|
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3653 }else if(mode & V_A_DEBLOCK){ |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3654 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3655 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3656 } |
| 130 | 3657 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3658 #ifdef HAVE_MMX |
| 169 | 3659 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3660 #endif |
| 111 | 3661 /* check if we have a previous block to deblock it with dstBlock */ |
| 112 | 3662 if(x - 8 >= 0) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3663 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3664 #ifdef HAVE_MMX |
| 787 | 3665 if(mode & H_X1_FILTER) |
| 3666 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3667 else if(mode & H_DEBLOCK) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3668 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3669 //START_TIMER |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3670 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3671 //STOP_TIMER("dc & minmax") |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3672 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3673 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3674 else if(t==2) |
| 787 | 3675 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
|
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3676 }else if(mode & H_A_DEBLOCK){ |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3677 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3678 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3679 |
| 169 | 3680 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3681 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3682 #else |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3683 if(mode & H_X1_FILTER) |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3684 horizX1Filter(dstBlock-4, stride, QP); |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3685 else if(mode & H_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3686 { |
| 2043 | 3687 #ifdef HAVE_ALTIVEC |
| 3688 unsigned char __attribute__ ((aligned(16))) tempBlock[272]; | |
| 3689 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); | |
| 3690 | |
| 3691 const int t=vertClassify_altivec(tempBlock-48, 16, &c); | |
| 3692 if(t==1) { | |
| 3693 doVertLowPass_altivec(tempBlock-48, 16, &c); | |
| 3694 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | |
| 3695 } | |
| 3696 else if(t==2) { | |
| 3697 doVertDefFilter_altivec(tempBlock-48, 16, &c); | |
| 3698 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); | |
| 3699 } | |
| 3700 #else | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3701 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3702 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3703 if(t==1) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3704 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3705 else if(t==2) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
2031
diff
changeset
|
3706 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); |
| 2043 | 3707 #endif |
|
2037
98d8283534bb
accurate/slow (per line instead of per block) deblock filter spport which is identical to what is recommanded in the mpeg4 spec
michael
parents:
2036
diff
changeset
|
3708 }else if(mode & H_A_DEBLOCK){ |
|
2039
f25e485a7850
mmx optimized version of the per line/accurate deblock filter
michael
parents:
2038
diff
changeset
|
3709 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3710 } |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3711 #endif |
| 130 | 3712 if(mode & DERING) |
| 3713 { | |
| 3714 //FIXME filter first line | |
| 787 | 3715 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
| 130 | 3716 } |
| 156 | 3717 |
| 3718 if(mode & TEMP_NOISE_FILTER) | |
| 3719 { | |
| 169 | 3720 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
| 787 | 3721 c.tempBlured[isColor] + y*dstStride + x, |
| 3722 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3723 c.ppMode.maxTmpNoise); | |
| 156 | 3724 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3725 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3726 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3727 dstBlock+=8; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3728 srcBlock+=8; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3729 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3730 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3731 tmpXchg= tempBlock1; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3732 tempBlock1= tempBlock2; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3733 tempBlock2 = tmpXchg; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3734 #endif |
| 111 | 3735 } |
| 3736 | |
| 156 | 3737 if(mode & DERING) |
| 3738 { | |
| 787 | 3739 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
| 156 | 3740 } |
| 3741 | |
| 3742 if((mode & TEMP_NOISE_FILTER)) | |
| 3743 { | |
| 169 | 3744 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
| 787 | 3745 c.tempBlured[isColor] + y*dstStride + x, |
| 3746 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3747 c.ppMode.maxTmpNoise); | |
| 156 | 3748 } |
| 3749 | |
| 142 | 3750 /* did we use a tmp buffer for the last lines*/ |
| 112 | 3751 if(y+15 >= height) |
| 111 | 3752 { |
| 3753 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 941 | 3754 if(width==dstStride) |
| 3755 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y)); | |
| 3756 else | |
| 3757 { | |
| 944 | 3758 int i; |
| 941 | 3759 for(i=0; i<height-y; i++) |
| 3760 { | |
| 3761 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
| 3762 } | |
| 3763 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3764 } |
| 163 | 3765 /* |
| 3766 for(x=0; x<width; x+=32) | |
| 3767 { | |
| 164 | 3768 volatile int i; |
| 163 | 3769 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
| 3770 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
| 164 | 3771 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 3772 // + dstBlock[x +13*dstStride] | |
| 3773 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
| 3774 }*/ | |
| 3775 } | |
| 96 | 3776 #ifdef HAVE_3DNOW |
| 3777 asm volatile("femms"); | |
| 3778 #elif defined (HAVE_MMX) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3779 asm volatile("emms"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3780 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3781 |
| 163 | 3782 #ifdef DEBUG_BRIGHTNESS |
| 3783 if(!isColor) | |
| 3784 { | |
| 3785 int max=1; | |
| 3786 int i; | |
| 3787 for(i=0; i<256; i++) | |
| 3788 if(yHistogram[i] > max) max=yHistogram[i]; | |
| 3789 | |
| 3790 for(i=1; i<256; i++) | |
| 3791 { | |
| 3792 int x; | |
| 3793 int start=yHistogram[i-1]/(max/256+1); | |
| 3794 int end=yHistogram[i]/(max/256+1); | |
| 3795 int inc= end > start ? 1 : -1; | |
| 3796 for(x=start; x!=end+inc; x+=inc) | |
| 3797 dst[ i*dstStride + x]+=128; | |
| 3798 } | |
| 3799 | |
| 3800 for(i=0; i<100; i+=2) | |
| 3801 { | |
| 3802 dst[ (white)*dstStride + i]+=128; | |
| 3803 dst[ (black)*dstStride + i]+=128; | |
| 3804 } | |
| 3805 | |
| 3806 } | |
| 3807 #endif | |
| 3808 | |
| 787 | 3809 *c2= c; //copy local context back |
| 3810 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3811 } |
