Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 1352:e8ff4783f188 libavcodec
1) remove TBL support in PPC performance. It's much more useful to use the
PMCs, and with Apple's CHUD it's fairly easy too. No reason to keep useless
code around
2) make the PPC perf stuff a configure option
3) make put_pixels16_altivec a bit faster by unrolling the loop by 4
patch by (Romain Dolbeau <dolbeau at irisa dot fr>)
| author | michaelni |
|---|---|
| date | Wed, 09 Jul 2003 20:18:13 +0000 |
| parents | cf65e69400ec |
| children | d2fc92d02bf7 |
| rev | line source |
|---|---|
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
| 223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
| 1109 | 19 /** |
| 20 * @file postprocess_template.c | |
| 21 * mmx/mmx2/3dnow postprocess code. | |
| 22 */ | |
| 23 | |
| 24 | |
| 169 | 25 #undef PAVGB |
| 26 #undef PMINUB | |
| 27 #undef PMAXUB | |
| 104 | 28 |
| 29 #ifdef HAVE_MMX2 | |
| 30 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
| 31 #elif defined (HAVE_3DNOW) | |
| 32 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
| 33 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
34 |
| 134 | 35 #ifdef HAVE_MMX2 |
| 36 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
| 37 #elif defined (HAVE_MMX) | |
| 38 #define PMINUB(b,a,t) \ | |
| 39 "movq " #a ", " #t " \n\t"\ | |
| 40 "psubusb " #b ", " #t " \n\t"\ | |
| 41 "psubb " #t ", " #a " \n\t" | |
| 42 #endif | |
| 43 | |
| 44 #ifdef HAVE_MMX2 | |
| 45 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
| 46 #elif defined (HAVE_MMX) | |
| 47 #define PMAXUB(a,b) \ | |
| 48 "psubusb " #a ", " #b " \n\t"\ | |
| 49 "paddb " #a ", " #b " \n\t" | |
| 50 #endif | |
| 51 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
52 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
| 787 | 53 #ifdef HAVE_MMX |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
54 /** |
| 111 | 55 * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
56 */ |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
57 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
58 int numEq= 0, dcOk; |
| 111 | 59 src+= stride*4; // src points to begin of the 8x8 Block |
| 119 | 60 asm volatile( |
| 1331 | 61 "movq %0, %%mm7 \n\t" |
| 62 "movq %1, %%mm6 \n\t" | |
| 63 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
| 64 ); | |
| 65 | |
| 66 asm volatile( | |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
67 "leal (%2, %3), %%eax \n\t" |
| 119 | 68 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 69 // %2 eax eax+%3 eax+2%3 %2+4%3 eax* eax*+%3 eax*+2%3 %2+8%3 eax*+4%3 (eax* = eax after the second leal) |
| 791 | 70 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
71 "movq (%2), %%mm0 \n\t" |
| 119 | 72 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
73 "movq %%mm0, %%mm3 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
74 "movq %%mm0, %%mm4 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
75 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
76 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
77 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 "paddb %%mm7, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
79 "pcmpgtb %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
80 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
81 "movq (%%eax,%3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
82 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
83 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
84 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
85 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
86 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
87 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
88 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
89 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
90 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
91 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
92 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
93 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
94 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
95 "paddb %%mm2, %%mm0 \n\t" |
| 787 | 96 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
97 "leal (%%eax, %3, 4), %%eax \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
98 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
99 "movq (%2, %3, 4), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
100 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
101 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
103 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
104 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
105 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
106 |
| 787 | 107 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
108 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
109 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
110 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
111 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
112 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
113 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
115 "movq (%%eax, %3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
116 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
117 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
118 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
119 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
120 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
121 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
122 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
123 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
124 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
125 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
126 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
127 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
129 "paddb %%mm2, %%mm0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
130 "psubusb %%mm3, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
131 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
132 " \n\t" |
| 167 | 133 #ifdef HAVE_MMX2 |
| 134 "pxor %%mm7, %%mm7 \n\t" | |
| 135 "psadbw %%mm7, %%mm0 \n\t" | |
| 136 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
137 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
138 "psrlw $8, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
139 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 "psrlq $16, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
143 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 "psrlq $32, %%mm0 \n\t" |
| 167 | 145 "paddb %%mm1, %%mm0 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
146 #endif |
| 1331 | 147 "movq %4, %%mm7 \n\t" // QP,..., QP |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
148 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
149 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
150 "packssdw %%mm4, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
151 "movd %%mm0, %0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
152 "movd %%mm4, %1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
153 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
154 : "=r" (numEq), "=r" (dcOk) |
| 1331 | 155 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 787 | 156 : "%eax" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
157 ); |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
158 |
| 167 | 159 numEq= (-numEq) &0xFF; |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
160 if(numEq > c->ppMode.flatnessThreshold){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
161 if(dcOk) return 0; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
162 else return 1; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
163 }else{ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
164 return 2; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
165 } |
| 787 | 166 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
167 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
168 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
169 /** |
| 111 | 170 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 107 | 171 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
172 */ |
| 787 | 173 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
174 { |
| 96 | 175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 176 src+= stride*3; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
177 asm volatile( //"movv %0 %1 %2\n\t" |
| 787 | 178 "movq %2, %%mm0 \n\t" // QP,..., QP |
| 179 "pxor %%mm4, %%mm4 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
180 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
181 "movq (%0), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
182 "movq (%0, %1), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
183 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
184 "movq %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
185 "psubusb %%mm6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
186 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
187 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
188 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 189 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
190 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 "pand %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
192 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
193 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
195 "movq (%0, %1, 8), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
196 "leal (%0, %1, 4), %%eax \n\t" |
| 787 | 197 "leal (%0, %1, 8), %%ecx \n\t" |
| 198 "subl %1, %%ecx \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
199 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 "movq (%0, %1, 8), %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
201 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
202 "movq %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
203 "psubusb %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 207 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
208 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 "pand %%mm2, %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
211 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
212 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
214 // 1 2 3 4 5 6 7 8 |
| 787 | 215 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
216 // 6 4 2 2 1 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
217 // 6 4 4 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
218 // 6 8 2 |
| 111 | 219 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
220 "movq (%0, %1), %%mm0 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
221 "movq %%mm0, %%mm1 \n\t" // 1 |
| 96 | 222 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 223 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
224 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "movq %%mm2, %%mm5 \n\t" // 1 |
| 96 | 227 PAVGB((%%eax), %%mm2) // 11 /2 |
| 228 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
229 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
230 "movq (%0), %%mm4 \n\t" // 1 |
| 96 | 231 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
| 232 PAVGB(%%mm0, %%mm3) //642211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
233 "movq %%mm3, (%0) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
234 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
235 "movq %%mm1, %%mm0 \n\t" // 1 |
| 96 | 236 PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
237 "movq %%mm4, %%mm3 \n\t" // 1 |
| 96 | 238 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
| 239 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
| 240 PAVGB((%%eax), %%mm5) // 211 /4 | |
| 241 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
| 242 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
243 "movq %%mm3, (%0,%1) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
| 96 | 245 PAVGB(%%mm4, %%mm6) //11 /2 |
| 787 | 246 "movq (%%ecx), %%mm0 \n\t" // 1 |
| 96 | 247 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
248 "movq %%mm0, %%mm3 \n\t" // 11/2 |
| 96 | 249 PAVGB(%%mm1, %%mm0) // 2 11/4 |
| 250 PAVGB(%%mm6, %%mm0) //222 11/8 | |
| 251 PAVGB(%%mm2, %%mm0) //22242211/16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
252 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
253 "movq %%mm0, (%0, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
254 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
255 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 787 | 256 PAVGB((%%ecx), %%mm0) // 11 /2 |
| 96 | 257 PAVGB(%%mm0, %%mm6) //11 11 /4 |
| 258 PAVGB(%%mm1, %%mm4) // 11 /2 | |
| 259 PAVGB(%%mm2, %%mm1) // 11 /2 | |
| 260 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
| 261 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
262 "movq (%%eax), %%mm5 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
263 "movq %%mm6, (%%eax) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
264 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
265 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
| 96 | 266 PAVGB(%%mm7, %%mm6) // 11 /2 |
| 267 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
| 268 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
| 269 PAVGB(%%mm5, %%mm2) // 11 /2 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
270 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
| 96 | 271 PAVGB(%%mm4, %%mm2) // 112 /4 |
| 272 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
273 "movq %%mm6, (%0, %1, 4) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
274 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
| 96 | 275 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
| 276 PAVGB(%%mm4, %%mm5) // 11 /2 | |
| 277 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
278 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 96 | 279 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 280 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
281 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
282 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 787 | 283 PAVGB((%%ecx), %%mm2) // 112 4 /8 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
284 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 96 | 285 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 286 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
| 287 PAVGB(%%mm2, %%mm6) // 1122424 /16 | |
| 787 | 288 "movq %%mm6, (%%ecx) \n\t" // X |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
289 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 96 | 290 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 291 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
292 |
| 96 | 293 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 294 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
295 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 140 | 296 "subl %1, %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
297 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
298 : |
| 787 | 299 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 300 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
301 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
302 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
303 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
304 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
306 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
307 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
308 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
309 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
310 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
311 const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
312 int x; |
| 111 | 313 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
314 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
315 { |
| 787 | 316 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
| 317 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
318 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
319 int sums[9]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
320 sums[0] = first + src[l1]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
321 sums[1] = src[l1] + src[l2]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
322 sums[2] = src[l2] + src[l3]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
323 sums[3] = src[l3] + src[l4]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
324 sums[4] = src[l4] + src[l5]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
325 sums[5] = src[l5] + src[l6]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
326 sums[6] = src[l6] + src[l7]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
327 sums[7] = src[l7] + src[l8]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
328 sums[8] = src[l8] + last; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
329 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
330 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
331 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
332 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
333 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
334 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
335 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
336 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
337 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
338 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
339 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
340 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
341 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
342 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
343 |
| 787 | 344 #if 0 |
| 96 | 345 /** |
| 346 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 347 * values are correctly clipped (MMX2) | |
| 348 * values are wraparound (C) | |
| 349 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
| 350 0 8 16 24 | |
| 351 x = 8 | |
| 352 x/2 = 4 | |
| 353 x/8 = 1 | |
| 354 1 12 12 23 | |
| 355 */ | |
| 169 | 356 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
| 96 | 357 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
358 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 359 src+= stride*3; |
| 96 | 360 // FIXME rounding |
| 361 asm volatile( | |
| 362 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 210 | 363 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 96 | 364 "leal (%0, %1), %%eax \n\t" |
| 787 | 365 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 96 | 366 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 367 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
| 210 | 368 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 96 | 369 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
| 210 | 370 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
| 96 | 371 "psrlw $2, %%mm0 \n\t" |
| 210 | 372 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
| 96 | 373 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
| 374 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 787 | 375 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 96 | 376 "movq %%mm2, %%mm4 \n\t" // line 4 |
| 377 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 378 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
379 PAVGB(%%mm3, %%mm5) |
| 96 | 380 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
| 381 "psubusb %%mm3, %%mm4 \n\t" | |
| 382 "psubusb %%mm2, %%mm3 \n\t" | |
| 383 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 384 "psubusb %%mm0, %%mm4 \n\t" | |
| 385 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 386 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 387 | |
| 388 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 389 "paddb %%mm5, %%mm2 \n\t" | |
| 390 // "psubb %%mm6, %%mm2 \n\t" | |
| 391 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 392 | |
| 787 | 393 "movq (%%ecx), %%mm2 \n\t" |
| 96 | 394 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
| 395 "psubb %%mm5, %%mm2 \n\t" | |
| 396 // "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 397 "movq %%mm2, (%%ecx) \n\t" |
| 96 | 398 |
| 399 "paddb %%mm6, %%mm5 \n\t" | |
| 400 "psrlw $2, %%mm5 \n\t" | |
| 210 | 401 "pand "MANGLE(b3F)", %%mm5 \n\t" |
| 402 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
| 96 | 403 |
| 404 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 405 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 406 "paddsb %%mm5, %%mm2 \n\t" | |
| 407 "psubb %%mm6, %%mm2 \n\t" | |
| 408 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 409 | |
| 787 | 410 "movq (%%ecx, %1), %%mm2 \n\t" |
| 96 | 411 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
| 412 "psubsb %%mm5, %%mm2 \n\t" | |
| 413 "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 414 "movq %%mm2, (%%ecx, %1) \n\t" |
| 96 | 415 |
| 416 : | |
| 417 : "r" (src), "r" (stride) | |
| 787 | 418 : "%eax", "%ecx" |
| 96 | 419 ); |
| 420 #else | |
| 421 const int l1= stride; | |
| 422 const int l2= stride + l1; | |
| 423 const int l3= stride + l2; | |
| 424 const int l4= stride + l3; | |
| 425 const int l5= stride + l4; | |
| 426 const int l6= stride + l5; | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
427 // const int l7= stride + l6; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
428 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
429 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
430 int x; |
| 141 | 431 const int QP15= QP + (QP>>2); |
| 111 | 432 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
433 for(x=0; x<BLOCK_SIZE; x++) |
| 96 | 434 { |
| 141 | 435 const int v = (src[x+l5] - src[x+l4]); |
| 436 if(ABS(v) < QP15) | |
| 96 | 437 { |
| 141 | 438 src[x+l3] +=v>>3; |
| 439 src[x+l4] +=v>>1; | |
| 440 src[x+l5] -=v>>1; | |
| 441 src[x+l6] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
442 |
| 96 | 443 } |
| 444 } | |
| 445 | |
| 446 #endif | |
| 447 } | |
| 787 | 448 #endif |
| 96 | 449 |
| 450 /** | |
| 451 * Experimental Filter 1 | |
| 99 | 452 * will not damage linear gradients |
| 453 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
454 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
455 * MMX2 version does correct clipping C version doesnt |
| 96 | 456 */ |
| 787 | 457 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
| 96 | 458 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
459 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 460 src+= stride*3; |
| 461 | |
| 96 | 462 asm volatile( |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
463 "pxor %%mm7, %%mm7 \n\t" // 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
464 "leal (%0, %1), %%eax \n\t" |
| 787 | 465 "leal (%%eax, %1, 4), %%ecx \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
466 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 467 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
468 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
469 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
470 "movq %%mm1, %%mm2 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
471 "psubusb %%mm0, %%mm1 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
472 "psubusb %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
473 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
| 787 | 474 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 475 "movq (%%ecx, %1), %%mm4 \n\t" // line 6 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
476 "movq %%mm3, %%mm5 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
477 "psubusb %%mm4, %%mm3 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
478 "psubusb %%mm5, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
479 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
480 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
481 "movq %%mm2, %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
482 "psubusb %%mm5, %%mm2 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
483 "movq %%mm2, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
484 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
485 "psubusb %%mm1, %%mm5 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
486 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
487 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
488 "movq %%mm4, %%mm3 \n\t" // d |
| 787 | 489 "movq %2, %%mm0 \n\t" |
| 334 | 490 "paddusb %%mm0, %%mm0 \n\t" |
| 491 "psubusb %%mm0, %%mm4 \n\t" | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
492 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 210 | 493 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
494 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
495 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
496 PAVGB(%%mm7, %%mm3) // d/2 |
| 99 | 497 "movq %%mm3, %%mm1 \n\t" // d/2 |
| 498 PAVGB(%%mm7, %%mm3) // d/4 | |
| 499 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
500 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
501 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
502 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
503 "psubusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
504 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
505 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
506 |
| 787 | 507 "movq (%%ecx), %%mm0 \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
508 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
509 "paddusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
510 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 511 "movq %%mm0, (%%ecx) \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
512 |
| 99 | 513 PAVGB(%%mm7, %%mm1) // d/4 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
514 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
515 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
516 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 99 | 517 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
518 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
519 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
520 |
| 787 | 521 "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
522 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 99 | 523 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
524 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 525 "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
526 |
| 99 | 527 PAVGB(%%mm7, %%mm1) // d/8 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
528 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
529 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
530 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
| 99 | 531 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
532 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
533 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
534 |
| 787 | 535 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
536 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
| 99 | 537 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
538 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 539 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
| 96 | 540 |
| 541 : | |
| 787 | 542 : "r" (src), "r" (stride), "m" (co->pQPb) |
| 543 : "%eax", "%ecx" | |
| 96 | 544 ); |
| 545 #else | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
546 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
547 const int l1= stride; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
548 const int l2= stride + l1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
549 const int l3= stride + l2; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
550 const int l4= stride + l3; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
551 const int l5= stride + l4; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
552 const int l6= stride + l5; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
553 const int l7= stride + l6; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
554 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
555 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
556 int x; |
| 111 | 557 |
| 558 src+= stride*3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
559 for(x=0; x<BLOCK_SIZE; x++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
560 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
561 int a= src[l3] - src[l4]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
562 int b= src[l4] - src[l5]; |
| 99 | 563 int c= src[l5] - src[l6]; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
564 |
| 141 | 565 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
| 566 d= MAX(d, 0); | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
567 |
| 787 | 568 if(d < co->QP*2) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
569 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
570 int v = d * SIGN(-b); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
571 |
| 141 | 572 src[l2] +=v>>3; |
| 573 src[l3] +=v>>2; | |
| 574 src[l4] +=(3*v)>>3; | |
| 575 src[l5] -=(3*v)>>3; | |
| 576 src[l6] -=v>>2; | |
| 577 src[l7] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
578 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
579 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
580 src++; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
581 } |
| 96 | 582 #endif |
| 583 } | |
| 584 | |
| 787 | 585 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
586 { |
| 163 | 587 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 588 /* | |
| 589 uint8_t tmp[16]; | |
| 590 const int l1= stride; | |
| 591 const int l2= stride + l1; | |
| 592 const int l3= stride + l2; | |
| 593 const int l4= (int)tmp - (int)src - stride*3; | |
| 594 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
| 595 const int l6= stride*3 + l3; | |
| 596 const int l7= stride + l6; | |
| 597 const int l8= stride + l7; | |
| 598 | |
| 599 memcpy(tmp, src+stride*7, 8); | |
| 600 memcpy(tmp+8, src+stride*8, 8); | |
| 601 */ | |
| 111 | 602 src+= stride*4; |
| 163 | 603 asm volatile( |
| 604 | |
| 605 #if 0 //sligtly more accurate and slightly slower | |
| 606 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 607 "leal (%0, %1), %%eax \n\t" | |
| 787 | 608 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 609 // 0 1 2 3 4 5 6 7 |
| 787 | 610 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 611 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 612 |
| 613 | |
| 614 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
| 615 "movq (%0), %%mm1 \n\t" // l0 | |
| 616 "movq %%mm0, %%mm2 \n\t" // l2 | |
| 617 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
| 618 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
| 619 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
| 620 | |
| 621 "movq (%%eax), %%mm1 \n\t" // l1 | |
| 622 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |
| 623 "movq %%mm1, %%mm4 \n\t" // l1 | |
| 624 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
| 625 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
| 626 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
| 627 | |
| 628 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
| 629 "psubusb %%mm1, %%mm0 \n\t" | |
| 630 "psubusb %%mm4, %%mm1 \n\t" | |
| 631 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
| 632 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
| 633 | |
| 634 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 635 "movq %%mm0, %%mm4 \n\t" // l4 | |
| 636 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
| 637 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
| 638 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
| 639 | |
| 787 | 640 "movq (%%ecx), %%mm2 \n\t" // l5 |
| 163 | 641 "movq %%mm3, %%mm5 \n\t" // l3 |
| 642 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
| 643 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
| 644 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
| 645 | |
| 646 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
| 647 "psubusb %%mm3, %%mm0 \n\t" | |
| 648 "psubusb %%mm6, %%mm3 \n\t" | |
| 649 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
| 650 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
| 651 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
| 652 | |
| 787 | 653 "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
| 163 | 654 "movq %%mm6, %%mm5 \n\t" // l6 |
| 655 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
| 656 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
| 657 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
| 658 | |
| 787 | 659 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
| 163 | 660 "movq %%mm2, %%mm4 \n\t" // l5 |
| 661 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
| 662 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
| 663 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
| 664 | |
| 665 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
| 666 "psubusb %%mm2, %%mm6 \n\t" | |
| 667 "psubusb %%mm4, %%mm2 \n\t" | |
| 668 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
| 669 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
| 670 | |
| 671 | |
| 672 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
| 787 | 673 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
| 210 | 674 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 675 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
| 676 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
| 677 "pand %%mm4, %%mm3 \n\t" | |
| 678 | |
| 679 "movq %%mm3, %%mm1 \n\t" | |
| 210 | 680 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 681 PAVGB(%%mm7, %%mm3) |
| 682 PAVGB(%%mm7, %%mm3) | |
| 683 "paddusb %%mm1, %%mm3 \n\t" | |
| 210 | 684 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 685 |
| 686 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |
| 687 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
| 688 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
| 689 "psubusb %%mm6, %%mm5 \n\t" | |
| 690 "psubusb %%mm4, %%mm6 \n\t" | |
| 691 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
| 692 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
| 693 "pxor %%mm6, %%mm0 \n\t" | |
| 694 "pand %%mm0, %%mm3 \n\t" | |
| 695 PMINUB(%%mm5, %%mm3, %%mm0) | |
| 696 | |
| 210 | 697 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 698 PAVGB(%%mm7, %%mm3) |
| 699 | |
| 700 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 701 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 702 "pxor %%mm6, %%mm0 \n\t" | |
| 703 "pxor %%mm6, %%mm2 \n\t" | |
| 704 "psubb %%mm3, %%mm0 \n\t" | |
| 705 "paddb %%mm3, %%mm2 \n\t" | |
| 706 "pxor %%mm6, %%mm0 \n\t" | |
| 707 "pxor %%mm6, %%mm2 \n\t" | |
| 708 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 709 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 710 #endif | |
| 711 | |
| 712 "leal (%0, %1), %%eax \n\t" | |
| 713 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
| 714 // 0 1 2 3 4 5 6 7 | |
| 787 | 715 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 716 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 717 |
| 718 | |
| 719 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |
| 720 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 721 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
| 722 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
| 723 // mm1=-l3-1, mm0=128-q | |
| 724 | |
| 725 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |
| 726 "movq (%%eax, %1), %%mm3 \n\t" // l2 | |
| 727 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
| 728 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
| 210 | 729 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
| 787 | 730 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 731 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
| 732 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
| 733 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
| 734 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
| 735 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
| 736 | |
| 737 "movq (%%eax), %%mm2 \n\t" // l1 | |
| 738 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
| 739 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
| 740 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
| 210 | 741 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
| 163 | 742 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
| 743 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
| 744 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
| 745 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
| 746 | |
| 787 | 747 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
| 748 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 | |
| 163 | 749 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
| 750 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
| 210 | 751 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
| 163 | 752 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
| 753 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
| 754 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
| 755 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
| 756 | |
| 210 | 757 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
| 758 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
| 163 | 759 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
| 760 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
| 761 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
| 762 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
| 763 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
| 764 | |
| 765 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
| 766 | |
| 210 | 767 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
| 787 | 768 "movq %2, %%mm2 \n\t" // QP |
| 163 | 769 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
| 770 "psubb %%mm6, %%mm2 \n\t" | |
| 771 | |
| 772 "movq %%mm4, %%mm1 \n\t" | |
| 773 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
| 774 "pxor %%mm1, %%mm4 \n\t" | |
| 775 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
| 776 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
| 777 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
| 778 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
| 779 | |
| 780 "movq %%mm4, %%mm3 \n\t" // d | |
| 210 | 781 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 782 PAVGB(%%mm7, %%mm4) // d/32 |
| 783 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
| 784 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
| 785 "pand %%mm2, %%mm4 \n\t" | |
| 786 | |
| 210 | 787 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
| 163 | 788 "psubb %%mm0, %%mm5 \n\t" // q |
| 789 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
| 790 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
| 791 "pxor %%mm7, %%mm5 \n\t" | |
| 792 | |
| 793 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
| 794 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
| 795 | |
| 796 "pand %%mm7, %%mm4 \n\t" | |
| 797 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 798 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 799 "pxor %%mm1, %%mm0 \n\t" | |
| 800 "pxor %%mm1, %%mm2 \n\t" | |
| 801 "paddb %%mm4, %%mm0 \n\t" | |
| 802 "psubb %%mm4, %%mm2 \n\t" | |
| 803 "pxor %%mm1, %%mm0 \n\t" | |
| 804 "pxor %%mm1, %%mm2 \n\t" | |
| 805 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 806 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 807 | |
| 808 : | |
| 787 | 809 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 810 : "%eax", "%ecx" | |
| 163 | 811 ); |
| 812 | |
| 813 /* | |
| 814 { | |
| 815 int x; | |
| 816 src-= stride; | |
| 817 for(x=0; x<BLOCK_SIZE; x++) | |
| 818 { | |
| 819 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
| 820 if(ABS(middleEnergy)< 8*QP) | |
| 821 { | |
| 822 const int q=(src[l4] - src[l5])/2; | |
| 823 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
| 824 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
| 825 | |
| 826 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 827 d= MAX(d, 0); | |
| 828 | |
| 829 d= (5*d + 32) >> 6; | |
| 830 d*= SIGN(-middleEnergy); | |
| 831 | |
| 832 if(q>0) | |
| 833 { | |
| 834 d= d<0 ? 0 : d; | |
| 835 d= d>q ? q : d; | |
| 836 } | |
| 837 else | |
| 838 { | |
| 839 d= d>0 ? 0 : d; | |
| 840 d= d<q ? q : d; | |
| 841 } | |
| 842 | |
| 843 src[l4]-= d; | |
| 844 src[l5]+= d; | |
| 845 } | |
| 846 src++; | |
| 847 } | |
| 848 src-=8; | |
| 849 for(x=0; x<8; x++) | |
| 850 { | |
| 851 int y; | |
| 852 for(y=4; y<6; y++) | |
| 853 { | |
| 854 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
| 855 int ad= ABS(d); | |
| 856 static int max=0; | |
| 857 static int sum=0; | |
| 858 static int num=0; | |
| 859 static int bias=0; | |
| 860 | |
| 861 if(max<ad) max=ad; | |
| 862 sum+= ad>3 ? 1 : 0; | |
| 863 if(ad>3) | |
| 864 { | |
| 865 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
| 866 } | |
| 867 if(y==4) bias+=d; | |
| 868 num++; | |
| 869 if(num%1000000 == 0) | |
| 870 { | |
| 871 printf(" %d %d %d %d\n", num, sum, max, bias); | |
| 872 } | |
| 873 } | |
| 874 } | |
| 875 } | |
| 876 */ | |
| 877 #elif defined (HAVE_MMX) | |
| 878 src+= stride*4; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
879 asm volatile( |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
880 "pxor %%mm7, %%mm7 \n\t" |
| 787 | 881 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
| 882 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
883 // 0 1 2 3 4 5 6 7 |
| 787 | 884 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
| 885 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
886 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
887 "movq (%0), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
888 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
889 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
890 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
891 |
| 810 | 892 "movq (%0, %1), %%mm2 \n\t" |
| 893 "leal (%0, %1, 2), %%eax \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
894 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
895 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
896 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
897 |
| 810 | 898 "movq (%%eax), %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
899 "movq %%mm4, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
900 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
901 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
902 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
903 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
904 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
905 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
906 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
907 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
908 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
909 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
910 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
911 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
912 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
913 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
914 |
| 810 | 915 "movq (%%eax, %1), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
916 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
917 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
918 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
919 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
922 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
923 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 787 | 924 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 925 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
926 |
| 810 | 927 "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
928 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
929 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
930 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
931 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
932 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
933 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
| 787 | 934 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
| 935 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
936 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
937 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
938 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
939 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
940 |
| 810 | 941 "leal (%%eax, %1), %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
942 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
943 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
944 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
945 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
946 //50 opcodes so far |
| 810 | 947 "movq (%0, %1, 2), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
948 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
949 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
950 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
951 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
952 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
953 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 |
| 810 | 956 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
| 810 | 959 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
960 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
961 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
963 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
967 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
970 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 |
| 810 | 973 "movq (%0, %1, 4), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
977 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
980 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
981 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
982 |
| 787 | 983 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 984 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 140 | 985 |
| 986 #ifdef HAVE_MMX2 | |
| 987 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 988 "psubw %%mm0, %%mm6 \n\t" | |
| 989 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 990 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 991 "psubw %%mm1, %%mm6 \n\t" | |
| 992 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 993 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 994 "psubw %%mm2, %%mm6 \n\t" | |
| 995 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 996 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 997 "psubw %%mm3, %%mm6 \n\t" | |
| 998 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 999 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1000 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1001 "pcmpgtw %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1002 "pxor %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1003 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1004 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1005 "pcmpgtw %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "pxor %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1009 "pcmpgtw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1010 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1011 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1012 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1013 "pcmpgtw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1014 "pxor %%mm6, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1015 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 140 | 1016 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1017 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1018 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1019 "pminsw %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1020 "pminsw %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1021 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1022 "movq %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1023 "psubusw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1024 "psubw %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1025 "movq %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 "psubusw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "psubw %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1034 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1035 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1036 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 // 100 opcodes |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "movd %2, %%mm2 \n\t" // QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "psllw $3, %%mm2 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "movq %%mm2, %%mm3 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 "pcmpgtw %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1042 "pcmpgtw %%mm5, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 "psubusw %%mm0, %%mm4 \n\t" // hd |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1048 "psubusw %%mm1, %%mm5 \n\t" // ld |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 |
| 211 | 1051 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1052 "pmullw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1053 "pmullw %%mm2, %%mm5 \n\t" |
| 211 | 1054 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 "paddw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1056 "paddw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1057 "psrlw $6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1058 "psrlw $6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1059 |
| 787 | 1060 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
| 1061 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1062 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1063 "pxor %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1064 "pxor %%mm3, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1065 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1066 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1067 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1068 "pxor %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1069 "pxor %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1070 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1072 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1073 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 "pxor %%mm7, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1078 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1079 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "pminsw %%mm0, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1082 "pminsw %%mm1, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "movq %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1085 "psubusw %%mm0, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 "psubw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 "movq %%mm5, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1088 "psubusw %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1089 "psubw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 "psubw %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "psubw %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "packsswb %%mm5, %%mm4 \n\t" |
| 810 | 1096 "movq (%0), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 "paddb %%mm4, %%mm0 \n\t" |
| 810 | 1098 "movq %%mm0, (%0) \n\t" |
| 1099 "movq (%0, %1), %%mm0 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1100 "psubb %%mm4, %%mm0 \n\t" |
| 810 | 1101 "movq %%mm0, (%0, %1) \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1102 |
| 810 | 1103 : "+r" (src) |
| 1104 : "r" (stride), "m" (c->pQPb) | |
| 1105 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1109 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1110 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1114 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1115 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1116 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1117 int x; |
| 111 | 1118 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1119 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
| 787 | 1122 if(ABS(middleEnergy) < 8*c->QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1123 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1124 const int q=(src[l4] - src[l5])/2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1125 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1126 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1127 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1128 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1129 d= MAX(d, 0); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1131 d= (5*d + 32) >> 6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1132 d*= SIGN(-middleEnergy); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1133 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 if(q>0) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 d= d<0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 d= d>q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1141 d= d>0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1142 d= d<q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1145 src[l4]-= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1146 src[l5]+= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1147 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1150 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1152 |
| 787 | 1153 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1154 { |
| 132 | 1155 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1156 asm volatile( |
| 787 | 1157 "pxor %%mm6, %%mm6 \n\t" |
| 1158 "pcmpeqb %%mm7, %%mm7 \n\t" | |
| 1159 "movq %2, %%mm0 \n\t" | |
| 1160 "punpcklbw %%mm6, %%mm0 \n\t" | |
| 1161 "psrlw $1, %%mm0 \n\t" | |
| 1162 "psubw %%mm7, %%mm0 \n\t" | |
| 1163 "packuswb %%mm0, %%mm0 \n\t" | |
| 1164 "movq %%mm0, %3 \n\t" | |
| 130 | 1165 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1166 "leal (%0, %1), %%eax \n\t" |
| 787 | 1167 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1168 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1169 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1170 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1171 |
| 169 | 1172 #undef FIND_MIN_MAX |
| 132 | 1173 #ifdef HAVE_MMX2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1174 #define FIND_MIN_MAX(addr)\ |
| 130 | 1175 "movq " #addr ", %%mm0 \n\t"\ |
| 167 | 1176 "pminub %%mm0, %%mm7 \n\t"\ |
| 1177 "pmaxub %%mm0, %%mm6 \n\t" | |
| 132 | 1178 #else |
| 1179 #define FIND_MIN_MAX(addr)\ | |
| 1180 "movq " #addr ", %%mm0 \n\t"\ | |
| 167 | 1181 "movq %%mm7, %%mm1 \n\t"\ |
| 1182 "psubusb %%mm0, %%mm6 \n\t"\ | |
| 1183 "paddb %%mm0, %%mm6 \n\t"\ | |
| 132 | 1184 "psubusb %%mm0, %%mm1 \n\t"\ |
| 167 | 1185 "psubb %%mm1, %%mm7 \n\t" |
| 132 | 1186 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1187 |
| 130 | 1188 FIND_MIN_MAX((%%eax)) |
| 1189 FIND_MIN_MAX((%%eax, %1)) | |
| 1190 FIND_MIN_MAX((%%eax, %1, 2)) | |
| 1191 FIND_MIN_MAX((%0, %1, 4)) | |
| 787 | 1192 FIND_MIN_MAX((%%edx)) |
| 1193 FIND_MIN_MAX((%%edx, %1)) | |
| 1194 FIND_MIN_MAX((%%edx, %1, 2)) | |
| 130 | 1195 FIND_MIN_MAX((%0, %1, 8)) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1196 |
| 167 | 1197 "movq %%mm7, %%mm4 \n\t" |
| 1198 "psrlq $8, %%mm7 \n\t" | |
| 1199 #ifdef HAVE_MMX2 | |
| 1200 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1201 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1202 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1203 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1204 "pminub %%mm4, %%mm7 \n\t" | |
| 1205 #else | |
| 1206 "movq %%mm7, %%mm1 \n\t" | |
| 1207 "psubusb %%mm4, %%mm1 \n\t" | |
| 1208 "psubb %%mm1, %%mm7 \n\t" | |
| 1209 "movq %%mm7, %%mm4 \n\t" | |
| 1210 "psrlq $16, %%mm7 \n\t" | |
| 1211 "movq %%mm7, %%mm1 \n\t" | |
| 1212 "psubusb %%mm4, %%mm1 \n\t" | |
| 1213 "psubb %%mm1, %%mm7 \n\t" | |
| 1214 "movq %%mm7, %%mm4 \n\t" | |
| 1215 "psrlq $32, %%mm7 \n\t" | |
| 1216 "movq %%mm7, %%mm1 \n\t" | |
| 1217 "psubusb %%mm4, %%mm1 \n\t" | |
| 1218 "psubb %%mm1, %%mm7 \n\t" | |
| 1219 #endif | |
| 1220 | |
| 1221 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1222 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1223 "psrlq $8, %%mm6 \n\t" |
| 132 | 1224 #ifdef HAVE_MMX2 |
| 167 | 1225 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1226 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 167 | 1227 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1228 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 167 | 1229 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1230 #else |
| 167 | 1231 "psubusb %%mm4, %%mm6 \n\t" |
| 1232 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1233 "movq %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1234 "psrlq $16, %%mm6 \n\t" |
| 167 | 1235 "psubusb %%mm4, %%mm6 \n\t" |
| 1236 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1237 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1238 "psrlq $32, %%mm6 \n\t" |
| 167 | 1239 "psubusb %%mm4, %%mm6 \n\t" |
| 1240 "paddb %%mm4, %%mm6 \n\t" | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1241 #endif |
| 167 | 1242 "movq %%mm6, %%mm0 \n\t" // max |
| 1243 "psubb %%mm7, %%mm6 \n\t" // max - min | |
| 1244 "movd %%mm6, %%ecx \n\t" | |
| 210 | 1245 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
| 167 | 1246 " jb 1f \n\t" |
| 787 | 1247 "leal -24(%%esp), %%ecx \n\t" |
| 1248 "andl $0xFFFFFFF8, %%ecx \n\t" | |
| 167 | 1249 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1250 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1251 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1252 "punpcklbw %%mm7, %%mm7 \n\t" |
| 787 | 1253 "movq %%mm7, (%%ecx) \n\t" |
| 130 | 1254 |
| 1255 "movq (%0), %%mm0 \n\t" // L10 | |
| 1256 "movq %%mm0, %%mm1 \n\t" // L10 | |
| 1257 "movq %%mm0, %%mm2 \n\t" // L10 | |
| 1258 "psllq $8, %%mm1 \n\t" | |
| 1259 "psrlq $8, %%mm2 \n\t" | |
| 1260 "movd -4(%0), %%mm3 \n\t" | |
| 1261 "movd 8(%0), %%mm4 \n\t" | |
| 1262 "psrlq $24, %%mm3 \n\t" | |
| 1263 "psllq $56, %%mm4 \n\t" | |
| 1264 "por %%mm3, %%mm1 \n\t" // L00 | |
| 1265 "por %%mm4, %%mm2 \n\t" // L20 | |
| 1266 "movq %%mm1, %%mm3 \n\t" // L00 | |
| 1267 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
| 1268 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
| 1269 "psubusb %%mm7, %%mm0 \n\t" | |
| 1270 "psubusb %%mm7, %%mm2 \n\t" | |
| 1271 "psubusb %%mm7, %%mm3 \n\t" | |
| 210 | 1272 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
| 1273 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
| 1274 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
| 130 | 1275 "paddb %%mm2, %%mm0 \n\t" |
| 1276 "paddb %%mm3, %%mm0 \n\t" | |
| 1277 | |
| 1278 "movq (%%eax), %%mm2 \n\t" // L11 | |
| 1279 "movq %%mm2, %%mm3 \n\t" // L11 | |
| 1280 "movq %%mm2, %%mm4 \n\t" // L11 | |
| 1281 "psllq $8, %%mm3 \n\t" | |
| 1282 "psrlq $8, %%mm4 \n\t" | |
| 1283 "movd -4(%%eax), %%mm5 \n\t" | |
| 1284 "movd 8(%%eax), %%mm6 \n\t" | |
| 1285 "psrlq $24, %%mm5 \n\t" | |
| 1286 "psllq $56, %%mm6 \n\t" | |
| 1287 "por %%mm5, %%mm3 \n\t" // L01 | |
| 1288 "por %%mm6, %%mm4 \n\t" // L21 | |
| 1289 "movq %%mm3, %%mm5 \n\t" // L01 | |
| 1290 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
| 1291 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
| 1292 "psubusb %%mm7, %%mm2 \n\t" | |
| 1293 "psubusb %%mm7, %%mm4 \n\t" | |
| 1294 "psubusb %%mm7, %%mm5 \n\t" | |
| 210 | 1295 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
| 1296 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
| 1297 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
| 130 | 1298 "paddb %%mm4, %%mm2 \n\t" |
| 1299 "paddb %%mm5, %%mm2 \n\t" | |
| 1300 // 0, 2, 3, 1 | |
| 1301 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
| 1302 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
| 1303 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
| 1304 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
| 1305 "psllq $8, " #lx " \n\t"\ | |
| 1306 "psrlq $8, " #t0 " \n\t"\ | |
| 1307 "movd -4" #src ", " #t1 " \n\t"\ | |
| 1308 "psrlq $24, " #t1 " \n\t"\ | |
| 1309 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
| 1310 "movd 8" #src ", " #t1 " \n\t"\ | |
| 1311 "psllq $56, " #t1 " \n\t"\ | |
| 1312 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
| 1313 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
| 1314 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
| 1315 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
| 135 | 1316 PAVGB(lx, pplx) \ |
| 787 | 1317 "movq " #lx ", 8(%%ecx) \n\t"\ |
| 1318 "movq (%%ecx), " #lx " \n\t"\ | |
| 140 | 1319 "psubusb " #lx ", " #t1 " \n\t"\ |
| 1320 "psubusb " #lx ", " #t0 " \n\t"\ | |
| 1321 "psubusb " #lx ", " #sx " \n\t"\ | |
| 210 | 1322 "movq "MANGLE(b00)", " #lx " \n\t"\ |
| 140 | 1323 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
| 1324 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
| 1325 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
| 130 | 1326 "paddb " #t1 ", " #t0 " \n\t"\ |
| 1327 "paddb " #t0 ", " #sx " \n\t"\ | |
| 1328 \ | |
| 1329 PAVGB(plx, pplx) /* filtered */\ | |
| 1330 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
| 134 | 1331 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
| 787 | 1332 "psubusb %3, " #t0 " \n\t"\ |
| 1333 "paddusb %3, " #t1 " \n\t"\ | |
| 134 | 1334 PMAXUB(t0, pplx)\ |
| 1335 PMINUB(t1, pplx, t0)\ | |
| 130 | 1336 "paddb " #sx ", " #ppsx " \n\t"\ |
| 1337 "paddb " #psx ", " #ppsx " \n\t"\ | |
| 210 | 1338 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
| 1339 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
| 140 | 1340 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
| 134 | 1341 "pand " #ppsx ", " #pplx " \n\t"\ |
| 130 | 1342 "pandn " #dst ", " #ppsx " \n\t"\ |
| 140 | 1343 "por " #pplx ", " #ppsx " \n\t"\ |
| 135 | 1344 "movq " #ppsx ", " #dst " \n\t"\ |
| 787 | 1345 "movq 8(%%ecx), " #lx " \n\t" |
| 134 | 1346 |
| 130 | 1347 /* |
| 1348 0000000 | |
| 1349 1111111 | |
| 1350 | |
| 1351 1111110 | |
| 1352 1111101 | |
| 1353 1111100 | |
| 1354 1111011 | |
| 1355 1111010 | |
| 1356 1111001 | |
| 1357 | |
| 1358 1111000 | |
| 1359 1110111 | |
| 1360 | |
| 1361 */ | |
| 1362 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
| 1363 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1364 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1365 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 787 | 1366 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
| 1367 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1368 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1369 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1370 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1371 |
| 167 | 1372 "1: \n\t" |
| 787 | 1373 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
| 1374 : "%eax", "%edx", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1375 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1376 #else |
| 134 | 1377 int y; |
| 1378 int min=255; | |
| 1379 int max=0; | |
| 1380 int avg; | |
| 1381 uint8_t *p; | |
| 1382 int s[10]; | |
| 787 | 1383 const int QP2= c->QP/2 + 1; |
| 134 | 1384 |
| 1385 for(y=1; y<9; y++) | |
| 1386 { | |
| 1387 int x; | |
| 1388 p= src + stride*y; | |
| 1389 for(x=1; x<9; x++) | |
| 1390 { | |
| 1391 p++; | |
| 1392 if(*p > max) max= *p; | |
| 1393 if(*p < min) min= *p; | |
| 1394 } | |
| 1395 } | |
| 787 | 1396 avg= (min + max + 1)>>1; |
| 134 | 1397 |
| 167 | 1398 if(max - min <deringThreshold) return; |
| 1399 | |
| 134 | 1400 for(y=0; y<10; y++) |
| 1401 { | |
| 1402 int t = 0; | |
| 787 | 1403 |
| 1404 if(src[stride*y + 0] > avg) t+= 1; | |
| 1405 if(src[stride*y + 1] > avg) t+= 2; | |
| 1406 if(src[stride*y + 2] > avg) t+= 4; | |
| 1407 if(src[stride*y + 3] > avg) t+= 8; | |
| 1408 if(src[stride*y + 4] > avg) t+= 16; | |
| 1409 if(src[stride*y + 5] > avg) t+= 32; | |
| 1410 if(src[stride*y + 6] > avg) t+= 64; | |
| 1411 if(src[stride*y + 7] > avg) t+= 128; | |
| 1412 if(src[stride*y + 8] > avg) t+= 256; | |
| 1413 if(src[stride*y + 9] > avg) t+= 512; | |
| 1414 | |
| 134 | 1415 t |= (~t)<<16; |
| 1416 t &= (t<<1) & (t>>1); | |
| 1417 s[y] = t; | |
| 1418 } | |
| 787 | 1419 |
| 1420 for(y=1; y<9; y++) | |
| 1421 { | |
| 1422 int t = s[y-1] & s[y] & s[y+1]; | |
| 1423 t|= t>>16; | |
| 1424 s[y-1]= t; | |
| 1425 } | |
| 134 | 1426 |
| 1427 for(y=1; y<9; y++) | |
| 1428 { | |
| 1429 int x; | |
| 787 | 1430 int t = s[y-1]; |
| 134 | 1431 |
| 1432 p= src + stride*y; | |
| 1433 for(x=1; x<9; x++) | |
| 1434 { | |
| 1435 p++; | |
| 1436 if(t & (1<<x)) | |
| 1437 { | |
| 1438 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
| 1439 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
| 1440 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
| 1441 f= (f + 8)>>4; | |
| 1442 | |
| 167 | 1443 #ifdef DEBUG_DERING_THRESHOLD |
| 1444 asm volatile("emms\n\t":); | |
| 1445 { | |
| 1446 static long long numPixels=0; | |
| 1447 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1448 // if((max-min)<20 || (max-min)*QP<200) | |
| 1449 // if((max-min)*QP < 500) | |
| 1450 // if(max-min<QP/2) | |
| 1451 if(max-min < 20) | |
| 1452 { | |
| 1453 static int numSkiped=0; | |
| 1454 static int errorSum=0; | |
| 1455 static int worstQP=0; | |
| 1456 static int worstRange=0; | |
| 1457 static int worstDiff=0; | |
| 1458 int diff= (f - *p); | |
| 1459 int absDiff= ABS(diff); | |
| 1460 int error= diff*diff; | |
| 1461 | |
| 1462 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1463 | |
| 1464 numSkiped++; | |
| 1465 if(absDiff > worstDiff) | |
| 1466 { | |
| 1467 worstDiff= absDiff; | |
| 1468 worstQP= QP; | |
| 1469 worstRange= max-min; | |
| 1470 } | |
| 1471 errorSum+= error; | |
| 1472 | |
| 1473 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1474 { | |
| 1475 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1476 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1477 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1478 worstDiff, (float)numSkiped/numPixels); | |
| 1479 } | |
| 1480 } | |
| 1481 } | |
| 1482 #endif | |
| 787 | 1483 if (*p + QP2 < f) *p= *p + QP2; |
| 1484 else if(*p - QP2 > f) *p= *p - QP2; | |
| 134 | 1485 else *p=f; |
| 1486 } | |
| 1487 } | |
| 1488 } | |
| 167 | 1489 #ifdef DEBUG_DERING_THRESHOLD |
| 1490 if(max-min < 20) | |
| 1491 { | |
| 1492 for(y=1; y<9; y++) | |
| 1493 { | |
| 1494 int x; | |
| 1495 int t = 0; | |
| 1496 p= src + stride*y; | |
| 1497 for(x=1; x<9; x++) | |
| 1498 { | |
| 1499 p++; | |
| 1500 *p = MIN(*p + 20, 255); | |
| 1501 } | |
| 1502 } | |
| 1503 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1504 } | |
| 1505 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1506 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1507 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1508 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1509 /** |
| 1109 | 1510 * Deinterlaces the given block by linearly interpolating every second line. |
| 142 | 1511 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1512 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 1513 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1514 */ |
| 169 | 1515 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1516 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1517 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1518 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1519 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1520 "leal (%0, %1), %%eax \n\t" |
| 787 | 1521 "leal (%%eax, %1, 4), %%ecx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1522 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1523 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1524 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1525 "movq (%0), %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1526 "movq (%%eax, %1), %%mm1 \n\t" |
| 111 | 1527 PAVGB(%%mm1, %%mm0) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1528 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1529 "movq (%0, %1, 4), %%mm0 \n\t" |
| 111 | 1530 PAVGB(%%mm0, %%mm1) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1531 "movq %%mm1, (%%eax, %1, 2) \n\t" |
| 787 | 1532 "movq (%%ecx, %1), %%mm1 \n\t" |
| 111 | 1533 PAVGB(%%mm1, %%mm0) |
| 787 | 1534 "movq %%mm0, (%%ecx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1535 "movq (%0, %1, 8), %%mm0 \n\t" |
| 111 | 1536 PAVGB(%%mm0, %%mm1) |
| 787 | 1537 "movq %%mm1, (%%ecx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1538 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1539 : : "r" (src), "r" (stride) |
| 787 | 1540 : "%eax", "%ecx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1541 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1542 #else |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1543 int a, b, x; |
| 142 | 1544 src+= 4*stride; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1545 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1546 for(x=0; x<2; x++){ |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1547 a= *(uint32_t*)&src[stride*0]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1548 b= *(uint32_t*)&src[stride*2]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1549 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1550 a= *(uint32_t*)&src[stride*4]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1551 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1552 b= *(uint32_t*)&src[stride*6]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1553 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1554 a= *(uint32_t*)&src[stride*8]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1555 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1556 src += 4; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1557 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1558 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1559 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1560 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1561 /** |
| 1109 | 1562 * Deinterlaces the given block by cubic interpolating every second line. |
| 142 | 1563 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1564 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 1565 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1566 * this filter will read lines 3-15 and write lines 6, 8, 10 and 12 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1567 */ |
| 169 | 1568 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1569 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1570 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1571 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1572 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1573 "leal (%0, %1), %%eax \n\t" |
| 787 | 1574 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1575 "leal (%%edx, %1, 4), %%ecx \n\t" | |
| 111 | 1576 "addl %1, %%ecx \n\t" |
| 1577 "pxor %%mm7, %%mm7 \n\t" | |
| 1578 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 787 | 1579 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1580 |
| 111 | 1581 #define DEINT_CUBIC(a,b,c,d,e)\ |
| 1582 "movq " #a ", %%mm0 \n\t"\ | |
| 1583 "movq " #b ", %%mm1 \n\t"\ | |
| 1584 "movq " #d ", %%mm2 \n\t"\ | |
| 1585 "movq " #e ", %%mm3 \n\t"\ | |
| 1586 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
| 1587 PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\ | |
| 1588 "movq %%mm0, %%mm2 \n\t"\ | |
| 1589 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1590 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
| 1591 "movq %%mm1, %%mm3 \n\t"\ | |
| 1592 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1593 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1594 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
| 1595 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
| 1596 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
| 1597 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
| 1598 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
| 1599 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
| 1600 "packuswb %%mm3, %%mm1 \n\t"\ | |
| 1601 "movq %%mm1, " #c " \n\t" | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1602 |
| 787 | 1603 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
| 1604 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) | |
| 1605 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) | |
| 1606 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1607 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1608 : : "r" (src), "r" (stride) |
| 787 | 1609 : "%eax", "%edx", "ecx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1610 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1611 #else |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1612 int x; |
| 142 | 1613 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1614 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1615 { |
| 1157 | 1616 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
| 1617 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
| 1618 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
| 1619 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1620 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1621 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1622 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1623 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1624 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1625 /** |
| 1109 | 1626 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
| 142 | 1627 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1628 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1629 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 787 | 1630 * this filter will read lines 4-13 and write 5-11 |
| 1631 */ | |
| 1632 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
| 1633 { | |
| 1634 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1635 src+= stride*4; | |
| 1636 asm volatile( | |
| 1637 "leal (%0, %1), %%eax \n\t" | |
| 1638 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1639 "pxor %%mm7, %%mm7 \n\t" | |
| 1640 "movq (%2), %%mm0 \n\t" | |
| 1641 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1642 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1643 | |
| 1644 #define DEINT_FF(a,b,c,d)\ | |
| 1645 "movq " #a ", %%mm1 \n\t"\ | |
| 1646 "movq " #b ", %%mm2 \n\t"\ | |
| 1647 "movq " #c ", %%mm3 \n\t"\ | |
| 1648 "movq " #d ", %%mm4 \n\t"\ | |
| 1649 PAVGB(%%mm3, %%mm1) \ | |
| 1650 PAVGB(%%mm4, %%mm0) \ | |
| 1651 "movq %%mm0, %%mm3 \n\t"\ | |
| 1652 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1653 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1654 "movq %%mm1, %%mm4 \n\t"\ | |
| 1655 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1656 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
| 1657 "psllw $2, %%mm1 \n\t"\ | |
| 1658 "psllw $2, %%mm4 \n\t"\ | |
| 1659 "psubw %%mm0, %%mm1 \n\t"\ | |
| 1660 "psubw %%mm3, %%mm4 \n\t"\ | |
| 1661 "movq %%mm2, %%mm5 \n\t"\ | |
| 1662 "movq %%mm2, %%mm0 \n\t"\ | |
| 1663 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1664 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1665 "paddw %%mm2, %%mm1 \n\t"\ | |
| 1666 "paddw %%mm5, %%mm4 \n\t"\ | |
| 1667 "psraw $2, %%mm1 \n\t"\ | |
| 1668 "psraw $2, %%mm4 \n\t"\ | |
| 1669 "packuswb %%mm4, %%mm1 \n\t"\ | |
| 1670 "movq %%mm1, " #b " \n\t"\ | |
| 1671 | |
| 1672 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) | |
| 1673 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) | |
| 1674 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) | |
| 1675 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) | |
| 1676 | |
| 1677 "movq %%mm0, (%2) \n\t" | |
| 1678 : : "r" (src), "r" (stride), "r"(tmp) | |
| 1679 : "%eax", "%edx" | |
| 1680 ); | |
| 1681 #else | |
| 1682 int x; | |
| 1683 src+= stride*4; | |
| 1684 for(x=0; x<8; x++) | |
| 1685 { | |
| 1686 int t1= tmp[x]; | |
| 1687 int t2= src[stride*1]; | |
| 1688 | |
| 1157 | 1689 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); |
| 787 | 1690 t1= src[stride*4]; |
| 1157 | 1691 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); |
| 787 | 1692 t2= src[stride*6]; |
| 1157 | 1693 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); |
| 787 | 1694 t1= src[stride*8]; |
| 1157 | 1695 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
| 787 | 1696 tmp[x]= t1; |
| 1697 | |
| 1698 src++; | |
| 1699 } | |
| 1700 #endif | |
| 1701 } | |
| 1702 | |
| 1703 /** | |
| 1157 | 1704 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
| 1705 * will be called for every 8x8 block and can read & write from line 4-15 | |
| 1706 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1707 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1708 * this filter will read lines 4-13 and write 4-11 | |
| 1709 */ | |
| 1710 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
| 1711 { | |
| 1712 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1713 src+= stride*4; | |
| 1714 asm volatile( | |
| 1715 "leal (%0, %1), %%eax \n\t" | |
| 1716 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1717 "pxor %%mm7, %%mm7 \n\t" | |
| 1718 "movq (%2), %%mm0 \n\t" | |
| 1719 "movq (%3), %%mm1 \n\t" | |
| 1720 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1721 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1722 | |
| 1723 #define DEINT_L5(t1,t2,a,b,c)\ | |
| 1724 "movq " #a ", %%mm2 \n\t"\ | |
| 1725 "movq " #b ", %%mm3 \n\t"\ | |
| 1726 "movq " #c ", %%mm4 \n\t"\ | |
| 1727 PAVGB(t2, %%mm3) \ | |
| 1728 PAVGB(t1, %%mm4) \ | |
| 1729 "movq %%mm2, %%mm5 \n\t"\ | |
| 1730 "movq %%mm2, " #t1 " \n\t"\ | |
| 1731 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1732 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1733 "movq %%mm2, %%mm6 \n\t"\ | |
| 1734 "paddw %%mm2, %%mm2 \n\t"\ | |
| 1735 "paddw %%mm6, %%mm2 \n\t"\ | |
| 1736 "movq %%mm5, %%mm6 \n\t"\ | |
| 1737 "paddw %%mm5, %%mm5 \n\t"\ | |
| 1738 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1739 "movq %%mm3, %%mm6 \n\t"\ | |
| 1740 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
| 1741 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1742 "paddw %%mm3, %%mm3 \n\t"\ | |
| 1743 "paddw %%mm6, %%mm6 \n\t"\ | |
| 1744 "paddw %%mm3, %%mm2 \n\t"\ | |
| 1745 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1746 "movq %%mm4, %%mm6 \n\t"\ | |
| 1747 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
| 1748 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1749 "psubw %%mm4, %%mm2 \n\t"\ | |
| 1750 "psubw %%mm6, %%mm5 \n\t"\ | |
| 1751 "psraw $2, %%mm2 \n\t"\ | |
| 1752 "psraw $2, %%mm5 \n\t"\ | |
| 1753 "packuswb %%mm5, %%mm2 \n\t"\ | |
| 1754 "movq %%mm2, " #a " \n\t"\ | |
| 1755 | |
| 1756 DEINT_L5(%%mm0, %%mm1, (%0) , (%%eax) , (%%eax, %1) ) | |
| 1757 DEINT_L5(%%mm1, %%mm0, (%%eax) , (%%eax, %1) , (%%eax, %1, 2)) | |
| 1758 DEINT_L5(%%mm0, %%mm1, (%%eax, %1) , (%%eax, %1, 2), (%0, %1, 4) ) | |
| 1759 DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4) , (%%edx) ) | |
| 1760 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%edx) , (%%edx, %1) ) | |
| 1761 DEINT_L5(%%mm1, %%mm0, (%%edx) , (%%edx, %1) , (%%edx, %1, 2)) | |
| 1762 DEINT_L5(%%mm0, %%mm1, (%%edx, %1) , (%%edx, %1, 2), (%0, %1, 8) ) | |
| 1763 DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8) , (%%edx, %1, 4)) | |
| 1764 | |
| 1765 "movq %%mm0, (%2) \n\t" | |
| 1766 "movq %%mm1, (%3) \n\t" | |
| 1767 : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2) | |
| 1768 : "%eax", "%edx" | |
| 1769 ); | |
| 1770 #else | |
| 1771 int x; | |
| 1772 src+= stride*4; | |
| 1773 for(x=0; x<8; x++) | |
| 1774 { | |
| 1775 int t1= tmp[x]; | |
| 1776 int t2= tmp2[x]; | |
| 1777 int t3= src[0]; | |
| 1778 | |
| 1779 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
| 1780 t1= src[stride*1]; | |
| 1781 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
| 1782 t2= src[stride*2]; | |
| 1783 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
| 1784 t3= src[stride*3]; | |
| 1785 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
| 1786 t1= src[stride*4]; | |
| 1787 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
| 1788 t2= src[stride*5]; | |
| 1789 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
| 1790 t3= src[stride*6]; | |
| 1791 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
| 1792 t1= src[stride*7]; | |
| 1793 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
| 1794 | |
| 1795 tmp[x]= t3; | |
| 1796 tmp2[x]= t1; | |
| 1797 | |
| 1798 src++; | |
| 1799 } | |
| 1800 #endif | |
| 1801 } | |
| 1802 | |
| 1803 /** | |
| 1109 | 1804 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
| 787 | 1805 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1806 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1807 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1808 * will shift the image up by 1 line (FIXME if this is a problem) |
| 142 | 1809 * this filter will read lines 4-13 and write 4-11 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1810 */ |
| 169 | 1811 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1812 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1813 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1814 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1815 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1816 "leal (%0, %1), %%eax \n\t" |
| 787 | 1817 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1818 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1819 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1820 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1821 "movq (%0), %%mm0 \n\t" // L0 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1822 "movq (%%eax, %1), %%mm1 \n\t" // L2 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1823 PAVGB(%%mm1, %%mm0) // L0+L2 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1824 "movq (%%eax), %%mm2 \n\t" // L1 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1825 PAVGB(%%mm2, %%mm0) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1826 "movq %%mm0, (%0) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1827 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1828 PAVGB(%%mm0, %%mm2) // L1+L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1829 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1830 "movq %%mm2, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1831 "movq (%0, %1, 4), %%mm2 \n\t" // L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1832 PAVGB(%%mm2, %%mm1) // L2+L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1833 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1834 "movq %%mm1, (%%eax, %1) \n\t" |
| 787 | 1835 "movq (%%edx), %%mm1 \n\t" // L5 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1836 PAVGB(%%mm1, %%mm0) // L3+L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1837 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1838 "movq %%mm0, (%%eax, %1, 2) \n\t" |
| 787 | 1839 "movq (%%edx, %1), %%mm0 \n\t" // L6 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1840 PAVGB(%%mm0, %%mm2) // L4+L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1841 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1842 "movq %%mm2, (%0, %1, 4) \n\t" |
| 787 | 1843 "movq (%%edx, %1, 2), %%mm2 \n\t" // L7 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1844 PAVGB(%%mm2, %%mm1) // L5+L7 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1845 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
| 787 | 1846 "movq %%mm1, (%%edx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1847 "movq (%0, %1, 8), %%mm1 \n\t" // L8 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1848 PAVGB(%%mm1, %%mm0) // L6+L8 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1849 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
| 787 | 1850 "movq %%mm0, (%%edx, %1) \n\t" |
| 1851 "movq (%%edx, %1, 4), %%mm0 \n\t" // L9 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1852 PAVGB(%%mm0, %%mm2) // L7+L9 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1853 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
| 787 | 1854 "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1855 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1856 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1857 : : "r" (src), "r" (stride) |
| 787 | 1858 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1859 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1860 #else |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1861 int a, b, c, x; |
| 142 | 1862 src+= 4*stride; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1863 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1864 for(x=0; x<2; x++){ |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1865 a= *(uint32_t*)&src[stride*0]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1866 b= *(uint32_t*)&src[stride*1]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1867 c= *(uint32_t*)&src[stride*2]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1868 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1869 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1870 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1871 a= *(uint32_t*)&src[stride*3]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1872 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1873 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1874 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1875 b= *(uint32_t*)&src[stride*4]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1876 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1877 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1878 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1879 c= *(uint32_t*)&src[stride*5]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1880 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1881 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1882 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1883 a= *(uint32_t*)&src[stride*6]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1884 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1885 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1886 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1887 b= *(uint32_t*)&src[stride*7]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1888 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1889 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1890 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1891 c= *(uint32_t*)&src[stride*8]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1892 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1893 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1894 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1895 a= *(uint32_t*)&src[stride*9]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1896 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1897 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1898 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1899 src += 4; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1900 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1901 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1902 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1903 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1904 /** |
| 1109 | 1905 * Deinterlaces the given block by applying a median filter to every second line. |
| 142 | 1906 * will be called for every 8x8 block and can read & write from line 4-15, |
| 1907 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 1908 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1909 */ |
| 169 | 1910 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1911 { |
| 107 | 1912 #ifdef HAVE_MMX |
| 142 | 1913 src+= 4*stride; |
| 107 | 1914 #ifdef HAVE_MMX2 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1915 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1916 "leal (%0, %1), %%eax \n\t" |
| 787 | 1917 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1918 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1919 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1920 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1921 "movq (%0), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1922 "movq (%%eax, %1), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1923 "movq (%%eax), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1924 "movq %%mm0, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1925 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1926 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1927 "pmaxub %%mm2, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1928 "pminub %%mm1, %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1929 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1930 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1931 "movq (%0, %1, 4), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1932 "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1933 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1934 "pmaxub %%mm1, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1935 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1936 "pmaxub %%mm0, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1937 "pminub %%mm1, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1938 "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1939 |
| 787 | 1940 "movq (%%edx), %%mm2 \n\t" // |
| 1941 "movq (%%edx, %1), %%mm1 \n\t" // | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1942 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1943 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1944 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1945 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1946 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1947 "movq %%mm2, (%%edx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1948 |
| 787 | 1949 "movq (%%edx, %1, 2), %%mm2 \n\t" // |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1950 "movq (%0, %1, 8), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1951 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1952 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1953 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1954 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1955 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1956 "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1957 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1958 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1959 : : "r" (src), "r" (stride) |
| 787 | 1960 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1961 ); |
| 107 | 1962 |
| 1963 #else // MMX without MMX2 | |
| 1964 asm volatile( | |
| 1965 "leal (%0, %1), %%eax \n\t" | |
| 787 | 1966 "leal (%%eax, %1, 4), %%edx \n\t" |
| 107 | 1967 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1968 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
| 107 | 1969 "pxor %%mm7, %%mm7 \n\t" |
| 1970 | |
| 1971 #define MEDIAN(a,b,c)\ | |
| 1972 "movq " #a ", %%mm0 \n\t"\ | |
| 1973 "movq " #b ", %%mm2 \n\t"\ | |
| 1974 "movq " #c ", %%mm1 \n\t"\ | |
| 1975 "movq %%mm0, %%mm3 \n\t"\ | |
| 1976 "movq %%mm1, %%mm4 \n\t"\ | |
| 1977 "movq %%mm2, %%mm5 \n\t"\ | |
| 1978 "psubusb %%mm1, %%mm3 \n\t"\ | |
| 1979 "psubusb %%mm2, %%mm4 \n\t"\ | |
| 1980 "psubusb %%mm0, %%mm5 \n\t"\ | |
| 1981 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
| 1982 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
| 1983 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
| 1984 "movq %%mm3, %%mm6 \n\t"\ | |
| 1985 "pxor %%mm4, %%mm3 \n\t"\ | |
| 1986 "pxor %%mm5, %%mm4 \n\t"\ | |
| 1987 "pxor %%mm6, %%mm5 \n\t"\ | |
| 1988 "por %%mm3, %%mm1 \n\t"\ | |
| 1989 "por %%mm4, %%mm2 \n\t"\ | |
| 1990 "por %%mm5, %%mm0 \n\t"\ | |
| 1991 "pand %%mm2, %%mm0 \n\t"\ | |
| 1992 "pand %%mm1, %%mm0 \n\t"\ | |
| 1993 "movq %%mm0, " #b " \n\t" | |
| 1994 | |
| 1995 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
| 1996 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
| 787 | 1997 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
| 1998 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) | |
| 107 | 1999 |
| 2000 : : "r" (src), "r" (stride) | |
| 787 | 2001 : "%eax", "%edx" |
| 107 | 2002 ); |
| 2003 #endif // MMX | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2004 #else |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2005 int x, y; |
| 142 | 2006 src+= 4*stride; |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2007 // FIXME - there should be a way to do a few columns in parallel like w/mmx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2008 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2009 { |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2010 uint8_t *colsrc = src; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2011 for (y=0; y<4; y++) |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2012 { |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2013 int a, b, c, d, e, f; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2014 a = colsrc[0 ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2015 b = colsrc[stride ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2016 c = colsrc[stride*2]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2017 d = (a-b)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2018 e = (b-c)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2019 f = (c-a)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2020 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2021 colsrc += stride*2; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2022 } |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2023 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2024 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2025 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2026 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2027 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2028 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2029 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2030 * transposes and shift the given 8x8 Block into dst1 and dst2 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2031 */ |
| 169 | 2032 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2033 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2034 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2035 "leal (%0, %1), %%eax \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2036 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2037 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2038 "movq (%0), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2039 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2040 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2041 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2042 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2043 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2044 "movq (%%eax, %1), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2045 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2046 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2047 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2048 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2049 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2050 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2051 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2052 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2053 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2054 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2055 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2056 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2057 "movd %%mm0, 128(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2058 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2059 "movd %%mm0, 144(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2060 "movd %%mm3, 160(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2061 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2062 "movd %%mm3, 176(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2063 "movd %%mm3, 48(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2064 "movd %%mm2, 192(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2065 "movd %%mm2, 64(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2066 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2067 "movd %%mm2, 80(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2068 "movd %%mm1, 96(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2069 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2070 "movd %%mm1, 112(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2071 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2072 "leal (%%eax, %1, 4), %%eax \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2073 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2074 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2075 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2076 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2077 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2078 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2079 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2080 "movq (%%eax, %1), %%mm1 \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2081 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2082 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2083 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2084 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2085 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2086 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2087 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2088 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2089 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2090 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2091 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2092 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2093 "movd %%mm0, 132(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2094 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2095 "movd %%mm0, 148(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2096 "movd %%mm3, 164(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2097 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2098 "movd %%mm3, 180(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2099 "movd %%mm3, 52(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2100 "movd %%mm2, 196(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2101 "movd %%mm2, 68(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2102 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2103 "movd %%mm2, 84(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2104 "movd %%mm1, 100(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2105 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2106 "movd %%mm1, 116(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2107 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2108 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2109 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2110 : "%eax" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2111 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2112 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2113 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2114 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2115 * transposes the given 8x8 block |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2116 */ |
| 169 | 2117 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2118 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2119 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2120 "leal (%0, %1), %%eax \n\t" |
| 787 | 2121 "leal (%%eax, %1, 4), %%edx \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2122 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2123 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2124 "movq (%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2125 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2126 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2127 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2128 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2129 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2130 "movq 32(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2131 "movq 48(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2132 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2133 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2134 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2135 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2136 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2137 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2138 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2139 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2140 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2141 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2142 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2143 "movd %%mm0, (%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2144 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2145 "movd %%mm0, (%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2146 "movd %%mm3, (%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2147 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2148 "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2149 "movd %%mm2, (%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2150 "psrlq $32, %%mm2 \n\t" |
| 787 | 2151 "movd %%mm2, (%%edx) \n\t" |
| 2152 "movd %%mm1, (%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2153 "psrlq $32, %%mm1 \n\t" |
| 787 | 2154 "movd %%mm1, (%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2155 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2156 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2157 "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2158 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2159 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2160 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2161 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2162 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2163 "movq 96(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2164 "movq 112(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2165 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2166 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2167 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2168 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2169 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2170 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2171 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2172 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2173 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2174 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2175 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2176 "movd %%mm0, 4(%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2177 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2178 "movd %%mm0, 4(%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2179 "movd %%mm3, 4(%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2180 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2181 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2182 "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2183 "psrlq $32, %%mm2 \n\t" |
| 787 | 2184 "movd %%mm2, 4(%%edx) \n\t" |
| 2185 "movd %%mm1, 4(%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2186 "psrlq $32, %%mm1 \n\t" |
| 787 | 2187 "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2188 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2189 :: "r" (dst), "r" (dstStride), "r" (src) |
| 787 | 2190 : "%eax", "%edx" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2191 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2192 } |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2193 #endif |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2194 //static int test=0; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2195 |
| 943 | 2196 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 158 | 2197 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
| 156 | 2198 { |
| 787 | 2199 // to save a register (FIXME do this outside of the loops) |
| 2200 tempBluredPast[127]= maxNoise[0]; | |
| 2201 tempBluredPast[128]= maxNoise[1]; | |
| 2202 tempBluredPast[129]= maxNoise[2]; | |
| 2203 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2204 #define FAST_L2_DIFF |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2205 //#define L1_DIFF //u should change the thresholds too if u try that one |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2206 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2207 asm volatile( |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2208 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
| 787 | 2209 "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
| 2210 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2211 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2212 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2213 //FIXME reorder? |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2214 #ifdef L1_DIFF //needs mmx2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2215 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2216 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2217 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2218 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2219 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2220 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2221 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2222 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2223 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2224 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2225 "paddw %%mm1, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2226 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
| 787 | 2227 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2228 "paddw %%mm2, %%mm0 \n\t" |
| 787 | 2229 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2230 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2231 "paddw %%mm3, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2232 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2233 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2234 "paddw %%mm4, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2235 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2236 "paddw %%mm5, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2237 "paddw %%mm7, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2238 "paddw %%mm6, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2239 #elif defined (FAST_L2_DIFF) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2240 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 210 | 2241 "movq "MANGLE(b80)", %%mm6 \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2242 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2243 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2244 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2245 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 "pxor %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 PAVGB(%%mm2, %%mm5)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 "paddb %%mm6, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2249 "movq %%mm5, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2250 "psllw $8, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2251 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2252 "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2253 "paddd %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2254 "psrld $14, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2257 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2258 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2259 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2260 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2261 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2262 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2263 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2264 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2265 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2266 #else |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2267 "pxor %%mm7, %%mm7 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2268 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2269 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2270 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2271 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2272 "movq %%mm5, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2273 "movq %%mm2, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2274 "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2275 "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2276 "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2277 "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2278 "psubw %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2279 "psubw %%mm3, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2280 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2281 "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2282 "paddd %%mm1, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2283 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2284 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2285 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2286 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2287 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2288 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2289 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2290 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2291 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2292 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2293 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2294 #endif |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2295 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2296 "movq %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2297 "psrlq $32, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2298 "paddd %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 "movd %%mm4, %%ecx \n\t" |
| 158 | 2300 "shll $2, %%ecx \n\t" |
| 787 | 2301 "movl %3, %%edx \n\t" |
| 2302 "addl -4(%%edx), %%ecx \n\t" | |
| 2303 "addl 4(%%edx), %%ecx \n\t" | |
| 2304 "addl -1024(%%edx), %%ecx \n\t" | |
| 158 | 2305 "addl $4, %%ecx \n\t" |
| 787 | 2306 "addl 1024(%%edx), %%ecx \n\t" |
| 158 | 2307 "shrl $3, %%ecx \n\t" |
| 787 | 2308 "movl %%ecx, (%%edx) \n\t" |
| 158 | 2309 |
| 210 | 2310 // "movl %3, %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2311 // "movl %%ecx, test \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2312 // "jmp 4f \n\t" |
| 787 | 2313 "cmpl 512(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2314 " jb 2f \n\t" |
| 787 | 2315 "cmpl 516(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2316 " jb 1f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2317 |
| 787 | 2318 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2319 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2320 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2321 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2322 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2323 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2324 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
| 787 | 2325 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2326 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2327 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2328 "movq %%mm0, (%1) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2329 "movq %%mm1, (%1, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2330 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2332 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
| 787 | 2333 "movq %%mm5, (%1, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2334 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2335 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2336 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2337 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2338 "1: \n\t" |
| 787 | 2339 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2340 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2341 "movq (%0), %%mm0 \n\t" // L0 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2342 PAVGB((%1), %%mm0) // L0 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2343 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2344 PAVGB((%1, %2), %%mm1) // L1 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2345 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2346 PAVGB((%1, %2, 2), %%mm2) // L2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2347 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2348 PAVGB((%1, %%eax), %%mm3) // L3 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2349 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2350 PAVGB((%1, %2, 4), %%mm4) // L4 |
| 787 | 2351 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
| 2352 PAVGB((%1, %%edx), %%mm5) // L5 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2353 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2354 PAVGB((%1, %%eax, 2), %%mm6) // L6 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2355 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2356 PAVGB((%1, %%ecx), %%mm7) // L7 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2357 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2358 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2359 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2360 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2361 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
| 787 | 2362 "movq %%mm5, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2363 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2364 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2365 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2366 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2367 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2368 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2369 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
| 787 | 2370 "movq %%mm5, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2371 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2372 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2373 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2374 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2375 "2: \n\t" |
| 787 | 2376 "cmpl 508(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2377 " jb 3f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2378 |
| 787 | 2379 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2380 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2381 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2382 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2383 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2384 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2385 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2386 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2387 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2388 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2389 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2390 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2391 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2392 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2393 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2394 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2395 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2396 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2397 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2398 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2399 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2400 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2401 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2402 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2403 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2404 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2405 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2406 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2407 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2408 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2409 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2410 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2411 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2412 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2413 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2414 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2415 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2416 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2417 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2418 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2419 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2420 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2421 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2422 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2423 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2424 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2425 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2426 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2427 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2428 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2429 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2430 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2431 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2432 "3: \n\t" |
| 787 | 2433 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2434 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2435 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2436 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2437 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2438 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2439 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2440 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2441 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2442 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2443 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2444 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2445 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2446 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2447 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2448 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2449 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2450 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2451 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2452 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2453 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2454 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2455 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2456 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2457 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2458 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2459 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2460 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2461 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2462 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2463 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2464 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2465 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2466 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2467 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2468 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2469 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2470 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2471 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2472 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2473 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2474 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2475 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2476 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2477 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2478 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2479 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2480 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2481 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2482 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2483 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2484 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2485 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2486 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2487 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2488 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2489 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2490 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2491 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2492 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2493 "4: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2494 |
| 158 | 2495 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
| 787 | 2496 : "%eax", "%edx", "%ecx", "memory" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2497 ); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2498 //printf("%d\n", test); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2499 #else |
| 788 | 2500 { |
| 156 | 2501 int y; |
| 2502 int d=0; | |
| 2503 int sysd=0; | |
| 158 | 2504 int i; |
| 156 | 2505 |
| 2506 for(y=0; y<8; y++) | |
| 2507 { | |
| 2508 int x; | |
| 2509 for(x=0; x<8; x++) | |
| 2510 { | |
| 2511 int ref= tempBlured[ x + y*stride ]; | |
| 2512 int cur= src[ x + y*stride ]; | |
| 2513 int d1=ref - cur; | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2514 // if(x==0 || x==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2515 // if(y==0 || y==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2516 // d+= ABS(d1); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2517 d+= d1*d1; |
| 156 | 2518 sysd+= d1; |
| 2519 } | |
| 2520 } | |
| 158 | 2521 i=d; |
| 2522 d= ( | |
| 2523 4*d | |
| 2524 +(*(tempBluredPast-256)) | |
| 2525 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 2526 +(*(tempBluredPast+256)) | |
| 2527 +4)>>3; | |
| 2528 *tempBluredPast=i; | |
| 2529 // ((*tempBluredPast)*3 + d + 2)>>2; | |
| 2530 | |
| 156 | 2531 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
| 2532 /* | |
| 2533 Switch between | |
| 2534 1 0 0 0 0 0 0 (0) | |
| 2535 64 32 16 8 4 2 1 (1) | |
| 2536 64 48 36 27 20 15 11 (33) (approx) | |
| 2537 64 56 49 43 37 33 29 (200) (approx) | |
| 2538 */ | |
| 2539 if(d > maxNoise[1]) | |
| 2540 { | |
| 2541 if(d < maxNoise[2]) | |
| 2542 { | |
| 2543 for(y=0; y<8; y++) | |
| 2544 { | |
| 2545 int x; | |
| 2546 for(x=0; x<8; x++) | |
| 2547 { | |
| 2548 int ref= tempBlured[ x + y*stride ]; | |
| 2549 int cur= src[ x + y*stride ]; | |
| 2550 tempBlured[ x + y*stride ]= | |
| 2551 src[ x + y*stride ]= | |
| 2552 (ref + cur + 1)>>1; | |
| 2553 } | |
| 2554 } | |
| 2555 } | |
| 2556 else | |
| 2557 { | |
| 2558 for(y=0; y<8; y++) | |
| 2559 { | |
| 2560 int x; | |
| 2561 for(x=0; x<8; x++) | |
| 2562 { | |
| 2563 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
| 2564 } | |
| 2565 } | |
| 2566 } | |
| 2567 } | |
| 2568 else | |
| 2569 { | |
| 2570 if(d < maxNoise[0]) | |
| 2571 { | |
| 2572 for(y=0; y<8; y++) | |
| 2573 { | |
| 2574 int x; | |
| 2575 for(x=0; x<8; x++) | |
| 2576 { | |
| 2577 int ref= tempBlured[ x + y*stride ]; | |
| 2578 int cur= src[ x + y*stride ]; | |
| 2579 tempBlured[ x + y*stride ]= | |
| 2580 src[ x + y*stride ]= | |
| 2581 (ref*7 + cur + 4)>>3; | |
| 2582 } | |
| 2583 } | |
| 2584 } | |
| 2585 else | |
| 2586 { | |
| 2587 for(y=0; y<8; y++) | |
| 2588 { | |
| 2589 int x; | |
| 2590 for(x=0; x<8; x++) | |
| 2591 { | |
| 2592 int ref= tempBlured[ x + y*stride ]; | |
| 2593 int cur= src[ x + y*stride ]; | |
| 2594 tempBlured[ x + y*stride ]= | |
| 2595 src[ x + y*stride ]= | |
| 2596 (ref*3 + cur + 2)>>2; | |
| 2597 } | |
| 2598 } | |
| 2599 } | |
| 2600 } | |
| 788 | 2601 } |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2602 #endif |
| 156 | 2603 } |
| 2604 | |
| 169 | 2605 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 2606 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
| 96 | 2607 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2608 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2609 * Copies a block from src to dst and fixes the blacklevel |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2610 * levelFix == 0 -> dont touch the brighness & contrast |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2611 */ |
|
634
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
2612 #undef SCALED_CPY |
|
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
2613 |
| 169 | 2614 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 787 | 2615 int levelFix, int64_t *packedOffsetAndScale) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2616 { |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2617 #ifndef HAVE_MMX |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2618 int i; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2619 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2620 if(levelFix) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2621 { |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2622 #ifdef HAVE_MMX |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2623 asm volatile( |
| 787 | 2624 "movq (%%eax), %%mm2 \n\t" // packedYOffset |
| 2625 "movq 8(%%eax), %%mm3 \n\t" // packedYScale | |
| 2626 "leal (%2,%4), %%eax \n\t" | |
| 2627 "leal (%3,%5), %%edx \n\t" | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2628 "pxor %%mm4, %%mm4 \n\t" |
| 173 | 2629 #ifdef HAVE_MMX2 |
| 2630 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
| 2631 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2632 "movq " #src1 ", %%mm5 \n\t"\ | |
| 2633 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2634 "movq " #src2 ", %%mm6 \n\t"\ | |
| 2635 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
| 2636 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
| 2637 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
| 2638 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
| 2639 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
| 2640 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 2641 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
| 2642 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
| 2643 "psubw %%mm2, %%mm0 \n\t"\ | |
| 2644 "psubw %%mm2, %%mm5 \n\t"\ | |
| 2645 "psubw %%mm2, %%mm1 \n\t"\ | |
| 2646 "psubw %%mm2, %%mm6 \n\t"\ | |
| 2647 "packuswb %%mm5, %%mm0 \n\t"\ | |
| 2648 "packuswb %%mm6, %%mm1 \n\t"\ | |
| 2649 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2650 "movq %%mm1, " #dst2 " \n\t"\ | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2651 |
| 173 | 2652 #else //HAVE_MMX2 |
| 166 | 2653 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
| 2654 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2655 "movq " #src1 ", %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2656 "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2657 "punpckhbw %%mm4, %%mm5 \n\t"\ |
| 117 | 2658 "psubw %%mm2, %%mm0 \n\t"\ |
| 2659 "psubw %%mm2, %%mm5 \n\t"\ | |
| 166 | 2660 "movq " #src2 ", %%mm1 \n\t"\ |
| 117 | 2661 "psllw $6, %%mm0 \n\t"\ |
| 2662 "psllw $6, %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2663 "pmulhw %%mm3, %%mm0 \n\t"\ |
| 166 | 2664 "movq " #src2 ", %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2665 "pmulhw %%mm3, %%mm5 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2666 "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2667 "punpckhbw %%mm4, %%mm6 \n\t"\ |
| 117 | 2668 "psubw %%mm2, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2669 "psubw %%mm2, %%mm6 \n\t"\ |
| 117 | 2670 "psllw $6, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2671 "psllw $6, %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2672 "pmulhw %%mm3, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2673 "pmulhw %%mm3, %%mm6 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2674 "packuswb %%mm5, %%mm0 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2675 "packuswb %%mm6, %%mm1 \n\t"\ |
| 166 | 2676 "movq %%mm0, " #dst1 " \n\t"\ |
| 2677 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2678 | |
| 173 | 2679 #endif //!HAVE_MMX2 |
| 2680 | |
| 787 | 2681 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
| 2682 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) | |
| 2683 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) | |
| 2684 "leal (%%eax,%4,4), %%eax \n\t" | |
| 2685 "leal (%%edx,%5,4), %%edx \n\t" | |
| 2686 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) | |
| 166 | 2687 |
| 2688 | |
| 787 | 2689 : "=&a" (packedOffsetAndScale) |
| 2690 : "0" (packedOffsetAndScale), | |
| 2691 "r"(src), | |
| 166 | 2692 "r"(dst), |
| 2693 "r" (srcStride), | |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2694 "r" (dstStride) |
| 787 | 2695 : "%edx" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2696 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2697 #else |
| 164 | 2698 for(i=0; i<8; i++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2699 memcpy( &(dst[dstStride*i]), |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2700 &(src[srcStride*i]), BLOCK_SIZE); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2701 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2702 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2703 else |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2704 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2705 #ifdef HAVE_MMX |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2706 asm volatile( |
| 166 | 2707 "leal (%0,%2), %%eax \n\t" |
| 787 | 2708 "leal (%1,%3), %%edx \n\t" |
| 166 | 2709 |
| 2710 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
| 2711 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2712 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2713 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2714 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2715 | |
| 2716 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 787 | 2717 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
| 2718 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) | |
| 166 | 2719 "leal (%%eax,%2,4), %%eax \n\t" |
| 787 | 2720 "leal (%%edx,%3,4), %%edx \n\t" |
| 2721 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) | |
| 166 | 2722 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2723 : : "r" (src), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2724 "r" (dst), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2725 "r" (srcStride), |
| 164 | 2726 "r" (dstStride) |
| 787 | 2727 : "%eax", "%edx" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2728 ); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2729 #else |
| 164 | 2730 for(i=0; i<8; i++) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2731 memcpy( &(dst[dstStride*i]), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2732 &(src[srcStride*i]), BLOCK_SIZE); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2733 #endif |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2734 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2735 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2736 |
| 224 | 2737 /** |
| 2738 * Duplicates the given 8 src pixels ? times upward | |
| 2739 */ | |
| 2740 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
| 2741 { | |
| 2742 #ifdef HAVE_MMX | |
| 2743 asm volatile( | |
| 2744 "movq (%0), %%mm0 \n\t" | |
| 2745 "addl %1, %0 \n\t" | |
| 2746 "movq %%mm0, (%0) \n\t" | |
| 2747 "movq %%mm0, (%0, %1) \n\t" | |
| 2748 "movq %%mm0, (%0, %1, 2) \n\t" | |
| 2749 : "+r" (src) | |
| 2750 : "r" (-stride) | |
| 2751 ); | |
| 2752 #else | |
| 2753 int i; | |
| 2754 uint8_t *p=src; | |
| 2755 for(i=0; i<3; i++) | |
| 2756 { | |
| 2757 p-= stride; | |
| 2758 memcpy(p, src, 8); | |
| 2759 } | |
| 2760 #endif | |
| 2761 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2762 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2763 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2764 * Filters array of bytes (Y or U or V values) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2765 */ |
| 169 | 2766 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 2767 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2768 { |
| 787 | 2769 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2770 int x,y; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2771 #ifdef COMPILE_TIME_MODE |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2772 const int mode= COMPILE_TIME_MODE; |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2773 #else |
| 787 | 2774 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2775 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2776 int black=0, white=255; // blackest black and whitest white in the picture |
| 223 | 2777 int QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2778 |
|
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2779 int copyAhead; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2780 #ifdef HAVE_MMX |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2781 int i; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2782 #endif |
| 164 | 2783 |
| 957 | 2784 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
| 2785 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | |
| 2786 | |
| 787 | 2787 //FIXME remove |
| 2788 uint64_t * const yHistogram= c.yHistogram; | |
| 2789 uint8_t * const tempSrc= c.tempSrc; | |
| 2790 uint8_t * const tempDst= c.tempDst; | |
| 791 | 2791 const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2792 |
| 158 | 2793 #ifdef HAVE_MMX |
| 791 | 2794 for(i=0; i<32; i++){ |
| 2795 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; | |
| 2796 int threshold= offset*2 + 1; | |
| 2797 c.mmxDcOffset[i]= 0x7F - offset; | |
| 2798 c.mmxDcThreshold[i]= 0x7F - threshold; | |
| 2799 c.mmxDcOffset[i]*= 0x0101010101010101LL; | |
| 2800 c.mmxDcThreshold[i]*= 0x0101010101010101LL; | |
| 2801 } | |
| 158 | 2802 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2803 |
| 164 | 2804 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
| 787 | 2805 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
| 1157 | 2806 || (mode & FFMPEG_DEINT_FILTER) |
| 2807 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; | |
| 164 | 2808 else if( (mode & V_DEBLOCK) |
| 2809 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
| 2810 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
| 2811 else if(mode & V_X1_FILTER) copyAhead=11; | |
| 787 | 2812 // else if(mode & V_RK1_FILTER) copyAhead=10; |
| 164 | 2813 else if(mode & DERING) copyAhead=9; |
| 2814 else copyAhead=8; | |
| 2815 | |
| 2816 copyAhead-= 8; | |
| 2817 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2818 if(!isColor) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2819 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2820 uint64_t sum= 0; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2821 int i; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2822 uint64_t maxClipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2823 uint64_t clipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2824 double scale; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2825 |
| 787 | 2826 c.frameNum++; |
| 2827 // first frame is fscked so we ignore it | |
| 2828 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2829 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2830 for(i=0; i<256; i++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2831 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2832 sum+= yHistogram[i]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2833 // printf("%d ", yHistogram[i]); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2834 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2835 // printf("\n\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2836 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2837 /* we always get a completely black picture first */ |
| 793 | 2838 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2839 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2840 clipped= sum; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2841 for(black=255; black>0; black--) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2842 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2843 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2844 clipped-= yHistogram[black]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2845 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2846 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2847 clipped= sum; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2848 for(white=0; white<256; white++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2849 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2850 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2851 clipped-= yHistogram[white]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2852 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2853 |
| 787 | 2854 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
| 173 | 2855 |
| 2856 #ifdef HAVE_MMX2 | |
| 787 | 2857 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
| 2858 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 2859 #else |
| 787 | 2860 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
| 2861 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 2862 #endif |
| 2863 | |
| 787 | 2864 c.packedYOffset|= c.packedYOffset<<32; |
| 2865 c.packedYOffset|= c.packedYOffset<<16; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2866 |
| 787 | 2867 c.packedYScale|= c.packedYScale<<32; |
| 2868 c.packedYScale|= c.packedYScale<<16; | |
| 223 | 2869 |
| 2870 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
| 2871 else QPCorrecture= 256*256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2872 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2873 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2874 { |
| 787 | 2875 c.packedYScale= 0x0100010001000100LL; |
| 2876 c.packedYOffset= 0; | |
| 223 | 2877 QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2878 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2879 |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
2880 /* copy & deinterlace first row of blocks */ |
| 142 | 2881 y=-BLOCK_SIZE; |
| 2882 { | |
| 2883 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 224 | 2884 uint8_t *dstBlock= tempDst + dstStride; |
| 142 | 2885 |
| 2886 // From this point on it is guaranteed that we can read and write 16 lines downward | |
| 2887 // finish 1 block before the next, otherwise we might have a problem | |
| 2888 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 2889 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 2890 { | |
| 2891 | |
| 2892 #ifdef HAVE_MMX2 | |
| 2893 /* | |
| 2894 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 2895 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 2896 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 2897 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 2898 */ | |
| 2899 | |
| 2900 asm( | |
| 2901 "movl %4, %%eax \n\t" | |
| 2902 "shrl $2, %%eax \n\t" | |
| 2903 "andl $6, %%eax \n\t" | |
| 164 | 2904 "addl %5, %%eax \n\t" |
| 787 | 2905 "movl %%eax, %%edx \n\t" |
| 142 | 2906 "imul %1, %%eax \n\t" |
| 787 | 2907 "imul %3, %%edx \n\t" |
| 142 | 2908 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 2909 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 2910 "addl %1, %%eax \n\t" |
| 787 | 2911 "addl %3, %%edx \n\t" |
| 142 | 2912 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 2913 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 2914 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 2915 "m" (x), "m" (copyAhead) |
| 787 | 2916 : "%eax", "%edx" |
| 142 | 2917 ); |
| 2918 | |
| 2919 #elif defined(HAVE_3DNOW) | |
| 2920 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 2921 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 2922 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 2923 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 2924 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 2925 */ | |
| 2926 #endif | |
| 2927 | |
| 224 | 2928 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
| 787 | 2929 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
| 224 | 2930 |
| 2931 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
| 142 | 2932 |
| 2933 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 169 | 2934 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 142 | 2935 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 169 | 2936 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
| 142 | 2937 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 2938 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 142 | 2939 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 2940 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 2941 else if(mode & FFMPEG_DEINT_FILTER) |
| 2942 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 2943 else if(mode & LOWPASS5_DEINT_FILTER) |
| 2944 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 142 | 2945 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 2946 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
| 142 | 2947 */ |
| 2948 dstBlock+=8; | |
| 2949 srcBlock+=8; | |
| 2950 } | |
| 941 | 2951 if(width==dstStride) |
| 2952 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); | |
| 2953 else | |
| 2954 { | |
| 943 | 2955 int i; |
| 941 | 2956 for(i=0; i<copyAhead; i++) |
| 2957 { | |
| 2958 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
| 2959 } | |
| 2960 } | |
| 142 | 2961 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2962 |
| 787 | 2963 //printf("\n"); |
| 111 | 2964 for(y=0; y<height; y+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2965 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2966 //1% speedup if these are here instead of the inner loop |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2967 uint8_t *srcBlock= &(src[y*srcStride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2968 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 169 | 2969 #ifdef HAVE_MMX |
| 787 | 2970 uint8_t *tempBlock1= c.tempBlocks; |
| 2971 uint8_t *tempBlock2= c.tempBlocks + 8; | |
| 169 | 2972 #endif |
| 957 | 2973 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
| 1196 | 2974 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*QPStride]; |
| 156 | 2975 int QP=0; |
| 130 | 2976 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 2977 if not, then use a temporary buffer */ | |
| 111 | 2978 if(y+15 >= height) |
| 2979 { | |
| 156 | 2980 int i; |
| 164 | 2981 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
| 111 | 2982 blockcopy to dst later */ |
| 164 | 2983 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
| 2984 srcStride*MAX(height-y-copyAhead, 0) ); | |
| 2985 | |
| 2986 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | |
| 2987 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
| 156 | 2988 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
| 2989 | |
| 164 | 2990 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
| 2991 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
| 2992 | |
| 2993 /* duplicate last line of dst to fill the void up to line (copyAhead) */ | |
| 2994 for(i=height-y+1; i<=copyAhead; i++) | |
| 156 | 2995 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
| 2996 | |
| 130 | 2997 dstBlock= tempDst + dstStride; |
| 111 | 2998 srcBlock= tempSrc; |
| 2999 } | |
| 787 | 3000 //printf("\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3001 |
| 112 | 3002 // From this point on it is guaranteed that we can read and write 16 lines downward |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3003 // finish 1 block before the next, otherwise we might have a problem |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3004 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3005 for(x=0; x<width; x+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3006 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3007 const int stride= dstStride; |
| 169 | 3008 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3009 uint8_t *tmpXchg; |
| 169 | 3010 #endif |
| 791 | 3011 if(isColor) |
| 121 | 3012 { |
| 957 | 3013 QP= QPptr[x>>qpHShift]; |
| 3014 c.nonBQP= nonBQPptr[x>>qpHShift]; | |
| 791 | 3015 } |
| 3016 else | |
| 3017 { | |
| 3018 QP= QPptr[x>>4]; | |
| 223 | 3019 QP= (QP* QPCorrecture + 256*128)>>16; |
| 791 | 3020 c.nonBQP= nonBQPptr[x>>4]; |
| 3021 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3022 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
| 121 | 3023 } |
| 787 | 3024 c.QP= QP; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3025 #ifdef HAVE_MMX |
| 111 | 3026 asm volatile( |
| 787 | 3027 "movd %1, %%mm7 \n\t" |
| 111 | 3028 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 3029 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 3030 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 787 | 3031 "movq %%mm7, %0 \n\t" |
| 3032 : "=m" (c.pQPb) | |
| 3033 : "r" (QP) | |
| 111 | 3034 ); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3035 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3036 |
| 96 | 3037 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3038 #ifdef HAVE_MMX2 |
| 126 | 3039 /* |
| 3040 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3041 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3042 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3043 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3044 */ | |
| 3045 | |
| 3046 asm( | |
| 3047 "movl %4, %%eax \n\t" | |
| 3048 "shrl $2, %%eax \n\t" | |
| 3049 "andl $6, %%eax \n\t" | |
| 164 | 3050 "addl %5, %%eax \n\t" |
| 787 | 3051 "movl %%eax, %%edx \n\t" |
| 126 | 3052 "imul %1, %%eax \n\t" |
| 787 | 3053 "imul %3, %%edx \n\t" |
| 126 | 3054 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3055 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3056 "addl %1, %%eax \n\t" |
| 787 | 3057 "addl %3, %%edx \n\t" |
| 126 | 3058 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3059 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3060 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 3061 "m" (x), "m" (copyAhead) |
| 787 | 3062 : "%eax", "%edx" |
| 126 | 3063 ); |
| 3064 | |
| 96 | 3065 #elif defined(HAVE_3DNOW) |
| 3066 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 111 | 3067 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 3068 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3069 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3070 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 96 | 3071 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3072 #endif |
| 111 | 3073 |
| 169 | 3074 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
| 787 | 3075 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3076 |
| 111 | 3077 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 169 | 3078 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 111 | 3079 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 169 | 3080 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
| 111 | 3081 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 3082 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 111 | 3083 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 3084 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 3085 else if(mode & FFMPEG_DEINT_FILTER) |
| 3086 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 3087 else if(mode & LOWPASS5_DEINT_FILTER) |
| 3088 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 111 | 3089 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 3090 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
3091 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3092 |
| 111 | 3093 /* only deblock if we have 2 blocks */ |
| 3094 if(y + 8 < height) | |
| 3095 { | |
| 787 | 3096 if(mode & V_X1_FILTER) |
| 3097 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3098 else if(mode & V_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3099 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3100 const int t= RENAME(vertClassify)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3101 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3102 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3103 RENAME(doVertLowPass)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3104 else if(t==2) |
| 787 | 3105 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3106 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3107 } |
| 130 | 3108 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3109 #ifdef HAVE_MMX |
| 169 | 3110 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3111 #endif |
| 111 | 3112 /* check if we have a previous block to deblock it with dstBlock */ |
| 112 | 3113 if(x - 8 >= 0) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3114 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3115 #ifdef HAVE_MMX |
| 787 | 3116 if(mode & H_X1_FILTER) |
| 3117 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3118 else if(mode & H_DEBLOCK) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3119 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3120 //START_TIMER |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3121 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3122 //STOP_TIMER("dc & minmax") |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3123 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3124 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3125 else if(t==2) |
| 787 | 3126 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3127 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3128 |
| 169 | 3129 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3130 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3131 #else |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3132 if(mode & H_X1_FILTER) |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3133 horizX1Filter(dstBlock-4, stride, QP); |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3134 else if(mode & H_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3135 { |
| 787 | 3136 if( isHorizDC(dstBlock-4, stride, &c)) |
| 96 | 3137 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3138 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3139 doHorizLowPass(dstBlock-4, stride, QP); |
| 96 | 3140 } |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3141 else |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3142 doHorizDefFilter(dstBlock-4, stride, QP); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3143 } |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3144 #endif |
| 130 | 3145 if(mode & DERING) |
| 3146 { | |
| 3147 //FIXME filter first line | |
| 787 | 3148 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
| 130 | 3149 } |
| 156 | 3150 |
| 3151 if(mode & TEMP_NOISE_FILTER) | |
| 3152 { | |
| 169 | 3153 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
| 787 | 3154 c.tempBlured[isColor] + y*dstStride + x, |
| 3155 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3156 c.ppMode.maxTmpNoise); | |
| 156 | 3157 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3158 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3159 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3160 dstBlock+=8; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3161 srcBlock+=8; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3162 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3163 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3164 tmpXchg= tempBlock1; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3165 tempBlock1= tempBlock2; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3166 tempBlock2 = tmpXchg; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3167 #endif |
| 111 | 3168 } |
| 3169 | |
| 156 | 3170 if(mode & DERING) |
| 3171 { | |
| 787 | 3172 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
| 156 | 3173 } |
| 3174 | |
| 3175 if((mode & TEMP_NOISE_FILTER)) | |
| 3176 { | |
| 169 | 3177 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
| 787 | 3178 c.tempBlured[isColor] + y*dstStride + x, |
| 3179 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3180 c.ppMode.maxTmpNoise); | |
| 156 | 3181 } |
| 3182 | |
| 142 | 3183 /* did we use a tmp buffer for the last lines*/ |
| 112 | 3184 if(y+15 >= height) |
| 111 | 3185 { |
| 3186 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 941 | 3187 if(width==dstStride) |
| 3188 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y)); | |
| 3189 else | |
| 3190 { | |
| 944 | 3191 int i; |
| 941 | 3192 for(i=0; i<height-y; i++) |
| 3193 { | |
| 3194 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
| 3195 } | |
| 3196 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3197 } |
| 163 | 3198 /* |
| 3199 for(x=0; x<width; x+=32) | |
| 3200 { | |
| 164 | 3201 volatile int i; |
| 163 | 3202 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
| 3203 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
| 164 | 3204 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 3205 // + dstBlock[x +13*dstStride] | |
| 3206 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
| 3207 }*/ | |
| 3208 } | |
| 96 | 3209 #ifdef HAVE_3DNOW |
| 3210 asm volatile("femms"); | |
| 3211 #elif defined (HAVE_MMX) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3212 asm volatile("emms"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3213 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3214 |
| 163 | 3215 #ifdef DEBUG_BRIGHTNESS |
| 3216 if(!isColor) | |
| 3217 { | |
| 3218 int max=1; | |
| 3219 int i; | |
| 3220 for(i=0; i<256; i++) | |
| 3221 if(yHistogram[i] > max) max=yHistogram[i]; | |
| 3222 | |
| 3223 for(i=1; i<256; i++) | |
| 3224 { | |
| 3225 int x; | |
| 3226 int start=yHistogram[i-1]/(max/256+1); | |
| 3227 int end=yHistogram[i]/(max/256+1); | |
| 3228 int inc= end > start ? 1 : -1; | |
| 3229 for(x=start; x!=end+inc; x+=inc) | |
| 3230 dst[ i*dstStride + x]+=128; | |
| 3231 } | |
| 3232 | |
| 3233 for(i=0; i<100; i+=2) | |
| 3234 { | |
| 3235 dst[ (white)*dstStride + i]+=128; | |
| 3236 dst[ (black)*dstStride + i]+=128; | |
| 3237 } | |
| 3238 | |
| 3239 } | |
| 3240 #endif | |
| 3241 | |
| 787 | 3242 *c2= c; //copy local context back |
| 3243 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3244 } |
