Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 1757:3906ddbaffec libavcodec
optimization & bugfix extracted from the 4k line diff between ffmpeg 0.4.7 and http://www.alicestreet.com/ffh263.html
the other parts of the diff were
1. spelling fixes (rejected as only a small part of it could be applied automatically)
2. cosmetics (reindentation, function reordering, var renaming, ...) with bugs (rejected)
3. rtp related stuff (rejected as it breaks several codecs)
4. some changes to the intra/inter decision & scene change detection (quality tests needed first)
| author | michael |
|---|---|
| date | Sat, 24 Jan 2004 23:47:33 +0000 |
| parents | ea5200a9f730 |
| children | 4225c131a2eb |
| rev | line source |
|---|---|
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
| 223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
| 1109 | 19 /** |
| 20 * @file postprocess_template.c | |
| 21 * mmx/mmx2/3dnow postprocess code. | |
| 22 */ | |
| 23 | |
| 24 | |
| 169 | 25 #undef PAVGB |
| 26 #undef PMINUB | |
| 27 #undef PMAXUB | |
| 104 | 28 |
| 29 #ifdef HAVE_MMX2 | |
| 30 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
| 31 #elif defined (HAVE_3DNOW) | |
| 32 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
| 33 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
34 |
| 134 | 35 #ifdef HAVE_MMX2 |
| 36 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
| 37 #elif defined (HAVE_MMX) | |
| 38 #define PMINUB(b,a,t) \ | |
| 39 "movq " #a ", " #t " \n\t"\ | |
| 40 "psubusb " #b ", " #t " \n\t"\ | |
| 41 "psubb " #t ", " #a " \n\t" | |
| 42 #endif | |
| 43 | |
| 44 #ifdef HAVE_MMX2 | |
| 45 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
| 46 #elif defined (HAVE_MMX) | |
| 47 #define PMAXUB(a,b) \ | |
| 48 "psubusb " #a ", " #b " \n\t"\ | |
| 49 "paddb " #a ", " #b " \n\t" | |
| 50 #endif | |
| 51 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
52 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
| 787 | 53 #ifdef HAVE_MMX |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
54 /** |
| 111 | 55 * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
56 */ |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
57 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
58 int numEq= 0, dcOk; |
| 111 | 59 src+= stride*4; // src points to begin of the 8x8 Block |
| 119 | 60 asm volatile( |
| 1331 | 61 "movq %0, %%mm7 \n\t" |
| 62 "movq %1, %%mm6 \n\t" | |
| 63 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
| 64 ); | |
| 65 | |
| 66 asm volatile( | |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
67 "leal (%2, %3), %%eax \n\t" |
| 119 | 68 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 69 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 |
| 791 | 70 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
71 "movq (%2), %%mm0 \n\t" |
| 119 | 72 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
73 "movq %%mm0, %%mm3 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
74 "movq %%mm0, %%mm4 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
75 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
76 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
77 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 "paddb %%mm7, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
79 "pcmpgtb %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
80 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
81 "movq (%%eax,%3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
82 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
83 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
84 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
85 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
86 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
87 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
88 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
89 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
90 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
91 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
92 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
93 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
94 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
95 "paddb %%mm2, %%mm0 \n\t" |
| 787 | 96 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
97 "leal (%%eax, %3, 4), %%eax \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
98 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
99 "movq (%2, %3, 4), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
100 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
101 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
103 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
104 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
105 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
106 |
| 787 | 107 "movq (%%eax), %%mm1 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
108 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
109 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
110 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
111 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
112 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
113 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
115 "movq (%%eax, %3), %%mm2 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
116 PMAXUB(%%mm2, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
117 PMINUB(%%mm2, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
118 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
119 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
120 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
121 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
122 |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
123 "movq (%%eax, %3, 2), %%mm1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
124 PMAXUB(%%mm1, %%mm4) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
125 PMINUB(%%mm1, %%mm3, %%mm5) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
126 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
127 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
129 "paddb %%mm2, %%mm0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
130 "psubusb %%mm3, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
131 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
132 " \n\t" |
| 167 | 133 #ifdef HAVE_MMX2 |
| 134 "pxor %%mm7, %%mm7 \n\t" | |
| 135 "psadbw %%mm7, %%mm0 \n\t" | |
| 136 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
137 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
138 "psrlw $8, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
139 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 "psrlq $16, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
143 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 "psrlq $32, %%mm0 \n\t" |
| 167 | 145 "paddb %%mm1, %%mm0 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
146 #endif |
| 1331 | 147 "movq %4, %%mm7 \n\t" // QP,..., QP |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
148 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
149 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
150 "packssdw %%mm4, %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
151 "movd %%mm0, %0 \n\t" |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
152 "movd %%mm4, %1 \n\t" |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
153 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
154 : "=r" (numEq), "=r" (dcOk) |
| 1331 | 155 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 787 | 156 : "%eax" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
157 ); |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
158 |
| 167 | 159 numEq= (-numEq) &0xFF; |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
160 if(numEq > c->ppMode.flatnessThreshold){ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
161 if(dcOk) return 0; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
162 else return 1; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
163 }else{ |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
164 return 2; |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
165 } |
| 787 | 166 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
167 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
168 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
169 /** |
| 111 | 170 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 107 | 171 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
172 */ |
| 787 | 173 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
174 { |
| 96 | 175 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 176 src+= stride*3; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
177 asm volatile( //"movv %0 %1 %2\n\t" |
| 787 | 178 "movq %2, %%mm0 \n\t" // QP,..., QP |
| 179 "pxor %%mm4, %%mm4 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
180 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
181 "movq (%0), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
182 "movq (%0, %1), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
183 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
184 "movq %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
185 "psubusb %%mm6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
186 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
187 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
188 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 189 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
190 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
191 "pand %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
192 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
193 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
195 "movq (%0, %1, 8), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
196 "leal (%0, %1, 4), %%eax \n\t" |
| 787 | 197 "leal (%0, %1, 8), %%ecx \n\t" |
| 198 "subl %1, %%ecx \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
199 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 "movq (%0, %1, 8), %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
201 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
202 "movq %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
203 "psubusb %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 787 | 207 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
208 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 "pand %%mm2, %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
211 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
212 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
214 // 1 2 3 4 5 6 7 8 |
| 787 | 215 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
216 // 6 4 2 2 1 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
217 // 6 4 4 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
218 // 6 8 2 |
| 111 | 219 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
220 "movq (%0, %1), %%mm0 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
221 "movq %%mm0, %%mm1 \n\t" // 1 |
| 96 | 222 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 223 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
224 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "movq %%mm2, %%mm5 \n\t" // 1 |
| 96 | 227 PAVGB((%%eax), %%mm2) // 11 /2 |
| 228 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
229 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
230 "movq (%0), %%mm4 \n\t" // 1 |
| 96 | 231 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
| 232 PAVGB(%%mm0, %%mm3) //642211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
233 "movq %%mm3, (%0) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
234 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
235 "movq %%mm1, %%mm0 \n\t" // 1 |
| 96 | 236 PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
237 "movq %%mm4, %%mm3 \n\t" // 1 |
| 96 | 238 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
| 239 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
| 240 PAVGB((%%eax), %%mm5) // 211 /4 | |
| 241 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
| 242 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
243 "movq %%mm3, (%0,%1) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
| 96 | 245 PAVGB(%%mm4, %%mm6) //11 /2 |
| 787 | 246 "movq (%%ecx), %%mm0 \n\t" // 1 |
| 96 | 247 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
248 "movq %%mm0, %%mm3 \n\t" // 11/2 |
| 96 | 249 PAVGB(%%mm1, %%mm0) // 2 11/4 |
| 250 PAVGB(%%mm6, %%mm0) //222 11/8 | |
| 251 PAVGB(%%mm2, %%mm0) //22242211/16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
252 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
253 "movq %%mm0, (%0, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
254 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
255 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 787 | 256 PAVGB((%%ecx), %%mm0) // 11 /2 |
| 96 | 257 PAVGB(%%mm0, %%mm6) //11 11 /4 |
| 258 PAVGB(%%mm1, %%mm4) // 11 /2 | |
| 259 PAVGB(%%mm2, %%mm1) // 11 /2 | |
| 260 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
| 261 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
262 "movq (%%eax), %%mm5 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
263 "movq %%mm6, (%%eax) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
264 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
265 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
| 96 | 266 PAVGB(%%mm7, %%mm6) // 11 /2 |
| 267 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
| 268 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
| 269 PAVGB(%%mm5, %%mm2) // 11 /2 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
270 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
| 96 | 271 PAVGB(%%mm4, %%mm2) // 112 /4 |
| 272 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
273 "movq %%mm6, (%0, %1, 4) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
274 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
| 96 | 275 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
| 276 PAVGB(%%mm4, %%mm5) // 11 /2 | |
| 277 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
278 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 96 | 279 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 280 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
281 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
282 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 787 | 283 PAVGB((%%ecx), %%mm2) // 112 4 /8 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
284 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 96 | 285 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 286 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
| 287 PAVGB(%%mm2, %%mm6) // 1122424 /4 | |
| 787 | 288 "movq %%mm6, (%%ecx) \n\t" // X |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
289 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 96 | 290 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 291 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
292 |
| 96 | 293 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 294 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
295 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 140 | 296 "subl %1, %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
297 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
298 : |
| 787 | 299 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 300 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
301 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
302 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
303 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
304 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
306 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
307 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
308 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
309 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
310 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
311 const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
312 int x; |
| 111 | 313 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
314 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
315 { |
| 787 | 316 const int first= ABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
| 317 const int last= ABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
318 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
319 int sums[9]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
320 sums[0] = first + src[l1]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
321 sums[1] = src[l1] + src[l2]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
322 sums[2] = src[l2] + src[l3]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
323 sums[3] = src[l3] + src[l4]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
324 sums[4] = src[l4] + src[l5]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
325 sums[5] = src[l5] + src[l6]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
326 sums[6] = src[l6] + src[l7]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
327 sums[7] = src[l7] + src[l8]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
328 sums[8] = src[l8] + last; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
329 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
330 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
331 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
332 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
333 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
334 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
335 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
336 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
337 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
338 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
339 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
340 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
341 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
342 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
343 |
| 787 | 344 #if 0 |
| 96 | 345 /** |
| 346 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 347 * values are correctly clipped (MMX2) | |
| 348 * values are wraparound (C) | |
| 349 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
| 350 0 8 16 24 | |
| 351 x = 8 | |
| 352 x/2 = 4 | |
| 353 x/8 = 1 | |
| 354 1 12 12 23 | |
| 355 */ | |
| 169 | 356 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
| 96 | 357 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
358 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 359 src+= stride*3; |
| 96 | 360 // FIXME rounding |
| 361 asm volatile( | |
| 362 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 210 | 363 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 96 | 364 "leal (%0, %1), %%eax \n\t" |
| 787 | 365 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 96 | 366 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 367 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
| 210 | 368 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 96 | 369 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
| 210 | 370 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
| 96 | 371 "psrlw $2, %%mm0 \n\t" |
| 210 | 372 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
| 96 | 373 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
| 374 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 787 | 375 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 96 | 376 "movq %%mm2, %%mm4 \n\t" // line 4 |
| 377 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 378 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
379 PAVGB(%%mm3, %%mm5) |
| 96 | 380 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
| 381 "psubusb %%mm3, %%mm4 \n\t" | |
| 382 "psubusb %%mm2, %%mm3 \n\t" | |
| 383 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 384 "psubusb %%mm0, %%mm4 \n\t" | |
| 385 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 386 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 387 | |
| 388 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 389 "paddb %%mm5, %%mm2 \n\t" | |
| 390 // "psubb %%mm6, %%mm2 \n\t" | |
| 391 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 392 | |
| 787 | 393 "movq (%%ecx), %%mm2 \n\t" |
| 96 | 394 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 |
| 395 "psubb %%mm5, %%mm2 \n\t" | |
| 396 // "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 397 "movq %%mm2, (%%ecx) \n\t" |
| 96 | 398 |
| 399 "paddb %%mm6, %%mm5 \n\t" | |
| 400 "psrlw $2, %%mm5 \n\t" | |
| 210 | 401 "pand "MANGLE(b3F)", %%mm5 \n\t" |
| 402 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
| 96 | 403 |
| 404 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 405 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 406 "paddsb %%mm5, %%mm2 \n\t" | |
| 407 "psubb %%mm6, %%mm2 \n\t" | |
| 408 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 409 | |
| 787 | 410 "movq (%%ecx, %1), %%mm2 \n\t" |
| 96 | 411 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 |
| 412 "psubsb %%mm5, %%mm2 \n\t" | |
| 413 "psubb %%mm6, %%mm2 \n\t" | |
| 787 | 414 "movq %%mm2, (%%ecx, %1) \n\t" |
| 96 | 415 |
| 416 : | |
| 417 : "r" (src), "r" (stride) | |
| 787 | 418 : "%eax", "%ecx" |
| 96 | 419 ); |
| 420 #else | |
| 421 const int l1= stride; | |
| 422 const int l2= stride + l1; | |
| 423 const int l3= stride + l2; | |
| 424 const int l4= stride + l3; | |
| 425 const int l5= stride + l4; | |
| 426 const int l6= stride + l5; | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
427 // const int l7= stride + l6; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
428 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
429 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
430 int x; |
| 141 | 431 const int QP15= QP + (QP>>2); |
| 111 | 432 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
433 for(x=0; x<BLOCK_SIZE; x++) |
| 96 | 434 { |
| 141 | 435 const int v = (src[x+l5] - src[x+l4]); |
| 436 if(ABS(v) < QP15) | |
| 96 | 437 { |
| 141 | 438 src[x+l3] +=v>>3; |
| 439 src[x+l4] +=v>>1; | |
| 440 src[x+l5] -=v>>1; | |
| 441 src[x+l6] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
442 |
| 96 | 443 } |
| 444 } | |
| 445 | |
| 446 #endif | |
| 447 } | |
| 787 | 448 #endif |
| 96 | 449 |
| 450 /** | |
| 451 * Experimental Filter 1 | |
| 99 | 452 * will not damage linear gradients |
| 453 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
454 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
455 * MMX2 version does correct clipping C version doesnt |
| 96 | 456 */ |
| 787 | 457 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
| 96 | 458 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
459 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 460 src+= stride*3; |
| 461 | |
| 96 | 462 asm volatile( |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
463 "pxor %%mm7, %%mm7 \n\t" // 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
464 "leal (%0, %1), %%eax \n\t" |
| 787 | 465 "leal (%%eax, %1, 4), %%ecx \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
466 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 467 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
468 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
469 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
470 "movq %%mm1, %%mm2 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
471 "psubusb %%mm0, %%mm1 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
472 "psubusb %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
473 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
| 787 | 474 "movq (%%ecx), %%mm3 \n\t" // line 5 |
| 475 "movq (%%ecx, %1), %%mm4 \n\t" // line 6 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
476 "movq %%mm3, %%mm5 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
477 "psubusb %%mm4, %%mm3 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
478 "psubusb %%mm5, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
479 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
480 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
481 "movq %%mm2, %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
482 "psubusb %%mm5, %%mm2 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
483 "movq %%mm2, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
484 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
485 "psubusb %%mm1, %%mm5 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
486 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
487 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
488 "movq %%mm4, %%mm3 \n\t" // d |
| 787 | 489 "movq %2, %%mm0 \n\t" |
| 334 | 490 "paddusb %%mm0, %%mm0 \n\t" |
| 491 "psubusb %%mm0, %%mm4 \n\t" | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
492 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 210 | 493 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
494 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
495 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
496 PAVGB(%%mm7, %%mm3) // d/2 |
| 99 | 497 "movq %%mm3, %%mm1 \n\t" // d/2 |
| 498 PAVGB(%%mm7, %%mm3) // d/4 | |
| 499 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
500 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
501 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
502 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
503 "psubusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
504 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
505 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
506 |
| 787 | 507 "movq (%%ecx), %%mm0 \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
508 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
509 "paddusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
510 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 511 "movq %%mm0, (%%ecx) \n\t" // line 5 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
512 |
| 99 | 513 PAVGB(%%mm7, %%mm1) // d/4 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
514 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
515 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
516 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 99 | 517 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
518 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
519 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
520 |
| 787 | 521 "movq (%%ecx, %1), %%mm0 \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
522 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 99 | 523 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
524 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 525 "movq %%mm0, (%%ecx, %1) \n\t" // line 6 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
526 |
| 99 | 527 PAVGB(%%mm7, %%mm1) // d/8 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
528 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
529 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
530 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
| 99 | 531 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
532 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
533 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
534 |
| 787 | 535 "movq (%%ecx, %1, 2), %%mm0 \n\t" // line 7 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
536 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
| 99 | 537 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
538 "pxor %%mm2, %%mm0 \n\t" |
| 787 | 539 "movq %%mm0, (%%ecx, %1, 2) \n\t" // line 7 |
| 96 | 540 |
| 541 : | |
| 787 | 542 : "r" (src), "r" (stride), "m" (co->pQPb) |
| 543 : "%eax", "%ecx" | |
| 96 | 544 ); |
| 545 #else | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
546 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
547 const int l1= stride; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
548 const int l2= stride + l1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
549 const int l3= stride + l2; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
550 const int l4= stride + l3; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
551 const int l5= stride + l4; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
552 const int l6= stride + l5; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
553 const int l7= stride + l6; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
554 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
555 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
556 int x; |
| 111 | 557 |
| 558 src+= stride*3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
559 for(x=0; x<BLOCK_SIZE; x++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
560 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
561 int a= src[l3] - src[l4]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
562 int b= src[l4] - src[l5]; |
| 99 | 563 int c= src[l5] - src[l6]; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
564 |
| 141 | 565 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
| 566 d= MAX(d, 0); | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
567 |
| 787 | 568 if(d < co->QP*2) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
569 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
570 int v = d * SIGN(-b); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
571 |
| 141 | 572 src[l2] +=v>>3; |
| 573 src[l3] +=v>>2; | |
| 574 src[l4] +=(3*v)>>3; | |
| 575 src[l5] -=(3*v)>>3; | |
| 576 src[l6] -=v>>2; | |
| 577 src[l7] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
578 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
579 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
580 src++; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
581 } |
| 96 | 582 #endif |
| 583 } | |
| 584 | |
| 787 | 585 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
586 { |
| 163 | 587 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 588 /* | |
| 589 uint8_t tmp[16]; | |
| 590 const int l1= stride; | |
| 591 const int l2= stride + l1; | |
| 592 const int l3= stride + l2; | |
| 593 const int l4= (int)tmp - (int)src - stride*3; | |
| 594 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
| 595 const int l6= stride*3 + l3; | |
| 596 const int l7= stride + l6; | |
| 597 const int l8= stride + l7; | |
| 598 | |
| 599 memcpy(tmp, src+stride*7, 8); | |
| 600 memcpy(tmp+8, src+stride*8, 8); | |
| 601 */ | |
| 111 | 602 src+= stride*4; |
| 163 | 603 asm volatile( |
| 604 | |
| 605 #if 0 //sligtly more accurate and slightly slower | |
| 606 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 607 "leal (%0, %1), %%eax \n\t" | |
| 787 | 608 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 609 // 0 1 2 3 4 5 6 7 |
| 787 | 610 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 611 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 612 |
| 613 | |
| 614 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
| 615 "movq (%0), %%mm1 \n\t" // l0 | |
| 616 "movq %%mm0, %%mm2 \n\t" // l2 | |
| 617 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
| 618 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
| 619 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
| 620 | |
| 621 "movq (%%eax), %%mm1 \n\t" // l1 | |
| 622 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |
| 623 "movq %%mm1, %%mm4 \n\t" // l1 | |
| 624 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
| 625 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
| 626 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
| 627 | |
| 628 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
| 629 "psubusb %%mm1, %%mm0 \n\t" | |
| 630 "psubusb %%mm4, %%mm1 \n\t" | |
| 631 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
| 632 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
| 633 | |
| 634 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 635 "movq %%mm0, %%mm4 \n\t" // l4 | |
| 636 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
| 637 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
| 638 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
| 639 | |
| 787 | 640 "movq (%%ecx), %%mm2 \n\t" // l5 |
| 163 | 641 "movq %%mm3, %%mm5 \n\t" // l3 |
| 642 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
| 643 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
| 644 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
| 645 | |
| 646 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
| 647 "psubusb %%mm3, %%mm0 \n\t" | |
| 648 "psubusb %%mm6, %%mm3 \n\t" | |
| 649 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
| 650 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
| 651 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
| 652 | |
| 787 | 653 "movq (%%ecx, %1), %%mm6 \n\t" // l6 |
| 163 | 654 "movq %%mm6, %%mm5 \n\t" // l6 |
| 655 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
| 656 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
| 657 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
| 658 | |
| 787 | 659 "movq (%%ecx, %1, 2), %%mm5 \n\t" // l7 |
| 163 | 660 "movq %%mm2, %%mm4 \n\t" // l5 |
| 661 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
| 662 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
| 663 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
| 664 | |
| 665 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
| 666 "psubusb %%mm2, %%mm6 \n\t" | |
| 667 "psubusb %%mm4, %%mm2 \n\t" | |
| 668 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
| 669 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
| 670 | |
| 671 | |
| 672 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
| 787 | 673 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
| 210 | 674 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 675 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
| 676 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
| 677 "pand %%mm4, %%mm3 \n\t" | |
| 678 | |
| 679 "movq %%mm3, %%mm1 \n\t" | |
| 210 | 680 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 681 PAVGB(%%mm7, %%mm3) |
| 682 PAVGB(%%mm7, %%mm3) | |
| 683 "paddusb %%mm1, %%mm3 \n\t" | |
| 210 | 684 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 685 |
| 686 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |
| 687 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
| 688 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
| 689 "psubusb %%mm6, %%mm5 \n\t" | |
| 690 "psubusb %%mm4, %%mm6 \n\t" | |
| 691 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
| 692 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
| 693 "pxor %%mm6, %%mm0 \n\t" | |
| 694 "pand %%mm0, %%mm3 \n\t" | |
| 695 PMINUB(%%mm5, %%mm3, %%mm0) | |
| 696 | |
| 210 | 697 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 698 PAVGB(%%mm7, %%mm3) |
| 699 | |
| 700 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 701 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 702 "pxor %%mm6, %%mm0 \n\t" | |
| 703 "pxor %%mm6, %%mm2 \n\t" | |
| 704 "psubb %%mm3, %%mm0 \n\t" | |
| 705 "paddb %%mm3, %%mm2 \n\t" | |
| 706 "pxor %%mm6, %%mm0 \n\t" | |
| 707 "pxor %%mm6, %%mm2 \n\t" | |
| 708 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 709 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 710 #endif | |
| 711 | |
| 712 "leal (%0, %1), %%eax \n\t" | |
| 713 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
| 714 // 0 1 2 3 4 5 6 7 | |
| 787 | 715 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
| 716 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 | |
| 163 | 717 |
| 718 | |
| 719 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |
| 720 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 721 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
| 722 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
| 723 // mm1=-l3-1, mm0=128-q | |
| 724 | |
| 725 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |
| 726 "movq (%%eax, %1), %%mm3 \n\t" // l2 | |
| 727 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
| 728 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
| 210 | 729 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
| 787 | 730 "leal (%%eax, %1, 4), %%ecx \n\t" |
| 163 | 731 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
| 732 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
| 733 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
| 734 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
| 735 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
| 736 | |
| 737 "movq (%%eax), %%mm2 \n\t" // l1 | |
| 738 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
| 739 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
| 740 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
| 210 | 741 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
| 163 | 742 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
| 743 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
| 744 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
| 745 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
| 746 | |
| 787 | 747 PAVGB((%%ecx, %1), %%mm5) // (l6-l5+256)/2 |
| 748 "movq (%%ecx, %1, 2), %%mm1 \n\t" // l7 | |
| 163 | 749 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
| 750 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
| 210 | 751 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
| 163 | 752 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
| 753 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
| 754 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
| 755 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
| 756 | |
| 210 | 757 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
| 758 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
| 163 | 759 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
| 760 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
| 761 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
| 762 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
| 763 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
| 764 | |
| 765 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
| 766 | |
| 210 | 767 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
| 787 | 768 "movq %2, %%mm2 \n\t" // QP |
| 163 | 769 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
| 770 "psubb %%mm6, %%mm2 \n\t" | |
| 771 | |
| 772 "movq %%mm4, %%mm1 \n\t" | |
| 773 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
| 774 "pxor %%mm1, %%mm4 \n\t" | |
| 775 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
| 776 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
| 777 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
| 778 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
| 779 | |
| 780 "movq %%mm4, %%mm3 \n\t" // d | |
| 210 | 781 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 782 PAVGB(%%mm7, %%mm4) // d/32 |
| 783 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
| 784 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
| 785 "pand %%mm2, %%mm4 \n\t" | |
| 786 | |
| 210 | 787 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
| 163 | 788 "psubb %%mm0, %%mm5 \n\t" // q |
| 789 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
| 790 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
| 791 "pxor %%mm7, %%mm5 \n\t" | |
| 792 | |
| 793 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
| 794 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
| 795 | |
| 796 "pand %%mm7, %%mm4 \n\t" | |
| 797 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 798 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 799 "pxor %%mm1, %%mm0 \n\t" | |
| 800 "pxor %%mm1, %%mm2 \n\t" | |
| 801 "paddb %%mm4, %%mm0 \n\t" | |
| 802 "psubb %%mm4, %%mm2 \n\t" | |
| 803 "pxor %%mm1, %%mm0 \n\t" | |
| 804 "pxor %%mm1, %%mm2 \n\t" | |
| 805 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 806 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 807 | |
| 808 : | |
| 787 | 809 : "r" (src), "r" (stride), "m" (c->pQPb) |
| 810 : "%eax", "%ecx" | |
| 163 | 811 ); |
| 812 | |
| 813 /* | |
| 814 { | |
| 815 int x; | |
| 816 src-= stride; | |
| 817 for(x=0; x<BLOCK_SIZE; x++) | |
| 818 { | |
| 819 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
| 820 if(ABS(middleEnergy)< 8*QP) | |
| 821 { | |
| 822 const int q=(src[l4] - src[l5])/2; | |
| 823 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
| 824 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
| 825 | |
| 826 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 827 d= MAX(d, 0); | |
| 828 | |
| 829 d= (5*d + 32) >> 6; | |
| 830 d*= SIGN(-middleEnergy); | |
| 831 | |
| 832 if(q>0) | |
| 833 { | |
| 834 d= d<0 ? 0 : d; | |
| 835 d= d>q ? q : d; | |
| 836 } | |
| 837 else | |
| 838 { | |
| 839 d= d>0 ? 0 : d; | |
| 840 d= d<q ? q : d; | |
| 841 } | |
| 842 | |
| 843 src[l4]-= d; | |
| 844 src[l5]+= d; | |
| 845 } | |
| 846 src++; | |
| 847 } | |
| 848 src-=8; | |
| 849 for(x=0; x<8; x++) | |
| 850 { | |
| 851 int y; | |
| 852 for(y=4; y<6; y++) | |
| 853 { | |
| 854 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
| 855 int ad= ABS(d); | |
| 856 static int max=0; | |
| 857 static int sum=0; | |
| 858 static int num=0; | |
| 859 static int bias=0; | |
| 860 | |
| 861 if(max<ad) max=ad; | |
| 862 sum+= ad>3 ? 1 : 0; | |
| 863 if(ad>3) | |
| 864 { | |
| 865 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
| 866 } | |
| 867 if(y==4) bias+=d; | |
| 868 num++; | |
| 869 if(num%1000000 == 0) | |
| 870 { | |
| 871 printf(" %d %d %d %d\n", num, sum, max, bias); | |
| 872 } | |
| 873 } | |
| 874 } | |
| 875 } | |
| 876 */ | |
| 877 #elif defined (HAVE_MMX) | |
| 878 src+= stride*4; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
879 asm volatile( |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
880 "pxor %%mm7, %%mm7 \n\t" |
| 787 | 881 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
| 882 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
883 // 0 1 2 3 4 5 6 7 |
| 787 | 884 // %0 %0+%1 eax eax+%1 eax+2%1 %0'+2%1 eax+4%1 %0'+4%1 |
| 885 // eax = %0+2%1; %0' = eax+%1 (line 3), set by the later leal once line 4 is loaded; ecx = aligned scratch below esp | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
886 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
887 "movq (%0), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
888 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
889 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
890 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
891 |
| 810 | 892 "movq (%0, %1), %%mm2 \n\t" |
| 893 "leal (%0, %1, 2), %%eax \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
894 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
895 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
896 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
897 |
| 810 | 898 "movq (%%eax), %%mm4 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
899 "movq %%mm4, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
900 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
901 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
902 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
903 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
904 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
905 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
906 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
907 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
908 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
909 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
910 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
911 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
912 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
913 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
914 |
| 810 | 915 "movq (%%eax, %1), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
916 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
917 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
918 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
919 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
920 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
921 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
922 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
923 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 787 | 924 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 925 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
926 |
| 810 | 927 "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
928 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
929 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
930 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
931 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
932 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
933 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
| 787 | 934 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 |
| 935 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
936 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
937 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
938 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
939 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
940 |
| 810 | 941 "leal (%%eax, %1), %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
942 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
943 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
944 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
945 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
946 //50 opcodes so far |
| 810 | 947 "movq (%0, %1, 2), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
948 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
949 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
950 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
951 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
952 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
953 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 |
| 810 | 956 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
| 810 | 959 "movq (%%eax, %1, 4), %%mm6 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
960 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
961 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
963 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
967 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
970 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 |
| 810 | 973 "movq (%0, %1, 4), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
977 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
980 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
981 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
982 |
| 787 | 983 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 984 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 140 | 985 |
| 986 #ifdef HAVE_MMX2 | |
| 987 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 988 "psubw %%mm0, %%mm6 \n\t" | |
| 989 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 990 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 991 "psubw %%mm1, %%mm6 \n\t" | |
| 992 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 993 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 994 "psubw %%mm2, %%mm6 \n\t" | |
| 995 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 996 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 997 "psubw %%mm3, %%mm6 \n\t" | |
| 998 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 999 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1000 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1001 "pcmpgtw %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1002 "pxor %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1003 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1004 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1005 "pcmpgtw %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "pxor %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1009 "pcmpgtw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1010 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1011 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1012 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1013 "pcmpgtw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1014 "pxor %%mm6, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1015 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 140 | 1016 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1017 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1018 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1019 "pminsw %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1020 "pminsw %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1021 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1022 "movq %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1023 "psubusw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1024 "psubw %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1025 "movq %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 "psubusw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "psubw %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1034 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1035 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1036 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 // 100 opcodes |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "movd %2, %%mm2 \n\t" // QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "psllw $3, %%mm2 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "movq %%mm2, %%mm3 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 "pcmpgtw %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1042 "pcmpgtw %%mm5, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 "psubusw %%mm0, %%mm4 \n\t" // ld (low words): |2L2 - 5L3 + 5L4 - 2L5| - min of the outer L energies, saturated at 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1048 "psubusw %%mm1, %%mm5 \n\t" // hd (high words): |2H2 - 5H3 + 5H4 - 2H5| - min of the outer H energies, saturated at 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 |
| 211 | 1051 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1052 "pmullw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1053 "pmullw %%mm2, %%mm5 \n\t" |
| 211 | 1054 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1055 "paddw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1056 "paddw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1057 "psrlw $6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1058 "psrlw $6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1059 |
| 787 | 1060 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 |
| 1061 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1062 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1063 "pxor %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1064 "pxor %%mm3, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1065 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1066 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1067 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1068 "pxor %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1069 "pxor %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1070 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1072 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1073 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 "pxor %%mm7, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1078 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1079 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "pminsw %%mm0, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1082 "pminsw %%mm1, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "movq %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1085 "psubusw %%mm0, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 "psubw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 "movq %%mm5, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1088 "psubusw %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1089 "psubw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 "psubw %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "psubw %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "packsswb %%mm5, %%mm4 \n\t" |
| 810 | 1096 "movq (%0), %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 "paddb %%mm4, %%mm0 \n\t" |
| 810 | 1098 "movq %%mm0, (%0) \n\t" |
| 1099 "movq (%0, %1), %%mm0 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1100 "psubb %%mm4, %%mm0 \n\t" |
| 810 | 1101 "movq %%mm0, (%0, %1) \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1102 |
| 810 | 1103 : "+r" (src) |
| 1104 : "r" (stride), "m" (c->pQPb) | |
| 1105 : "%eax", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1109 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1110 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1114 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1115 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1116 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1117 int x; |
| 111 | 1118 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1119 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
| 787 | 1122 if(ABS(middleEnergy) < 8*c->QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1123 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1124 const int q=(src[l4] - src[l5])/2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1125 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1126 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1127 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1128 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1129 d= MAX(d, 0); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1131 d= (5*d + 32) >> 6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1132 d*= SIGN(-middleEnergy); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1133 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 if(q>0) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 d= d<0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 d= d>q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1141 d= d>0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1142 d= d<q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1145 src[l4]-= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1146 src[l5]+= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1147 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1150 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1152 |
| 787 | 1153 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1154 { |
| 132 | 1155 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1156 asm volatile( |
| 787 | 1157 "pxor %%mm6, %%mm6 \n\t" |
| 1158 "pcmpeqb %%mm7, %%mm7 \n\t" | |
| 1159 "movq %2, %%mm0 \n\t" | |
| 1160 "punpcklbw %%mm6, %%mm0 \n\t" | |
| 1161 "psrlw $1, %%mm0 \n\t" | |
| 1162 "psubw %%mm7, %%mm0 \n\t" | |
| 1163 "packuswb %%mm0, %%mm0 \n\t" | |
| 1164 "movq %%mm0, %3 \n\t" | |
| 130 | 1165 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1166 "leal (%0, %1), %%eax \n\t" |
| 787 | 1167 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1168 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1169 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1170 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1171 |
| 169 | 1172 #undef FIND_MIN_MAX |
| 132 | 1173 #ifdef HAVE_MMX2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1174 #define FIND_MIN_MAX(addr)\ |
| 130 | 1175 "movq " #addr ", %%mm0 \n\t"\ |
| 167 | 1176 "pminub %%mm0, %%mm7 \n\t"\ |
| 1177 "pmaxub %%mm0, %%mm6 \n\t" | |
| 132 | 1178 #else |
| 1179 #define FIND_MIN_MAX(addr)\ | |
| 1180 "movq " #addr ", %%mm0 \n\t"\ | |
| 167 | 1181 "movq %%mm7, %%mm1 \n\t"\ |
| 1182 "psubusb %%mm0, %%mm6 \n\t"\ | |
| 1183 "paddb %%mm0, %%mm6 \n\t"\ | |
| 132 | 1184 "psubusb %%mm0, %%mm1 \n\t"\ |
| 167 | 1185 "psubb %%mm1, %%mm7 \n\t" |
| 132 | 1186 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1187 |
| 130 | 1188 FIND_MIN_MAX((%%eax)) |
| 1189 FIND_MIN_MAX((%%eax, %1)) | |
| 1190 FIND_MIN_MAX((%%eax, %1, 2)) | |
| 1191 FIND_MIN_MAX((%0, %1, 4)) | |
| 787 | 1192 FIND_MIN_MAX((%%edx)) |
| 1193 FIND_MIN_MAX((%%edx, %1)) | |
| 1194 FIND_MIN_MAX((%%edx, %1, 2)) | |
| 130 | 1195 FIND_MIN_MAX((%0, %1, 8)) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1196 |
| 167 | 1197 "movq %%mm7, %%mm4 \n\t" |
| 1198 "psrlq $8, %%mm7 \n\t" | |
| 1199 #ifdef HAVE_MMX2 | |
| 1200 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1201 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1202 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1203 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1204 "pminub %%mm4, %%mm7 \n\t" | |
| 1205 #else | |
| 1206 "movq %%mm7, %%mm1 \n\t" | |
| 1207 "psubusb %%mm4, %%mm1 \n\t" | |
| 1208 "psubb %%mm1, %%mm7 \n\t" | |
| 1209 "movq %%mm7, %%mm4 \n\t" | |
| 1210 "psrlq $16, %%mm7 \n\t" | |
| 1211 "movq %%mm7, %%mm1 \n\t" | |
| 1212 "psubusb %%mm4, %%mm1 \n\t" | |
| 1213 "psubb %%mm1, %%mm7 \n\t" | |
| 1214 "movq %%mm7, %%mm4 \n\t" | |
| 1215 "psrlq $32, %%mm7 \n\t" | |
| 1216 "movq %%mm7, %%mm1 \n\t" | |
| 1217 "psubusb %%mm4, %%mm1 \n\t" | |
| 1218 "psubb %%mm1, %%mm7 \n\t" | |
| 1219 #endif | |
| 1220 | |
| 1221 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1222 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1223 "psrlq $8, %%mm6 \n\t" |
| 132 | 1224 #ifdef HAVE_MMX2 |
| 167 | 1225 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1226 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 167 | 1227 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1228 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 167 | 1229 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1230 #else |
| 167 | 1231 "psubusb %%mm4, %%mm6 \n\t" |
| 1232 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1233 "movq %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1234 "psrlq $16, %%mm6 \n\t" |
| 167 | 1235 "psubusb %%mm4, %%mm6 \n\t" |
| 1236 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1237 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1238 "psrlq $32, %%mm6 \n\t" |
| 167 | 1239 "psubusb %%mm4, %%mm6 \n\t" |
| 1240 "paddb %%mm4, %%mm6 \n\t" | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1241 #endif |
| 167 | 1242 "movq %%mm6, %%mm0 \n\t" // max |
| 1243 "psubb %%mm7, %%mm6 \n\t" // max - min | |
| 1244 "movd %%mm6, %%ecx \n\t" | |
| 210 | 1245 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
| 167 | 1246 " jb 1f \n\t" |
| 787 | 1247 "leal -24(%%esp), %%ecx \n\t" |
| 1248 "andl $0xFFFFFFF8, %%ecx \n\t" | |
| 167 | 1249 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1250 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1251 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1252 "punpcklbw %%mm7, %%mm7 \n\t" |
| 787 | 1253 "movq %%mm7, (%%ecx) \n\t" |
| 130 | 1254 |
| 1255 "movq (%0), %%mm0 \n\t" // L10 | |
| 1256 "movq %%mm0, %%mm1 \n\t" // L10 | |
| 1257 "movq %%mm0, %%mm2 \n\t" // L10 | |
| 1258 "psllq $8, %%mm1 \n\t" | |
| 1259 "psrlq $8, %%mm2 \n\t" | |
| 1260 "movd -4(%0), %%mm3 \n\t" | |
| 1261 "movd 8(%0), %%mm4 \n\t" | |
| 1262 "psrlq $24, %%mm3 \n\t" | |
| 1263 "psllq $56, %%mm4 \n\t" | |
| 1264 "por %%mm3, %%mm1 \n\t" // L00 | |
| 1265 "por %%mm4, %%mm2 \n\t" // L20 | |
| 1266 "movq %%mm1, %%mm3 \n\t" // L00 | |
| 1267 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
| 1268 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
| 1269 "psubusb %%mm7, %%mm0 \n\t" | |
| 1270 "psubusb %%mm7, %%mm2 \n\t" | |
| 1271 "psubusb %%mm7, %%mm3 \n\t" | |
| 210 | 1272 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
| 1273 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
| 1274 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
| 130 | 1275 "paddb %%mm2, %%mm0 \n\t" |
| 1276 "paddb %%mm3, %%mm0 \n\t" | |
| 1277 | |
| 1278 "movq (%%eax), %%mm2 \n\t" // L11 | |
| 1279 "movq %%mm2, %%mm3 \n\t" // L11 | |
| 1280 "movq %%mm2, %%mm4 \n\t" // L11 | |
| 1281 "psllq $8, %%mm3 \n\t" | |
| 1282 "psrlq $8, %%mm4 \n\t" | |
| 1283 "movd -4(%%eax), %%mm5 \n\t" | |
| 1284 "movd 8(%%eax), %%mm6 \n\t" | |
| 1285 "psrlq $24, %%mm5 \n\t" | |
| 1286 "psllq $56, %%mm6 \n\t" | |
| 1287 "por %%mm5, %%mm3 \n\t" // L01 | |
| 1288 "por %%mm6, %%mm4 \n\t" // L21 | |
| 1289 "movq %%mm3, %%mm5 \n\t" // L01 | |
| 1290 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
| 1291 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
| 1292 "psubusb %%mm7, %%mm2 \n\t" | |
| 1293 "psubusb %%mm7, %%mm4 \n\t" | |
| 1294 "psubusb %%mm7, %%mm5 \n\t" | |
| 210 | 1295 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
| 1296 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
| 1297 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
| 130 | 1298 "paddb %%mm4, %%mm2 \n\t" |
| 1299 "paddb %%mm5, %%mm2 \n\t" | |
| 1300 // 0, 2, 3, 1 | |
| 1301 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
| 1302 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
| 1303 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
| 1304 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
| 1305 "psllq $8, " #lx " \n\t"\ | |
| 1306 "psrlq $8, " #t0 " \n\t"\ | |
| 1307 "movd -4" #src ", " #t1 " \n\t"\ | |
| 1308 "psrlq $24, " #t1 " \n\t"\ | |
| 1309 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
| 1310 "movd 8" #src ", " #t1 " \n\t"\ | |
| 1311 "psllq $56, " #t1 " \n\t"\ | |
| 1312 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
| 1313 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
| 1314 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
| 1315 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
| 135 | 1316 PAVGB(lx, pplx) \ |
| 787 | 1317 "movq " #lx ", 8(%%ecx) \n\t"\ |
| 1318 "movq (%%ecx), " #lx " \n\t"\ | |
| 140 | 1319 "psubusb " #lx ", " #t1 " \n\t"\ |
| 1320 "psubusb " #lx ", " #t0 " \n\t"\ | |
| 1321 "psubusb " #lx ", " #sx " \n\t"\ | |
| 210 | 1322 "movq "MANGLE(b00)", " #lx " \n\t"\ |
| 140 | 1323 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
| 1324 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
| 1325 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
| 130 | 1326 "paddb " #t1 ", " #t0 " \n\t"\ |
| 1327 "paddb " #t0 ", " #sx " \n\t"\ | |
| 1328 \ | |
| 1329 PAVGB(plx, pplx) /* filtered */\ | |
| 1330 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
| 134 | 1331 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
| 787 | 1332 "psubusb %3, " #t0 " \n\t"\ |
| 1333 "paddusb %3, " #t1 " \n\t"\ | |
| 134 | 1334 PMAXUB(t0, pplx)\ |
| 1335 PMINUB(t1, pplx, t0)\ | |
| 130 | 1336 "paddb " #sx ", " #ppsx " \n\t"\ |
| 1337 "paddb " #psx ", " #ppsx " \n\t"\ | |
| 210 | 1338 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
| 1339 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
| 140 | 1340 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
| 134 | 1341 "pand " #ppsx ", " #pplx " \n\t"\ |
| 130 | 1342 "pandn " #dst ", " #ppsx " \n\t"\ |
| 140 | 1343 "por " #pplx ", " #ppsx " \n\t"\ |
| 135 | 1344 "movq " #ppsx ", " #dst " \n\t"\ |
| 787 | 1345 "movq 8(%%ecx), " #lx " \n\t" |
| 134 | 1346 |
| 130 | 1347 /* |
| 1348 0000000 | |
| 1349 1111111 | |
| 1350 | |
| 1351 1111110 | |
| 1352 1111101 | |
| 1353 1111100 | |
| 1354 1111011 | |
| 1355 1111010 | |
| 1356 1111001 | |
| 1357 | |
| 1358 1111000 | |
| 1359 1110111 | |
| 1360 | |
| 1361 */ | |
| 1362 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
| 1363 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1364 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1365 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 787 | 1366 DERING_CORE((%0, %1, 4),(%%edx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
| 1367 DERING_CORE((%%edx),(%%edx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1368 DERING_CORE((%%edx, %1), (%%edx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1369 DERING_CORE((%%edx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1370 DERING_CORE((%0, %1, 8),(%%edx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1371 |
| 167 | 1372 "1: \n\t" |
| 787 | 1373 : : "r" (src), "r" (stride), "m" (c->pQPb), "m"(c->pQPb2) |
| 1374 : "%eax", "%edx", "%ecx" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1375 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1376 #else |
| 134 | 1377 int y; |
| 1378 int min=255; | |
| 1379 int max=0; | |
| 1380 int avg; | |
| 1381 uint8_t *p; | |
| 1382 int s[10]; | |
| 787 | 1383 const int QP2= c->QP/2 + 1; |
| 134 | 1384 |
| 1385 for(y=1; y<9; y++) | |
| 1386 { | |
| 1387 int x; | |
| 1388 p= src + stride*y; | |
| 1389 for(x=1; x<9; x++) | |
| 1390 { | |
| 1391 p++; | |
| 1392 if(*p > max) max= *p; | |
| 1393 if(*p < min) min= *p; | |
| 1394 } | |
| 1395 } | |
| 787 | 1396 avg= (min + max + 1)>>1; |
| 134 | 1397 |
| 167 | 1398 if(max - min <deringThreshold) return; |
| 1399 | |
| 134 | 1400 for(y=0; y<10; y++) |
| 1401 { | |
| 1402 int t = 0; | |
| 787 | 1403 |
| 1404 if(src[stride*y + 0] > avg) t+= 1; | |
| 1405 if(src[stride*y + 1] > avg) t+= 2; | |
| 1406 if(src[stride*y + 2] > avg) t+= 4; | |
| 1407 if(src[stride*y + 3] > avg) t+= 8; | |
| 1408 if(src[stride*y + 4] > avg) t+= 16; | |
| 1409 if(src[stride*y + 5] > avg) t+= 32; | |
| 1410 if(src[stride*y + 6] > avg) t+= 64; | |
| 1411 if(src[stride*y + 7] > avg) t+= 128; | |
| 1412 if(src[stride*y + 8] > avg) t+= 256; | |
| 1413 if(src[stride*y + 9] > avg) t+= 512; | |
| 1414 | |
| 134 | 1415 t |= (~t)<<16; |
| 1416 t &= (t<<1) & (t>>1); | |
| 1417 s[y] = t; | |
| 1418 } | |
| 787 | 1419 |
| 1420 for(y=1; y<9; y++) | |
| 1421 { | |
| 1422 int t = s[y-1] & s[y] & s[y+1]; | |
| 1423 t|= t>>16; | |
| 1424 s[y-1]= t; | |
| 1425 } | |
| 134 | 1426 |
| 1427 for(y=1; y<9; y++) | |
| 1428 { | |
| 1429 int x; | |
| 787 | 1430 int t = s[y-1]; |
| 134 | 1431 |
| 1432 p= src + stride*y; | |
| 1433 for(x=1; x<9; x++) | |
| 1434 { | |
| 1435 p++; | |
| 1436 if(t & (1<<x)) | |
| 1437 { | |
| 1438 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
| 1439 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
| 1440 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
| 1441 f= (f + 8)>>4; | |
| 1442 | |
| 167 | 1443 #ifdef DEBUG_DERING_THRESHOLD |
| 1444 asm volatile("emms\n\t":); | |
| 1445 { | |
| 1446 static long long numPixels=0; | |
| 1447 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1448 // if((max-min)<20 || (max-min)*QP<200) | |
| 1449 // if((max-min)*QP < 500) | |
| 1450 // if(max-min<QP/2) | |
| 1451 if(max-min < 20) | |
| 1452 { | |
| 1453 static int numSkiped=0; | |
| 1454 static int errorSum=0; | |
| 1455 static int worstQP=0; | |
| 1456 static int worstRange=0; | |
| 1457 static int worstDiff=0; | |
| 1458 int diff= (f - *p); | |
| 1459 int absDiff= ABS(diff); | |
| 1460 int error= diff*diff; | |
| 1461 | |
| 1462 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1463 | |
| 1464 numSkiped++; | |
| 1465 if(absDiff > worstDiff) | |
| 1466 { | |
| 1467 worstDiff= absDiff; | |
| 1468 worstQP= QP; | |
| 1469 worstRange= max-min; | |
| 1470 } | |
| 1471 errorSum+= error; | |
| 1472 | |
| 1473 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1474 { | |
| 1475 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1476 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1477 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1478 worstDiff, (float)numSkiped/numPixels); | |
| 1479 } | |
| 1480 } | |
| 1481 } | |
| 1482 #endif | |
| 787 | 1483 if (*p + QP2 < f) *p= *p + QP2; |
| 1484 else if(*p - QP2 > f) *p= *p - QP2; | |
| 134 | 1485 else *p=f; |
| 1486 } | |
| 1487 } | |
| 1488 } | |
| 167 | 1489 #ifdef DEBUG_DERING_THRESHOLD |
| 1490 if(max-min < 20) | |
| 1491 { | |
| 1492 for(y=1; y<9; y++) | |
| 1493 { | |
| 1494 int x; | |
| 1495 int t = 0; | |
| 1496 p= src + stride*y; | |
| 1497 for(x=1; x<9; x++) | |
| 1498 { | |
| 1499 p++; | |
| 1500 *p = MIN(*p + 20, 255); | |
| 1501 } | |
| 1502 } | |
| 1503 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1504 } | |
| 1505 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1506 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1507 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1508 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1509 /** |
| 1109 | 1510 * Deinterlaces the given block by linearly interpolating every second line. |
| 142 | 1511 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1512 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1513 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1514 */ |
| 169 | 1515 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1516 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1517 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1518 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1519 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1520 "leal (%0, %1), %%eax \n\t" |
| 787 | 1521 "leal (%%eax, %1, 4), %%ecx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1522 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1523 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1524 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1525 "movq (%0), %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1526 "movq (%%eax, %1), %%mm1 \n\t" |
| 111 | 1527 PAVGB(%%mm1, %%mm0) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1528 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1529 "movq (%0, %1, 4), %%mm0 \n\t" |
| 111 | 1530 PAVGB(%%mm0, %%mm1) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1531 "movq %%mm1, (%%eax, %1, 2) \n\t" |
| 787 | 1532 "movq (%%ecx, %1), %%mm1 \n\t" |
| 111 | 1533 PAVGB(%%mm1, %%mm0) |
| 787 | 1534 "movq %%mm0, (%%ecx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1535 "movq (%0, %1, 8), %%mm0 \n\t" |
| 111 | 1536 PAVGB(%%mm0, %%mm1) |
| 787 | 1537 "movq %%mm1, (%%ecx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1538 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1539 : : "r" (src), "r" (stride) |
| 787 | 1540 : "%eax", "%ecx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1541 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1542 #else |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1543 int a, b, x; |
| 142 | 1544 src+= 4*stride; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1545 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1546 for(x=0; x<2; x++){ |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1547 a= *(uint32_t*)&src[stride*0]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1548 b= *(uint32_t*)&src[stride*2]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1549 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1550 a= *(uint32_t*)&src[stride*4]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1551 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1552 b= *(uint32_t*)&src[stride*6]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1553 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1554 a= *(uint32_t*)&src[stride*8]; |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1555 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1556 src += 4; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1557 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1558 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1559 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1560 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1561 /** |
| 1109 | 1562 * Deinterlaces the given block by cubic interpolating every second line. |
| 142 | 1563 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1564 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1565 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1566 * this filter will read lines 3-15 and write 7-13 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1567 */ |
| 169 | 1568 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1569 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1570 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1571 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1572 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1573 "leal (%0, %1), %%eax \n\t" |
| 787 | 1574 "leal (%%eax, %1, 4), %%edx \n\t" |
| 1575 "leal (%%edx, %1, 4), %%ecx \n\t" | |
| 111 | 1576 "addl %1, %%ecx \n\t" |
| 1577 "pxor %%mm7, %%mm7 \n\t" | |
| 1578 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 787 | 1579 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1580 |
| 111 | 1581 #define DEINT_CUBIC(a,b,c,d,e)\ |
| 1582 "movq " #a ", %%mm0 \n\t"\ | |
| 1583 "movq " #b ", %%mm1 \n\t"\ | |
| 1584 "movq " #d ", %%mm2 \n\t"\ | |
| 1585 "movq " #e ", %%mm3 \n\t"\ | |
| 1586 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ | |
| 1587 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ | |
| 1588 "movq %%mm0, %%mm2 \n\t"\ | |
| 1589 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1590 "punpckhbw %%mm7, %%mm2 \n\t"\ | |
| 1591 "movq %%mm1, %%mm3 \n\t"\ | |
| 1592 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1593 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1594 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ | |
| 1595 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ | |
| 1596 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ | |
| 1597 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ | |
| 1598 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ | |
| 1599 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ | |
| 1600 "packuswb %%mm3, %%mm1 \n\t"\ | |
| 1601 "movq %%mm1, " #c " \n\t" | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1602 |
| 787 | 1603 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx, %1)) |
| 1604 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%edx), (%%edx, %1), (%0, %1, 8)) | |
| 1605 DEINT_CUBIC((%0, %1, 4), (%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%ecx)) | |
| 1606 DEINT_CUBIC((%%edx, %1), (%0, %1, 8), (%%edx, %1, 4), (%%ecx), (%%ecx, %1, 2)) | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1607 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1608 : : "r" (src), "r" (stride) |
| 787 | 1609 : "%eax", "%edx", "ecx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1610 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1611 #else |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1612 int x; |
| 142 | 1613 src+= stride*3; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1614 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1615 { |
| 1157 | 1616 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
| 1617 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | |
| 1618 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | |
| 1619 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1620 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1621 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1622 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1623 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1624 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1625 /** |
| 1109 | 1626 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
| 142 | 1627 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1628 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1629 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 787 | 1630 * this filter will read lines 4-13 and write 5-11 |
| 1631 */ | |
| 1632 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | |
| 1633 { | |
| 1634 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1635 src+= stride*4; | |
| 1636 asm volatile( | |
| 1637 "leal (%0, %1), %%eax \n\t" | |
| 1638 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1639 "pxor %%mm7, %%mm7 \n\t" | |
| 1640 "movq (%2), %%mm0 \n\t" | |
| 1641 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1642 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1643 | |
| 1644 #define DEINT_FF(a,b,c,d)\ | |
| 1645 "movq " #a ", %%mm1 \n\t"\ | |
| 1646 "movq " #b ", %%mm2 \n\t"\ | |
| 1647 "movq " #c ", %%mm3 \n\t"\ | |
| 1648 "movq " #d ", %%mm4 \n\t"\ | |
| 1649 PAVGB(%%mm3, %%mm1) \ | |
| 1650 PAVGB(%%mm4, %%mm0) \ | |
| 1651 "movq %%mm0, %%mm3 \n\t"\ | |
| 1652 "punpcklbw %%mm7, %%mm0 \n\t"\ | |
| 1653 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 1654 "movq %%mm1, %%mm4 \n\t"\ | |
| 1655 "punpcklbw %%mm7, %%mm1 \n\t"\ | |
| 1656 "punpckhbw %%mm7, %%mm4 \n\t"\ | |
| 1657 "psllw $2, %%mm1 \n\t"\ | |
| 1658 "psllw $2, %%mm4 \n\t"\ | |
| 1659 "psubw %%mm0, %%mm1 \n\t"\ | |
| 1660 "psubw %%mm3, %%mm4 \n\t"\ | |
| 1661 "movq %%mm2, %%mm5 \n\t"\ | |
| 1662 "movq %%mm2, %%mm0 \n\t"\ | |
| 1663 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1664 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1665 "paddw %%mm2, %%mm1 \n\t"\ | |
| 1666 "paddw %%mm5, %%mm4 \n\t"\ | |
| 1667 "psraw $2, %%mm1 \n\t"\ | |
| 1668 "psraw $2, %%mm4 \n\t"\ | |
| 1669 "packuswb %%mm4, %%mm1 \n\t"\ | |
| 1670 "movq %%mm1, " #b " \n\t"\ | |
| 1671 | |
| 1672 DEINT_FF((%0) , (%%eax) , (%%eax, %1), (%%eax, %1, 2)) | |
| 1673 DEINT_FF((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%edx) ) | |
| 1674 DEINT_FF((%0, %1, 4), (%%edx) , (%%edx, %1), (%%edx, %1, 2)) | |
| 1675 DEINT_FF((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8), (%%edx, %1, 4)) | |
| 1676 | |
| 1677 "movq %%mm0, (%2) \n\t" | |
| 1678 : : "r" (src), "r" (stride), "r"(tmp) | |
| 1679 : "%eax", "%edx" | |
| 1680 ); | |
| 1681 #else | |
| 1682 int x; | |
| 1683 src+= stride*4; | |
| 1684 for(x=0; x<8; x++) | |
| 1685 { | |
| 1686 int t1= tmp[x]; | |
| 1687 int t2= src[stride*1]; | |
| 1688 | |
| 1157 | 1689 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); |
| 787 | 1690 t1= src[stride*4]; |
| 1157 | 1691 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); |
| 787 | 1692 t2= src[stride*6]; |
| 1157 | 1693 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); |
| 787 | 1694 t1= src[stride*8]; |
| 1157 | 1695 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
| 787 | 1696 tmp[x]= t1; |
| 1697 | |
| 1698 src++; | |
| 1699 } | |
| 1700 #endif | |
| 1701 } | |
| 1702 | |
| 1703 /** | |
| 1157 | 1704 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
| 1705 * will be called for every 8x8 block and can read & write from line 4-15 | |
| 1706 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1707 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 1708 * this filter will read lines 4-13 and write 4-11 | |
| 1709 */ | |
| 1710 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | |
| 1711 { | |
| 1712 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 1713 src+= stride*4; | |
| 1714 asm volatile( | |
| 1715 "leal (%0, %1), %%eax \n\t" | |
| 1716 "leal (%%eax, %1, 4), %%edx \n\t" | |
| 1717 "pxor %%mm7, %%mm7 \n\t" | |
| 1718 "movq (%2), %%mm0 \n\t" | |
| 1719 "movq (%3), %%mm1 \n\t" | |
| 1720 // 0 1 2 3 4 5 6 7 8 9 10 | |
| 1721 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx | |
| 1722 | |
| 1723 #define DEINT_L5(t1,t2,a,b,c)\ | |
| 1724 "movq " #a ", %%mm2 \n\t"\ | |
| 1725 "movq " #b ", %%mm3 \n\t"\ | |
| 1726 "movq " #c ", %%mm4 \n\t"\ | |
| 1727 PAVGB(t2, %%mm3) \ | |
| 1728 PAVGB(t1, %%mm4) \ | |
| 1729 "movq %%mm2, %%mm5 \n\t"\ | |
| 1730 "movq %%mm2, " #t1 " \n\t"\ | |
| 1731 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 1732 "punpckhbw %%mm7, %%mm5 \n\t"\ | |
| 1733 "movq %%mm2, %%mm6 \n\t"\ | |
| 1734 "paddw %%mm2, %%mm2 \n\t"\ | |
| 1735 "paddw %%mm6, %%mm2 \n\t"\ | |
| 1736 "movq %%mm5, %%mm6 \n\t"\ | |
| 1737 "paddw %%mm5, %%mm5 \n\t"\ | |
| 1738 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1739 "movq %%mm3, %%mm6 \n\t"\ | |
| 1740 "punpcklbw %%mm7, %%mm3 \n\t"\ | |
| 1741 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1742 "paddw %%mm3, %%mm3 \n\t"\ | |
| 1743 "paddw %%mm6, %%mm6 \n\t"\ | |
| 1744 "paddw %%mm3, %%mm2 \n\t"\ | |
| 1745 "paddw %%mm6, %%mm5 \n\t"\ | |
| 1746 "movq %%mm4, %%mm6 \n\t"\ | |
| 1747 "punpcklbw %%mm7, %%mm4 \n\t"\ | |
| 1748 "punpckhbw %%mm7, %%mm6 \n\t"\ | |
| 1749 "psubw %%mm4, %%mm2 \n\t"\ | |
| 1750 "psubw %%mm6, %%mm5 \n\t"\ | |
| 1751 "psraw $2, %%mm2 \n\t"\ | |
| 1752 "psraw $2, %%mm5 \n\t"\ | |
| 1753 "packuswb %%mm5, %%mm2 \n\t"\ | |
| 1754 "movq %%mm2, " #a " \n\t"\ | |
| 1755 | |
| 1756 DEINT_L5(%%mm0, %%mm1, (%0) , (%%eax) , (%%eax, %1) ) | |
| 1757 DEINT_L5(%%mm1, %%mm0, (%%eax) , (%%eax, %1) , (%%eax, %1, 2)) | |
| 1758 DEINT_L5(%%mm0, %%mm1, (%%eax, %1) , (%%eax, %1, 2), (%0, %1, 4) ) | |
| 1759 DEINT_L5(%%mm1, %%mm0, (%%eax, %1, 2), (%0, %1, 4) , (%%edx) ) | |
| 1760 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%edx) , (%%edx, %1) ) | |
| 1761 DEINT_L5(%%mm1, %%mm0, (%%edx) , (%%edx, %1) , (%%edx, %1, 2)) | |
| 1762 DEINT_L5(%%mm0, %%mm1, (%%edx, %1) , (%%edx, %1, 2), (%0, %1, 8) ) | |
| 1763 DEINT_L5(%%mm1, %%mm0, (%%edx, %1, 2), (%0, %1, 8) , (%%edx, %1, 4)) | |
| 1764 | |
| 1765 "movq %%mm0, (%2) \n\t" | |
| 1766 "movq %%mm1, (%3) \n\t" | |
| 1767 : : "r" (src), "r" (stride), "r"(tmp), "r"(tmp2) | |
| 1768 : "%eax", "%edx" | |
| 1769 ); | |
| 1770 #else | |
| 1771 int x; | |
| 1772 src+= stride*4; | |
| 1773 for(x=0; x<8; x++) | |
| 1774 { | |
| 1775 int t1= tmp[x]; | |
| 1776 int t2= tmp2[x]; | |
| 1777 int t3= src[0]; | |
| 1778 | |
| 1779 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); | |
| 1780 t1= src[stride*1]; | |
| 1781 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); | |
| 1782 t2= src[stride*2]; | |
| 1783 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); | |
| 1784 t3= src[stride*3]; | |
| 1785 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); | |
| 1786 t1= src[stride*4]; | |
| 1787 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); | |
| 1788 t2= src[stride*5]; | |
| 1789 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); | |
| 1790 t3= src[stride*6]; | |
| 1791 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); | |
| 1792 t1= src[stride*7]; | |
| 1793 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); | |
| 1794 | |
| 1795 tmp[x]= t3; | |
| 1796 tmp2[x]= t1; | |
| 1797 | |
| 1798 src++; | |
| 1799 } | |
| 1800 #endif | |
| 1801 } | |
| 1802 | |
| 1803 /** | |
| 1109 | 1804 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
| 787 | 1805 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1806 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1807 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 142 | 1808 * this filter will read lines 4-13 and write 4-11 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1809 */ |
| 1581 | 1810 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1811 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1812 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 142 | 1813 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1814 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1815 "leal (%0, %1), %%eax \n\t" |
| 787 | 1816 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1817 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1818 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1819 |
| 1581 | 1820 "movq (%2), %%mm0 \n\t" // L0 |
| 1821 "movq (%%eax), %%mm1 \n\t" // L2 | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1822 PAVGB(%%mm1, %%mm0) // L0+L2 |
| 1581 | 1823 "movq (%0), %%mm2 \n\t" // L1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1824 PAVGB(%%mm2, %%mm0) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1825 "movq %%mm0, (%0) \n\t" |
| 1581 | 1826 "movq (%%eax, %1), %%mm0 \n\t" // L3 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1827 PAVGB(%%mm0, %%mm2) // L1+L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1828 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1829 "movq %%mm2, (%%eax) \n\t" |
| 1581 | 1830 "movq (%%eax, %1, 2), %%mm2 \n\t" // L4 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1831 PAVGB(%%mm2, %%mm1) // L2+L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1832 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1833 "movq %%mm1, (%%eax, %1) \n\t" |
| 1581 | 1834 "movq (%0, %1, 4), %%mm1 \n\t" // L5 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1835 PAVGB(%%mm1, %%mm0) // L3+L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1836 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1837 "movq %%mm0, (%%eax, %1, 2) \n\t" |
| 1581 | 1838 "movq (%%edx), %%mm0 \n\t" // L6 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1839 PAVGB(%%mm0, %%mm2) // L4+L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1840 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1841 "movq %%mm2, (%0, %1, 4) \n\t" |
| 1581 | 1842 "movq (%%edx, %1), %%mm2 \n\t" // L7 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1843 PAVGB(%%mm2, %%mm1) // L5+L7 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1844 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
| 787 | 1845 "movq %%mm1, (%%edx) \n\t" |
| 1581 | 1846 "movq (%%edx, %1, 2), %%mm1 \n\t" // L8 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1847 PAVGB(%%mm1, %%mm0) // L6+L8 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1848 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
| 787 | 1849 "movq %%mm0, (%%edx, %1) \n\t" |
| 1581 | 1850 "movq (%0, %1, 8), %%mm0 \n\t" // L9 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1851 PAVGB(%%mm0, %%mm2) // L7+L9 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1852 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
| 787 | 1853 "movq %%mm2, (%%edx, %1, 2) \n\t" |
| 1581 | 1854 "movq %%mm1, (%2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1855 |
| 1581 | 1856 : : "r" (src), "r" (stride), "r" (tmp) |
| 787 | 1857 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1858 ); |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1859 #else |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1860 int a, b, c, x; |
| 142 | 1861 src+= 4*stride; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1862 |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1863 for(x=0; x<2; x++){ |
| 1581 | 1864 a= *(uint32_t*)&tmp[stride*0]; |
| 1865 b= *(uint32_t*)&src[stride*0]; | |
| 1866 c= *(uint32_t*)&src[stride*1]; | |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1867 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1868 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1869 |
| 1581 | 1870 a= *(uint32_t*)&src[stride*2]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1871 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1872 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1873 |
| 1581 | 1874 b= *(uint32_t*)&src[stride*3]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1875 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1876 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1877 |
| 1581 | 1878 c= *(uint32_t*)&src[stride*4]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1879 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1880 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1881 |
| 1581 | 1882 a= *(uint32_t*)&src[stride*5]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1883 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1884 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1885 |
| 1581 | 1886 b= *(uint32_t*)&src[stride*6]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1887 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1888 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1889 |
| 1581 | 1890 c= *(uint32_t*)&src[stride*7]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1891 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1892 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1893 |
| 1581 | 1894 a= *(uint32_t*)&src[stride*8]; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1895 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1896 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
|
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1897 |
| 1581 | 1898 *(uint32_t*)&tmp[stride*0]= c; |
|
1158
71d890b5c13b
faster C linear blend & interpolate deinterlacers
michaelni
parents:
1157
diff
changeset
|
1899 src += 4; |
| 1581 | 1900 tmp += 4; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1901 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1902 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1903 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1904 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1905 /** |
| 1109 | 1906 * Deinterlaces the given block by applying a median filter to every second line. |
| 142 | 1907 * will be called for every 8x8 block and can read & write from line 4-15, |
| 1908 * lines 0-3 have been passed through the deblock / dering filters allready, but can be read too | |
| 1909 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1910 */ |
| 169 | 1911 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1912 { |
| 107 | 1913 #ifdef HAVE_MMX |
| 142 | 1914 src+= 4*stride; |
| 107 | 1915 #ifdef HAVE_MMX2 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1916 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1917 "leal (%0, %1), %%eax \n\t" |
| 787 | 1918 "leal (%%eax, %1, 4), %%edx \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1919 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1920 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1921 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1922 "movq (%0), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1923 "movq (%%eax, %1), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1924 "movq (%%eax), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1925 "movq %%mm0, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1926 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1927 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1928 "pmaxub %%mm2, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1929 "pminub %%mm1, %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1930 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1931 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1932 "movq (%0, %1, 4), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1933 "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1934 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1935 "pmaxub %%mm1, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1936 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1937 "pmaxub %%mm0, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1938 "pminub %%mm1, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1939 "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1940 |
| 787 | 1941 "movq (%%edx), %%mm2 \n\t" // |
| 1942 "movq (%%edx, %1), %%mm1 \n\t" // | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1943 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1944 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1945 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1946 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1947 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1948 "movq %%mm2, (%%edx) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1949 |
| 787 | 1950 "movq (%%edx, %1, 2), %%mm2 \n\t" // |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1951 "movq (%0, %1, 8), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1952 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1953 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1954 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1955 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1956 "pminub %%mm0, %%mm2 \n\t" |
| 787 | 1957 "movq %%mm2, (%%edx, %1, 2) \n\t" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1958 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1959 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1960 : : "r" (src), "r" (stride) |
| 787 | 1961 : "%eax", "%edx" |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1962 ); |
| 107 | 1963 |
| 1964 #else // MMX without MMX2 | |
| 1965 asm volatile( | |
| 1966 "leal (%0, %1), %%eax \n\t" | |
| 787 | 1967 "leal (%%eax, %1, 4), %%edx \n\t" |
| 107 | 1968 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 1969 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
| 107 | 1970 "pxor %%mm7, %%mm7 \n\t" |
| 1971 | |
| 1972 #define MEDIAN(a,b,c)\ | |
| 1973 "movq " #a ", %%mm0 \n\t"\ | |
| 1974 "movq " #b ", %%mm2 \n\t"\ | |
| 1975 "movq " #c ", %%mm1 \n\t"\ | |
| 1976 "movq %%mm0, %%mm3 \n\t"\ | |
| 1977 "movq %%mm1, %%mm4 \n\t"\ | |
| 1978 "movq %%mm2, %%mm5 \n\t"\ | |
| 1979 "psubusb %%mm1, %%mm3 \n\t"\ | |
| 1980 "psubusb %%mm2, %%mm4 \n\t"\ | |
| 1981 "psubusb %%mm0, %%mm5 \n\t"\ | |
| 1982 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
| 1983 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
| 1984 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
| 1985 "movq %%mm3, %%mm6 \n\t"\ | |
| 1986 "pxor %%mm4, %%mm3 \n\t"\ | |
| 1987 "pxor %%mm5, %%mm4 \n\t"\ | |
| 1988 "pxor %%mm6, %%mm5 \n\t"\ | |
| 1989 "por %%mm3, %%mm1 \n\t"\ | |
| 1990 "por %%mm4, %%mm2 \n\t"\ | |
| 1991 "por %%mm5, %%mm0 \n\t"\ | |
| 1992 "pand %%mm2, %%mm0 \n\t"\ | |
| 1993 "pand %%mm1, %%mm0 \n\t"\ | |
| 1994 "movq %%mm0, " #b " \n\t" | |
| 1995 | |
| 1996 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
| 1997 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
| 787 | 1998 MEDIAN((%0, %1, 4), (%%edx), (%%edx, %1)) |
| 1999 MEDIAN((%%edx, %1), (%%edx, %1, 2), (%0, %1, 8)) | |
| 107 | 2000 |
| 2001 : : "r" (src), "r" (stride) | |
| 787 | 2002 : "%eax", "%edx" |
| 107 | 2003 ); |
| 2004 #endif // MMX | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2005 #else |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2006 int x, y; |
| 142 | 2007 src+= 4*stride; |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2008 // FIXME - there should be a way to do a few columns in parallel like w/mmx |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2009 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2010 { |
|
1029
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2011 uint8_t *colsrc = src; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2012 for (y=0; y<4; y++) |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2013 { |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2014 int a, b, c, d, e, f; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2015 a = colsrc[0 ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2016 b = colsrc[stride ]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2017 c = colsrc[stride*2]; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2018 d = (a-b)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2019 e = (b-c)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2020 f = (c-a)>>31; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2021 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2022 colsrc += stride*2; |
|
804cc05a3f61
C implementation of the median deinterlacer (seems to be the only one
rfelker
parents:
957
diff
changeset
|
2023 } |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2024 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2025 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2026 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2027 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2028 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2029 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2030 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2031 * transposes and shift the given 8x8 Block into dst1 and dst2 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2032 */ |
| 169 | 2033 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2034 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2035 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2036 "leal (%0, %1), %%eax \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2037 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2038 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2039 "movq (%0), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2040 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2041 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2042 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2043 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2044 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2045 "movq (%%eax, %1), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2046 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2047 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2048 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2049 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2050 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2051 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2052 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2053 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2054 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2055 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2056 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2057 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2058 "movd %%mm0, 128(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2059 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2060 "movd %%mm0, 144(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2061 "movd %%mm3, 160(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2062 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2063 "movd %%mm3, 176(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2064 "movd %%mm3, 48(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2065 "movd %%mm2, 192(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2066 "movd %%mm2, 64(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2067 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2068 "movd %%mm2, 80(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2069 "movd %%mm1, 96(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2070 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2071 "movd %%mm1, 112(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2072 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2073 "leal (%%eax, %1, 4), %%eax \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2074 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2075 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2076 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2077 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2078 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2079 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2080 |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2081 "movq (%%eax, %1), %%mm1 \n\t" |
|
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2082 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2083 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2084 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2085 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2086 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2087 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2088 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2089 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2090 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2091 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2092 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2093 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2094 "movd %%mm0, 132(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2095 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2096 "movd %%mm0, 148(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2097 "movd %%mm3, 164(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2098 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2099 "movd %%mm3, 180(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2100 "movd %%mm3, 52(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2101 "movd %%mm2, 196(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2102 "movd %%mm2, 68(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2103 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2104 "movd %%mm2, 84(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2105 "movd %%mm1, 100(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2106 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2107 "movd %%mm1, 116(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2108 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2109 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2110 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
789
54079a650ba8
using fewer registers (fixes compilation bug hopefully)
michael
parents:
788
diff
changeset
|
2111 : "%eax" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2112 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2113 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2114 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2115 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2116 * transposes the given 8x8 block |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2117 */ |
| 169 | 2118 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2119 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2120 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2121 "leal (%0, %1), %%eax \n\t" |
| 787 | 2122 "leal (%%eax, %1, 4), %%edx \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2123 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2124 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2125 "movq (%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2126 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2127 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2128 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2129 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2130 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2131 "movq 32(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2132 "movq 48(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2133 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2134 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2135 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2136 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2137 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2138 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2139 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2140 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2141 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2142 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2143 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2144 "movd %%mm0, (%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2145 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2146 "movd %%mm0, (%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2147 "movd %%mm3, (%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2148 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2149 "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2150 "movd %%mm2, (%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2151 "psrlq $32, %%mm2 \n\t" |
| 787 | 2152 "movd %%mm2, (%%edx) \n\t" |
| 2153 "movd %%mm1, (%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2154 "psrlq $32, %%mm1 \n\t" |
| 787 | 2155 "movd %%mm1, (%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2156 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2157 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2158 "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2159 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2160 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2161 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2162 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2163 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2164 "movq 96(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2165 "movq 112(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2166 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2167 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2168 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2169 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2170 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2171 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2172 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2173 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2174 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2175 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2176 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2177 "movd %%mm0, 4(%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2178 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2179 "movd %%mm0, 4(%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2180 "movd %%mm3, 4(%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2181 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2182 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2183 "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2184 "psrlq $32, %%mm2 \n\t" |
| 787 | 2185 "movd %%mm2, 4(%%edx) \n\t" |
| 2186 "movd %%mm1, 4(%%edx, %1) \n\t" | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2187 "psrlq $32, %%mm1 \n\t" |
| 787 | 2188 "movd %%mm1, 4(%%edx, %1, 2) \n\t" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2189 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2190 :: "r" (dst), "r" (dstStride), "r" (src) |
| 787 | 2191 : "%eax", "%edx" |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2192 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2193 } |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2194 #endif |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2195 //static int test=0; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2196 |
| 943 | 2197 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 158 | 2198 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
| 156 | 2199 { |
| 787 | 2200 // to save a register (FIXME do this outside of the loops) |
| 2201 tempBluredPast[127]= maxNoise[0]; | |
| 2202 tempBluredPast[128]= maxNoise[1]; | |
| 2203 tempBluredPast[129]= maxNoise[2]; | |
| 2204 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2205 #define FAST_L2_DIFF |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2206 //#define L1_DIFF //u should change the thresholds too if u try that one |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2207 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2208 asm volatile( |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2209 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
| 787 | 2210 "leal (%2, %2, 4), %%edx \n\t" // 5*stride |
| 2211 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2212 // 0 1 2 3 4 5 6 7 8 9 |
| 787 | 2213 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2214 //FIXME reorder? |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2215 #ifdef L1_DIFF //needs mmx2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2216 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2217 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2218 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2219 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2220 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2221 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2222 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2223 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2224 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2225 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2226 "paddw %%mm1, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2227 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
| 787 | 2228 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2229 "paddw %%mm2, %%mm0 \n\t" |
| 787 | 2230 "psadbw (%1, %%edx), %%mm5 \n\t" // |L5-R5| |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2231 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2232 "paddw %%mm3, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2233 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2234 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2235 "paddw %%mm4, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2236 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2237 "paddw %%mm5, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2238 "paddw %%mm7, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2239 "paddw %%mm6, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2240 #elif defined (FAST_L2_DIFF) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2241 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 210 | 2242 "movq "MANGLE(b80)", %%mm6 \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2243 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2244 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2245 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 "pxor %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 PAVGB(%%mm2, %%mm5)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2249 "paddb %%mm6, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2250 "movq %%mm5, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2251 "psllw $8, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2252 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2253 "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2254 "paddd %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 "psrld $14, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2257 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2258 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2259 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2260 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2261 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2262 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2263 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2264 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2265 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2266 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2267 #else |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2268 "pxor %%mm7, %%mm7 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2269 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2270 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2271 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2272 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2273 "movq %%mm5, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2274 "movq %%mm2, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2275 "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2276 "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2277 "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2278 "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2279 "psubw %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2280 "psubw %%mm3, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2281 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2282 "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2283 "paddd %%mm1, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2284 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2285 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2286 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2287 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2288 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2289 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2290 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
| 787 | 2291 L2_DIFF_CORE((%0, %%edx), (%1, %%edx)) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2292 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2293 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2294 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2295 #endif |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2296 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2297 "movq %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2298 "psrlq $32, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 "paddd %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2300 "movd %%mm4, %%ecx \n\t" |
| 158 | 2301 "shll $2, %%ecx \n\t" |
| 787 | 2302 "movl %3, %%edx \n\t" |
| 2303 "addl -4(%%edx), %%ecx \n\t" | |
| 2304 "addl 4(%%edx), %%ecx \n\t" | |
| 2305 "addl -1024(%%edx), %%ecx \n\t" | |
| 158 | 2306 "addl $4, %%ecx \n\t" |
| 787 | 2307 "addl 1024(%%edx), %%ecx \n\t" |
| 158 | 2308 "shrl $3, %%ecx \n\t" |
| 787 | 2309 "movl %%ecx, (%%edx) \n\t" |
| 158 | 2310 |
| 210 | 2311 // "movl %3, %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2312 // "movl %%ecx, test \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2313 // "jmp 4f \n\t" |
| 787 | 2314 "cmpl 512(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2315 " jb 2f \n\t" |
| 787 | 2316 "cmpl 516(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2317 " jb 1f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2318 |
| 787 | 2319 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2320 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2321 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2322 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2323 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2324 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2325 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
| 787 | 2326 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2327 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2328 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2329 "movq %%mm0, (%1) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2330 "movq %%mm1, (%1, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2332 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2333 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
| 787 | 2334 "movq %%mm5, (%1, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2335 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2336 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2337 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2338 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2339 "1: \n\t" |
| 787 | 2340 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2341 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2342 "movq (%0), %%mm0 \n\t" // L0 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2343 PAVGB((%1), %%mm0) // L0 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2344 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2345 PAVGB((%1, %2), %%mm1) // L1 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2346 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2347 PAVGB((%1, %2, 2), %%mm2) // L2 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2348 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2349 PAVGB((%1, %%eax), %%mm3) // L3 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2350 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2351 PAVGB((%1, %2, 4), %%mm4) // L4 |
| 787 | 2352 "movq (%0, %%edx), %%mm5 \n\t" // L5 |
| 2353 PAVGB((%1, %%edx), %%mm5) // L5 | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2354 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2355 PAVGB((%1, %%eax, 2), %%mm6) // L6 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2356 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
363
ff766a367974
3dnow temporal denoiser bugfix by Rémi Guyomarch <rguyom@pobox.com>
michael
parents:
334
diff
changeset
|
2357 PAVGB((%1, %%ecx), %%mm7) // L7 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2358 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2359 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2360 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2361 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2362 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
| 787 | 2363 "movq %%mm5, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2364 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2365 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2366 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2367 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2368 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2369 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2370 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
| 787 | 2371 "movq %%mm5, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2372 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2373 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2374 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2375 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2376 "2: \n\t" |
| 787 | 2377 "cmpl 508(%%edx), %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2378 " jb 3f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2379 |
| 787 | 2380 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2381 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2382 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2383 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2384 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2385 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2386 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2387 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2388 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2389 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2390 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2391 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2392 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2393 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2394 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2395 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2396 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2397 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2398 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2399 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2400 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2401 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2402 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2403 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2404 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2405 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2406 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2407 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2408 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2409 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2410 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2411 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2412 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2413 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2414 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2415 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2416 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2417 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2418 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2419 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2420 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2421 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2422 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2423 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2424 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2425 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2426 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2427 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2428 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2429 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2430 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2431 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2432 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2433 "3: \n\t" |
| 787 | 2434 "leal (%%eax, %2, 2), %%edx \n\t" // 5*stride |
| 2435 "leal (%%edx, %2, 2), %%ecx \n\t" // 7*stride | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2436 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2437 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2438 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2439 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2440 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2441 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2442 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2443 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2444 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2445 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2446 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2447 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2448 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2449 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2450 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2451 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2452 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2453 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2454 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2455 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2456 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2457 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2458 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2459 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2460 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2461 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2462 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2463 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2464 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2465 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
| 787 | 2466 "movq (%0, %%edx), %%mm1 \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2467 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2468 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2469 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
| 787 | 2470 "movq (%1, %%edx), %%mm5 \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2471 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2472 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2473 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2474 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2475 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2476 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2477 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2478 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2479 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2480 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2481 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2482 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2483 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2484 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2485 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
| 787 | 2486 "movq %%mm1, (%1, %%edx) \n\t" // R5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2487 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2488 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2489 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
| 787 | 2490 "movq %%mm1, (%0, %%edx) \n\t" // L5 |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2491 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2492 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2493 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2494 "4: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2495 |
| 158 | 2496 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
| 787 | 2497 : "%eax", "%edx", "%ecx", "memory" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2498 ); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2499 //printf("%d\n", test); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2500 #else |
| 788 | 2501 { |
| 156 | 2502 int y; |
| 2503 int d=0; | |
| 2504 int sysd=0; | |
| 158 | 2505 int i; |
| 156 | 2506 |
| 2507 for(y=0; y<8; y++) | |
| 2508 { | |
| 2509 int x; | |
| 2510 for(x=0; x<8; x++) | |
| 2511 { | |
| 2512 int ref= tempBlured[ x + y*stride ]; | |
| 2513 int cur= src[ x + y*stride ]; | |
| 2514 int d1=ref - cur; | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2515 // if(x==0 || x==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2516 // if(y==0 || y==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2517 // d+= ABS(d1); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2518 d+= d1*d1; |
| 156 | 2519 sysd+= d1; |
| 2520 } | |
| 2521 } | |
| 158 | 2522 i=d; |
| 2523 d= ( | |
| 2524 4*d | |
| 2525 +(*(tempBluredPast-256)) | |
| 2526 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 2527 +(*(tempBluredPast+256)) | |
| 2528 +4)>>3; | |
| 2529 *tempBluredPast=i; | |
| 2530 // ((*tempBluredPast)*3 + d + 2)>>2; | |
| 2531 | |
| 156 | 2532 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
| 2533 /* | |
| 2534 Switch between | |
| 2535 1 0 0 0 0 0 0 (0) | |
| 2536 64 32 16 8 4 2 1 (1) | |
| 2537 64 48 36 27 20 15 11 (33) (approx) | |
| 2538 64 56 49 43 37 33 29 (200) (approx) | |
| 2539 */ | |
| 2540 if(d > maxNoise[1]) | |
| 2541 { | |
| 2542 if(d < maxNoise[2]) | |
| 2543 { | |
| 2544 for(y=0; y<8; y++) | |
| 2545 { | |
| 2546 int x; | |
| 2547 for(x=0; x<8; x++) | |
| 2548 { | |
| 2549 int ref= tempBlured[ x + y*stride ]; | |
| 2550 int cur= src[ x + y*stride ]; | |
| 2551 tempBlured[ x + y*stride ]= | |
| 2552 src[ x + y*stride ]= | |
| 2553 (ref + cur + 1)>>1; | |
| 2554 } | |
| 2555 } | |
| 2556 } | |
| 2557 else | |
| 2558 { | |
| 2559 for(y=0; y<8; y++) | |
| 2560 { | |
| 2561 int x; | |
| 2562 for(x=0; x<8; x++) | |
| 2563 { | |
| 2564 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
| 2565 } | |
| 2566 } | |
| 2567 } | |
| 2568 } | |
| 2569 else | |
| 2570 { | |
| 2571 if(d < maxNoise[0]) | |
| 2572 { | |
| 2573 for(y=0; y<8; y++) | |
| 2574 { | |
| 2575 int x; | |
| 2576 for(x=0; x<8; x++) | |
| 2577 { | |
| 2578 int ref= tempBlured[ x + y*stride ]; | |
| 2579 int cur= src[ x + y*stride ]; | |
| 2580 tempBlured[ x + y*stride ]= | |
| 2581 src[ x + y*stride ]= | |
| 2582 (ref*7 + cur + 4)>>3; | |
| 2583 } | |
| 2584 } | |
| 2585 } | |
| 2586 else | |
| 2587 { | |
| 2588 for(y=0; y<8; y++) | |
| 2589 { | |
| 2590 int x; | |
| 2591 for(x=0; x<8; x++) | |
| 2592 { | |
| 2593 int ref= tempBlured[ x + y*stride ]; | |
| 2594 int cur= src[ x + y*stride ]; | |
| 2595 tempBlured[ x + y*stride ]= | |
| 2596 src[ x + y*stride ]= | |
| 2597 (ref*3 + cur + 2)>>2; | |
| 2598 } | |
| 2599 } | |
| 2600 } | |
| 2601 } | |
| 788 | 2602 } |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2603 #endif |
| 156 | 2604 } |
| 2605 | |
| 169 | 2606 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 2607 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
| 96 | 2608 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2609 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2610 * Copies a block from src to dst and fixes the blacklevel |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2611 * levelFix == 0 -> dont touch the brighness & contrast |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2612 */ |
|
634
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
2613 #undef SCALED_CPY |
|
be1cb0e1f276
warning fixes by Dominik Mierzejewski <dominik@rangers.eu.org>
arpi
parents:
600
diff
changeset
|
2614 |
| 169 | 2615 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 787 | 2616 int levelFix, int64_t *packedOffsetAndScale) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2617 { |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2618 #ifndef HAVE_MMX |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2619 int i; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2620 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2621 if(levelFix) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2622 { |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2623 #ifdef HAVE_MMX |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2624 asm volatile( |
| 787 | 2625 "movq (%%eax), %%mm2 \n\t" // packedYOffset |
| 2626 "movq 8(%%eax), %%mm3 \n\t" // packedYScale | |
| 2627 "leal (%2,%4), %%eax \n\t" | |
| 2628 "leal (%3,%5), %%edx \n\t" | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2629 "pxor %%mm4, %%mm4 \n\t" |
| 173 | 2630 #ifdef HAVE_MMX2 |
| 2631 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
| 2632 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2633 "movq " #src1 ", %%mm5 \n\t"\ | |
| 2634 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2635 "movq " #src2 ", %%mm6 \n\t"\ | |
| 2636 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
| 2637 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
| 2638 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
| 2639 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
| 2640 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
| 2641 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 2642 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
| 2643 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
| 2644 "psubw %%mm2, %%mm0 \n\t"\ | |
| 2645 "psubw %%mm2, %%mm5 \n\t"\ | |
| 2646 "psubw %%mm2, %%mm1 \n\t"\ | |
| 2647 "psubw %%mm2, %%mm6 \n\t"\ | |
| 2648 "packuswb %%mm5, %%mm0 \n\t"\ | |
| 2649 "packuswb %%mm6, %%mm1 \n\t"\ | |
| 2650 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2651 "movq %%mm1, " #dst2 " \n\t"\ | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2652 |
| 173 | 2653 #else //HAVE_MMX2 |
| 166 | 2654 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
| 2655 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2656 "movq " #src1 ", %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2657 "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2658 "punpckhbw %%mm4, %%mm5 \n\t"\ |
| 117 | 2659 "psubw %%mm2, %%mm0 \n\t"\ |
| 2660 "psubw %%mm2, %%mm5 \n\t"\ | |
| 166 | 2661 "movq " #src2 ", %%mm1 \n\t"\ |
| 117 | 2662 "psllw $6, %%mm0 \n\t"\ |
| 2663 "psllw $6, %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2664 "pmulhw %%mm3, %%mm0 \n\t"\ |
| 166 | 2665 "movq " #src2 ", %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2666 "pmulhw %%mm3, %%mm5 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2667 "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2668 "punpckhbw %%mm4, %%mm6 \n\t"\ |
| 117 | 2669 "psubw %%mm2, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2670 "psubw %%mm2, %%mm6 \n\t"\ |
| 117 | 2671 "psllw $6, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2672 "psllw $6, %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2673 "pmulhw %%mm3, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2674 "pmulhw %%mm3, %%mm6 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2675 "packuswb %%mm5, %%mm0 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2676 "packuswb %%mm6, %%mm1 \n\t"\ |
| 166 | 2677 "movq %%mm0, " #dst1 " \n\t"\ |
| 2678 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2679 | |
| 173 | 2680 #endif //!HAVE_MMX2 |
| 2681 | |
| 787 | 2682 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
| 2683 SCALED_CPY((%2, %4, 2), (%%eax, %4, 2), (%3, %5, 2), (%%edx, %5, 2)) | |
| 2684 SCALED_CPY((%2, %4, 4), (%%eax, %4, 4), (%3, %5, 4), (%%edx, %5, 4)) | |
| 2685 "leal (%%eax,%4,4), %%eax \n\t" | |
| 2686 "leal (%%edx,%5,4), %%edx \n\t" | |
| 2687 SCALED_CPY((%%eax, %4), (%%eax, %4, 2), (%%edx, %5), (%%edx, %5, 2)) | |
| 166 | 2688 |
| 2689 | |
| 787 | 2690 : "=&a" (packedOffsetAndScale) |
| 2691 : "0" (packedOffsetAndScale), | |
| 2692 "r"(src), | |
| 166 | 2693 "r"(dst), |
| 2694 "r" (srcStride), | |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2695 "r" (dstStride) |
| 787 | 2696 : "%edx" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2697 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2698 #else |
| 164 | 2699 for(i=0; i<8; i++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2700 memcpy( &(dst[dstStride*i]), |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2701 &(src[srcStride*i]), BLOCK_SIZE); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2702 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2703 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2704 else |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2705 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2706 #ifdef HAVE_MMX |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2707 asm volatile( |
| 166 | 2708 "leal (%0,%2), %%eax \n\t" |
| 787 | 2709 "leal (%1,%3), %%edx \n\t" |
| 166 | 2710 |
| 2711 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
| 2712 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2713 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2714 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2715 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2716 | |
| 2717 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 787 | 2718 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%edx, %3, 2)) |
| 2719 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%edx, %3, 4)) | |
| 166 | 2720 "leal (%%eax,%2,4), %%eax \n\t" |
| 787 | 2721 "leal (%%edx,%3,4), %%edx \n\t" |
| 2722 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%edx, %3), (%%edx, %3, 2)) | |
| 166 | 2723 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2724 : : "r" (src), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2725 "r" (dst), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2726 "r" (srcStride), |
| 164 | 2727 "r" (dstStride) |
| 787 | 2728 : "%eax", "%edx" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2729 ); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2730 #else |
| 164 | 2731 for(i=0; i<8; i++) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2732 memcpy( &(dst[dstStride*i]), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2733 &(src[srcStride*i]), BLOCK_SIZE); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2734 #endif |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2735 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2736 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2737 |
| 224 | 2738 /** |
| 2739 * Duplicates the given 8 src pixels ? times upward | |
| 2740 */ | |
| 2741 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
| 2742 { | |
| 2743 #ifdef HAVE_MMX | |
| 2744 asm volatile( | |
| 2745 "movq (%0), %%mm0 \n\t" | |
| 2746 "addl %1, %0 \n\t" | |
| 2747 "movq %%mm0, (%0) \n\t" | |
| 2748 "movq %%mm0, (%0, %1) \n\t" | |
| 2749 "movq %%mm0, (%0, %1, 2) \n\t" | |
| 2750 : "+r" (src) | |
| 2751 : "r" (-stride) | |
| 2752 ); | |
| 2753 #else | |
| 2754 int i; | |
| 2755 uint8_t *p=src; | |
| 2756 for(i=0; i<3; i++) | |
| 2757 { | |
| 2758 p-= stride; | |
| 2759 memcpy(p, src, 8); | |
| 2760 } | |
| 2761 #endif | |
| 2762 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2763 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2764 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2765 * Filters array of bytes (Y or U or V values) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2766 */ |
| 169 | 2767 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 787 | 2768 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2769 { |
| 787 | 2770 PPContext __attribute__((aligned(8))) c= *c2; //copy to stack for faster access |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2771 int x,y; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2772 #ifdef COMPILE_TIME_MODE |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2773 const int mode= COMPILE_TIME_MODE; |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2774 #else |
| 787 | 2775 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2776 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2777 int black=0, white=255; // blackest black and whitest white in the picture |
| 223 | 2778 int QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2779 |
|
886
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2780 int copyAhead; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2781 #ifdef HAVE_MMX |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2782 int i; |
|
3abff5a87548
warning patch by (Dominik Mierzejewski <dominik at rangers dot eu dot org>)
michael
parents:
810
diff
changeset
|
2783 #endif |
| 164 | 2784 |
| 957 | 2785 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
| 2786 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | |
| 2787 | |
| 787 | 2788 //FIXME remove |
| 2789 uint64_t * const yHistogram= c.yHistogram; | |
| 2790 uint8_t * const tempSrc= c.tempSrc; | |
| 2791 uint8_t * const tempDst= c.tempDst; | |
| 791 | 2792 const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2793 |
| 158 | 2794 #ifdef HAVE_MMX |
| 1724 | 2795 for(i=0; i<57; i++){ |
| 791 | 2796 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
| 2797 int threshold= offset*2 + 1; | |
| 2798 c.mmxDcOffset[i]= 0x7F - offset; | |
| 2799 c.mmxDcThreshold[i]= 0x7F - threshold; | |
| 2800 c.mmxDcOffset[i]*= 0x0101010101010101LL; | |
| 2801 c.mmxDcThreshold[i]*= 0x0101010101010101LL; | |
| 2802 } | |
| 158 | 2803 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2804 |
| 164 | 2805 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
| 787 | 2806 else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
| 1157 | 2807 || (mode & FFMPEG_DEINT_FILTER) |
| 2808 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; | |
| 164 | 2809 else if( (mode & V_DEBLOCK) |
| 2810 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
| 2811 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
| 2812 else if(mode & V_X1_FILTER) copyAhead=11; | |
| 787 | 2813 // else if(mode & V_RK1_FILTER) copyAhead=10; |
| 164 | 2814 else if(mode & DERING) copyAhead=9; |
| 2815 else copyAhead=8; | |
| 2816 | |
| 2817 copyAhead-= 8; | |
| 2818 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2819 if(!isColor) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2820 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2821 uint64_t sum= 0; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2822 int i; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2823 uint64_t maxClipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2824 uint64_t clipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2825 double scale; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2826 |
| 787 | 2827 c.frameNum++; |
| 2828 // first frame is fscked so we ignore it | |
| 2829 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2830 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2831 for(i=0; i<256; i++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2832 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2833 sum+= yHistogram[i]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2834 // printf("%d ", yHistogram[i]); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2835 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2836 // printf("\n\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2837 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2838 /* we always get a completely black picture first */ |
| 793 | 2839 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2840 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2841 clipped= sum; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2842 for(black=255; black>0; black--) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2843 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2844 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2845 clipped-= yHistogram[black]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2846 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2847 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2848 clipped= sum; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2849 for(white=0; white<256; white++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2850 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2851 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2852 clipped-= yHistogram[white]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2853 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2854 |
| 787 | 2855 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
| 173 | 2856 |
| 2857 #ifdef HAVE_MMX2 | |
| 787 | 2858 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
| 2859 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 2860 #else |
| 787 | 2861 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
| 2862 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | |
| 173 | 2863 #endif |
| 2864 | |
| 787 | 2865 c.packedYOffset|= c.packedYOffset<<32; |
| 2866 c.packedYOffset|= c.packedYOffset<<16; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2867 |
| 787 | 2868 c.packedYScale|= c.packedYScale<<32; |
| 2869 c.packedYScale|= c.packedYScale<<16; | |
| 223 | 2870 |
| 2871 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
| 2872 else QPCorrecture= 256*256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2873 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2874 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2875 { |
| 787 | 2876 c.packedYScale= 0x0100010001000100LL; |
| 2877 c.packedYOffset= 0; | |
| 223 | 2878 QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2879 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2880 |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
2881 /* copy & deinterlace first row of blocks */ |
| 142 | 2882 y=-BLOCK_SIZE; |
| 2883 { | |
| 2884 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 224 | 2885 uint8_t *dstBlock= tempDst + dstStride; |
| 142 | 2886 |
| 2887 // From this point on it is guranteed that we can read and write 16 lines downward | |
| 2888 // finish 1 block before the next otherwise we´ll might have a problem | |
| 2889 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | |
| 2890 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 2891 { | |
| 2892 | |
| 2893 #ifdef HAVE_MMX2 | |
| 2894 /* | |
| 2895 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 2896 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 2897 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 2898 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 2899 */ | |
| 2900 | |
| 2901 asm( | |
| 2902 "movl %4, %%eax \n\t" | |
| 2903 "shrl $2, %%eax \n\t" | |
| 2904 "andl $6, %%eax \n\t" | |
| 164 | 2905 "addl %5, %%eax \n\t" |
| 787 | 2906 "movl %%eax, %%edx \n\t" |
| 142 | 2907 "imul %1, %%eax \n\t" |
| 787 | 2908 "imul %3, %%edx \n\t" |
| 142 | 2909 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 2910 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 2911 "addl %1, %%eax \n\t" |
| 787 | 2912 "addl %3, %%edx \n\t" |
| 142 | 2913 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 2914 "prefetcht0 32(%%edx, %2) \n\t" |
| 142 | 2915 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 2916 "m" (x), "m" (copyAhead) |
| 787 | 2917 : "%eax", "%edx" |
| 142 | 2918 ); |
| 2919 | |
| 2920 #elif defined(HAVE_3DNOW) | |
| 2921 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
| 2922 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 2923 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 2924 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 2925 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 2926 */ | |
| 2927 #endif | |
| 2928 | |
| 224 | 2929 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
| 787 | 2930 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
| 224 | 2931 |
| 2932 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
| 142 | 2933 |
| 2934 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 169 | 2935 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 142 | 2936 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 1581 | 2937 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
| 142 | 2938 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 2939 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 142 | 2940 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 2941 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 2942 else if(mode & FFMPEG_DEINT_FILTER) |
| 2943 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 2944 else if(mode & LOWPASS5_DEINT_FILTER) |
| 2945 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 142 | 2946 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 2947 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
| 142 | 2948 */ |
| 2949 dstBlock+=8; | |
| 2950 srcBlock+=8; | |
| 2951 } | |
| 941 | 2952 if(width==dstStride) |
| 2953 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride); | |
| 2954 else | |
| 2955 { | |
| 943 | 2956 int i; |
| 941 | 2957 for(i=0; i<copyAhead; i++) |
| 2958 { | |
| 2959 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); | |
| 2960 } | |
| 2961 } | |
| 142 | 2962 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2963 |
| 787 | 2964 //printf("\n"); |
| 111 | 2965 for(y=0; y<height; y+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2966 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2967 //1% speedup if these are here instead of the inner loop |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2968 uint8_t *srcBlock= &(src[y*srcStride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2969 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 169 | 2970 #ifdef HAVE_MMX |
| 787 | 2971 uint8_t *tempBlock1= c.tempBlocks; |
| 2972 uint8_t *tempBlock2= c.tempBlocks + 8; | |
| 169 | 2973 #endif |
| 957 | 2974 int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
| 1196 | 2975 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*QPStride]; |
| 156 | 2976 int QP=0; |
| 130 | 2977 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 2978 if not then use a temporary buffer */ | |
| 111 | 2979 if(y+15 >= height) |
| 2980 { | |
| 156 | 2981 int i; |
| 164 | 2982 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
| 111 | 2983 blockcopy to dst later */ |
| 164 | 2984 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
| 2985 srcStride*MAX(height-y-copyAhead, 0) ); | |
| 2986 | |
| 2987 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | |
| 2988 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
| 156 | 2989 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
| 2990 | |
| 164 | 2991 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
| 2992 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
| 2993 | |
| 2994 /* duplicate last line of dst to fill the void up to line (copyAhead) */ | |
| 2995 for(i=height-y+1; i<=copyAhead; i++) | |
| 156 | 2996 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
| 2997 | |
| 130 | 2998 dstBlock= tempDst + dstStride; |
| 111 | 2999 srcBlock= tempSrc; |
| 3000 } | |
| 787 | 3001 //printf("\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3002 |
| 112 | 3003 // From this point on it is guaranteed that we can read and write 16 lines downward |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3004 // finish 1 block before the next otherwise we might have a problem |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3005 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3006 for(x=0; x<width; x+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3007 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3008 const int stride= dstStride; |
| 169 | 3009 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3010 uint8_t *tmpXchg; |
| 169 | 3011 #endif |
| 791 | 3012 if(isColor) |
| 121 | 3013 { |
| 957 | 3014 QP= QPptr[x>>qpHShift]; |
| 3015 c.nonBQP= nonBQPptr[x>>qpHShift]; | |
| 791 | 3016 } |
| 3017 else | |
| 3018 { | |
| 3019 QP= QPptr[x>>4]; | |
| 223 | 3020 QP= (QP* QPCorrecture + 256*128)>>16; |
| 791 | 3021 c.nonBQP= nonBQPptr[x>>4]; |
| 3022 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
3023 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
| 121 | 3024 } |
| 787 | 3025 c.QP= QP; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3026 #ifdef HAVE_MMX |
| 111 | 3027 asm volatile( |
| 787 | 3028 "movd %1, %%mm7 \n\t" |
| 111 | 3029 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 3030 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 3031 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 787 | 3032 "movq %%mm7, %0 \n\t" |
| 3033 : "=m" (c.pQPb) | |
| 3034 : "r" (QP) | |
| 111 | 3035 ); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3036 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3037 |
| 96 | 3038 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3039 #ifdef HAVE_MMX2 |
| 126 | 3040 /* |
| 3041 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3042 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3043 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3044 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3045 */ | |
| 3046 | |
| 3047 asm( | |
| 3048 "movl %4, %%eax \n\t" | |
| 3049 "shrl $2, %%eax \n\t" | |
| 3050 "andl $6, %%eax \n\t" | |
| 164 | 3051 "addl %5, %%eax \n\t" |
| 787 | 3052 "movl %%eax, %%edx \n\t" |
| 126 | 3053 "imul %1, %%eax \n\t" |
| 787 | 3054 "imul %3, %%edx \n\t" |
| 126 | 3055 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3056 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3057 "addl %1, %%eax \n\t" |
| 787 | 3058 "addl %3, %%edx \n\t" |
| 126 | 3059 "prefetchnta 32(%%eax, %0) \n\t" |
| 787 | 3060 "prefetcht0 32(%%edx, %2) \n\t" |
| 126 | 3061 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 164 | 3062 "m" (x), "m" (copyAhead) |
| 787 | 3063 : "%eax", "%edx" |
| 126 | 3064 ); |
| 3065 | |
| 96 | 3066 #elif defined(HAVE_3DNOW) |
| 3067 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
| 111 | 3068 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 3069 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3070 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3071 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 96 | 3072 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3073 #endif |
| 111 | 3074 |
| 169 | 3075 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
| 787 | 3076 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3077 |
| 111 | 3078 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 169 | 3079 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 111 | 3080 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 1581 | 3081 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
| 111 | 3082 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 3083 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 111 | 3084 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 3085 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 787 | 3086 else if(mode & FFMPEG_DEINT_FILTER) |
| 3087 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); | |
| 1157 | 3088 else if(mode & LOWPASS5_DEINT_FILTER) |
| 3089 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); | |
| 111 | 3090 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 3091 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
3092 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3093 |
| 111 | 3094 /* only deblock if we have 2 blocks */ |
| 3095 if(y + 8 < height) | |
| 3096 { | |
| 787 | 3097 if(mode & V_X1_FILTER) |
| 3098 RENAME(vertX1Filter)(dstBlock, stride, &c); | |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3099 else if(mode & V_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3100 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3101 const int t= RENAME(vertClassify)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3102 |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3103 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3104 RENAME(doVertLowPass)(dstBlock, stride, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3105 else if(t==2) |
| 787 | 3106 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3107 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3108 } |
| 130 | 3109 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3110 #ifdef HAVE_MMX |
| 169 | 3111 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3112 #endif |
| 111 | 3113 /* check if we have a previous block to deblock it with dstBlock */ |
| 112 | 3114 if(x - 8 >= 0) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3115 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3116 #ifdef HAVE_MMX |
| 787 | 3117 if(mode & H_X1_FILTER) |
| 3118 RENAME(vertX1Filter)(tempBlock1, 16, &c); | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3119 else if(mode & H_DEBLOCK) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3120 { |
|
1327
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3121 //START_TIMER |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3122 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3123 //STOP_TIMER("dc & minmax") |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3124 if(t==1) |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3125 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
|
854571532c89
blinking blocks around thin vertical lines and dots bugfix
michaelni
parents:
1196
diff
changeset
|
3126 else if(t==2) |
| 787 | 3127 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3128 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3129 |
| 169 | 3130 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3131 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3132 #else |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3133 if(mode & H_X1_FILTER) |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3134 horizX1Filter(dstBlock-4, stride, QP); |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3135 else if(mode & H_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3136 { |
| 787 | 3137 if( isHorizDC(dstBlock-4, stride, &c)) |
| 96 | 3138 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3139 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3140 doHorizLowPass(dstBlock-4, stride, QP); |
| 96 | 3141 } |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3142 else |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3143 doHorizDefFilter(dstBlock-4, stride, QP); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3144 } |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3145 #endif |
| 130 | 3146 if(mode & DERING) |
| 3147 { | |
| 3148 //FIXME filter first line | |
| 787 | 3149 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
| 130 | 3150 } |
| 156 | 3151 |
| 3152 if(mode & TEMP_NOISE_FILTER) | |
| 3153 { | |
| 169 | 3154 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
| 787 | 3155 c.tempBlured[isColor] + y*dstStride + x, |
| 3156 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3157 c.ppMode.maxTmpNoise); | |
| 156 | 3158 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3159 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3160 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3161 dstBlock+=8; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3162 srcBlock+=8; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3163 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3164 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3165 tmpXchg= tempBlock1; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3166 tempBlock1= tempBlock2; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3167 tempBlock2 = tmpXchg; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3168 #endif |
| 111 | 3169 } |
| 3170 | |
| 156 | 3171 if(mode & DERING) |
| 3172 { | |
| 787 | 3173 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
| 156 | 3174 } |
| 3175 | |
| 3176 if((mode & TEMP_NOISE_FILTER)) | |
| 3177 { | |
| 169 | 3178 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
| 787 | 3179 c.tempBlured[isColor] + y*dstStride + x, |
| 3180 c.tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3181 c.ppMode.maxTmpNoise); | |
| 156 | 3182 } |
| 3183 | |
| 142 | 3184 /* did we use a tmp buffer for the last lines*/ |
| 112 | 3185 if(y+15 >= height) |
| 111 | 3186 { |
| 3187 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 941 | 3188 if(width==dstStride) |
| 3189 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y)); | |
| 3190 else | |
| 3191 { | |
| 944 | 3192 int i; |
| 941 | 3193 for(i=0; i<height-y; i++) |
| 3194 { | |
| 3195 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); | |
| 3196 } | |
| 3197 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3198 } |
| 163 | 3199 /* |
| 3200 for(x=0; x<width; x+=32) | |
| 3201 { | |
| 164 | 3202 volatile int i; |
| 163 | 3203 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
| 3204 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
| 164 | 3205 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 3206 // + dstBlock[x +13*dstStride] | |
| 3207 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
| 3208 }*/ | |
| 3209 } | |
| 96 | 3210 #ifdef HAVE_3DNOW |
| 3211 asm volatile("femms"); | |
| 3212 #elif defined (HAVE_MMX) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3213 asm volatile("emms"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3214 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3215 |
| 163 | 3216 #ifdef DEBUG_BRIGHTNESS |
| 3217 if(!isColor) | |
| 3218 { | |
| 3219 int max=1; | |
| 3220 int i; | |
| 3221 for(i=0; i<256; i++) | |
| 3222 if(yHistogram[i] > max) max=yHistogram[i]; | |
| 3223 | |
| 3224 for(i=1; i<256; i++) | |
| 3225 { | |
| 3226 int x; | |
| 3227 int start=yHistogram[i-1]/(max/256+1); | |
| 3228 int end=yHistogram[i]/(max/256+1); | |
| 3229 int inc= end > start ? 1 : -1; | |
| 3230 for(x=start; x!=end+inc; x+=inc) | |
| 3231 dst[ i*dstStride + x]+=128; | |
| 3232 } | |
| 3233 | |
| 3234 for(i=0; i<100; i+=2) | |
| 3235 { | |
| 3236 dst[ (white)*dstStride + i]+=128; | |
| 3237 dst[ (black)*dstStride + i]+=128; | |
| 3238 } | |
| 3239 | |
| 3240 } | |
| 3241 #endif | |
| 3242 | |
| 787 | 3243 *c2= c; //copy local context back |
| 3244 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3245 } |
