Mercurial > libavcodec.hg
annotate libpostproc/postprocess_template.c @ 229:f418b5c5ff67 libavcodec
PATCH by Rik Snel <rsnel@cube.dyndns.org>
this patch enhances the jpeg header writer. It can be asked to omit
quantisation and huffman tables and it can write different horizontal and
vertical sampling factors. (the last thing is useless for libavcodec
itself (because libavcodec only handles YUV420P at encoder level), but the
values are initialized so that operation of libavcodec is not impaired)
| author | arpi_esp |
|---|---|
| date | Sat, 09 Feb 2002 01:23:41 +0000 |
| parents | 8b3e70afa2ba |
| children | 3912b37ba121 |
| rev | line source |
|---|---|
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1 /* |
| 223 | 2 Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
4 This program is free software; you can redistribute it and/or modify |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
5 it under the terms of the GNU General Public License as published by |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
6 the Free Software Foundation; either version 2 of the License, or |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
7 (at your option) any later version. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
8 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
9 This program is distributed in the hope that it will be useful, |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
10 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
12 GNU General Public License for more details. |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
14 You should have received a copy of the GNU General Public License |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
15 along with this program; if not, write to the Free Software |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
17 */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
18 |
/*
 * Instruction-string helpers for the inline assembly in this template.
 * They are undefined first so that every inclusion of the template starts
 * from a clean slate and picks the variant matching the HAVE_* macros that
 * are set at that point.
 */
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/*
 * PAVGB(a,b): emit a packed byte average of a and b, result in b.
 * Only available with MMX2 (pavgb) or 3DNow! (pavgusb); left undefined
 * otherwise.
 */
#if defined (HAVE_MMX2)
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/*
 * PMINUB(x,y,t): emit an unsigned byte minimum, result in the second operand.
 * MMX2 has a native instruction and ignores the scratch register t.
 * Plain MMX emulates it: t = y; t = sat(t - x); y -= t.
 * The parameter names are swapped in the emulation so that the destination
 * is still the second argument.
 */
#if defined (HAVE_MMX2)
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMINUB(b,a,t) \
	"movq " #a ", " #t " \n\t"\
	"psubusb " #b ", " #t " \n\t"\
	"psubb " #t ", " #a " \n\t"
#endif

/*
 * PMAXUB(a,b): emit an unsigned byte maximum, result in b.
 * Plain MMX emulates it: b = sat(b - a); b += a.
 */
#if defined (HAVE_MMX2)
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif defined (HAVE_MMX)
#define PMAXUB(a,b) \
	"psubusb " #a ", " #b " \n\t"\
	"paddb " #a ", " #b " \n\t"
#endif
| 45 | |
| 46 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
47 //FIXME? |255-0| = 1 (shouldn't be a problem ...) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
48 /** |
| 111 | 49 * Check if the middle 8x8 Block in the given 8x16 block is flat |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
50 */ |
| 169 | 51 static inline int RENAME(isVertDC)(uint8_t src[], int stride){ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
52 int numEq= 0; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
53 #ifndef HAVE_MMX |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
54 int y; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
55 #endif |
| 111 | 56 src+= stride*4; // src points to begin of the 8x8 Block |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
57 #ifdef HAVE_MMX |
| 119 | 58 asm volatile( |
| 59 "leal (%1, %2), %%eax \n\t" | |
| 60 "leal (%%eax, %2, 4), %%ebx \n\t" | |
| 61 // 0 1 2 3 4 5 6 7 8 9 | |
| 62 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | |
| 210 | 63 "movq "MANGLE(mmxDCOffset)", %%mm7 \n\t" // mm7 = 0x7F |
| 64 "movq "MANGLE(mmxDCThreshold)", %%mm6 \n\t" // mm6 = 0x7D | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
65 "movq (%1), %%mm0 \n\t" |
| 119 | 66 "movq (%%eax), %%mm1 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
67 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
68 "paddb %%mm7, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
69 "pcmpgtb %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
70 |
| 119 | 71 "movq (%%eax,%2), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
72 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
73 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
74 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
75 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
76 |
| 119 | 77 "movq (%%eax, %2, 2), %%mm1 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
78 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
79 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
80 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
81 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
82 |
| 119 | 83 "movq (%1, %2, 4), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
84 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
85 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
86 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
87 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
88 |
| 119 | 89 "movq (%%ebx), %%mm1 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
90 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
91 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
92 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
93 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
94 |
| 119 | 95 "movq (%%ebx, %2), %%mm2 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
96 "psubb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
97 "paddb %%mm7, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
98 "pcmpgtb %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
99 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
100 |
| 119 | 101 "movq (%%ebx, %2, 2), %%mm1 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
102 "psubb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
103 "paddb %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
104 "pcmpgtb %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
105 "paddb %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
106 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
107 " \n\t" |
| 167 | 108 #ifdef HAVE_MMX2 |
| 109 "pxor %%mm7, %%mm7 \n\t" | |
| 110 "psadbw %%mm7, %%mm0 \n\t" | |
| 111 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
112 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
113 "psrlw $8, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
114 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
115 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
116 "psrlq $16, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
117 "paddb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
118 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
119 "psrlq $32, %%mm0 \n\t" |
| 167 | 120 "paddb %%mm1, %%mm0 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
121 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
122 "movd %%mm0, %0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
123 : "=r" (numEq) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
124 : "r" (src), "r" (stride) |
| 167 | 125 : "%ebx" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
126 ); |
| 167 | 127 numEq= (-numEq) &0xFF; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
128 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
129 #else |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
130 for(y=0; y<BLOCK_SIZE-1; y++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
131 { |
| 181 | 132 if(((src[0] - src[0+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; |
| 133 if(((src[1] - src[1+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 134 if(((src[2] - src[2+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 135 if(((src[3] - src[3+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 136 if(((src[4] - src[4+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 137 if(((src[5] - src[5+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 138 if(((src[6] - src[6+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
| 139 if(((src[7] - src[7+stride] + dcOffset)&0xFFFF) < dcThreshold) numEq++; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
140 src+= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
141 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
142 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
143 /* if(abs(numEq - asmEq) > 0) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
144 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
145 printf("\nasm:%d c:%d\n", asmEq, numEq); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
146 for(int y=0; y<8; y++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
147 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
148 for(int x=0; x<8; x++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
149 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
150 printf("%d ", temp[x + y*stride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
151 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
152 printf("\n"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
153 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
154 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
155 */ |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
156 // for(int i=0; i<numEq/8; i++) src[i]=255; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
157 return (numEq > vFlatnessThreshold) ? 1 : 0; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
158 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
159 |
| 169 | 160 static inline int RENAME(isVertMinMaxOk)(uint8_t src[], int stride, int QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
161 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
162 #ifdef HAVE_MMX |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
163 int isOk; |
| 111 | 164 src+= stride*3; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
165 asm volatile( |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
166 // "int $3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
167 "movq (%1, %2), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
168 "movq (%1, %2, 8), %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
169 "movq %%mm0, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
170 "psubusb %%mm1, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
171 "psubusb %%mm2, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
172 "por %%mm1, %%mm0 \n\t" // ABS Diff |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
173 |
| 210 | 174 "movq "MANGLE(pQPb)", %%mm7 \n\t" // QP,..., QP |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
175 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
176 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 |
| 210 | 177 "pcmpeqd "MANGLE(b00)", %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
178 "psrlq $16, %%mm0 \n\t" |
| 210 | 179 "pcmpeqd "MANGLE(bFF)", %%mm0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
180 // "movd %%mm0, (%1, %2, 4)\n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
181 "movd %%mm0, %0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
182 : "=r" (isOk) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
183 : "r" (src), "r" (stride) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
184 ); |
| 120 | 185 return isOk; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
186 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
187 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
188 int isOk2= 1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
189 int x; |
| 111 | 190 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
191 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
192 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
193 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
194 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
195 /* if(isOk && !isOk2 || !isOk && isOk2) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
196 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
197 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
198 for(int y=0; y<9; y++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
199 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
200 for(int x=0; x<8; x++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
201 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
202 printf("%d ", src[x + y*stride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
203 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
204 printf("\n"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
205 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
206 } */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
207 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
208 return isOk2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
209 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
210 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
211 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
212 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
213 /** |
| 111 | 214 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 107 | 215 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
216 */ |
| 169 | 217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
218 { |
| 96 | 219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 220 src+= stride*3; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
221 asm volatile( //"movv %0 %1 %2\n\t" |
| 210 | 222 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
223 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
224 "movq (%0), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
225 "movq (%0, %1), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
226 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
227 "movq %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
228 "psubusb %%mm6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
229 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
230 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
231 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 210 | 232 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
233 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
234 "pand %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
235 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
236 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
237 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
238 "movq (%0, %1, 8), %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
239 "leal (%0, %1, 4), %%eax \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
240 "leal (%0, %1, 8), %%ebx \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
241 "subl %1, %%ebx \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
242 "addl %1, %0 \n\t" // %0 points to line 1 not 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
243 "movq (%0, %1, 8), %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
244 "movq %%mm5, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
245 "movq %%mm7, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
246 "psubusb %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
247 "psubusb %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
248 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
249 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 210 | 250 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
251 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
252 "pand %%mm2, %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
253 "pandn %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
254 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
255 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
256 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
257 // 1 2 3 4 5 6 7 8 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
258 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
259 // 6 4 2 2 1 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
260 // 6 4 4 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
261 // 6 8 2 |
| 111 | 262 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
263 "movq (%0, %1), %%mm0 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
264 "movq %%mm0, %%mm1 \n\t" // 1 |
| 96 | 265 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 266 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
267 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
268 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
269 "movq %%mm2, %%mm5 \n\t" // 1 |
| 96 | 270 PAVGB((%%eax), %%mm2) // 11 /2 |
| 271 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
272 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
273 "movq (%0), %%mm4 \n\t" // 1 |
| 96 | 274 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
| 275 PAVGB(%%mm0, %%mm3) //642211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
276 "movq %%mm3, (%0) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
277 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
278 "movq %%mm1, %%mm0 \n\t" // 1 |
| 96 | 279 PAVGB(%%mm6, %%mm0) //1 1 /2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
280 "movq %%mm4, %%mm3 \n\t" // 1 |
| 96 | 281 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
| 282 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
| 283 PAVGB((%%eax), %%mm5) // 211 /4 | |
| 284 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
| 285 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
286 "movq %%mm3, (%0,%1) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
287 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
| 96 | 288 PAVGB(%%mm4, %%mm6) //11 /2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
289 "movq (%%ebx), %%mm0 \n\t" // 1 |
| 96 | 290 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
291 "movq %%mm0, %%mm3 \n\t" // 11/2 |
| 96 | 292 PAVGB(%%mm1, %%mm0) // 2 11/4 |
| 293 PAVGB(%%mm6, %%mm0) //222 11/8 | |
| 294 PAVGB(%%mm2, %%mm0) //22242211/16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
295 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
296 "movq %%mm0, (%0, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
297 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
298 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 96 | 299 PAVGB((%%ebx), %%mm0) // 11 /2 |
| 300 PAVGB(%%mm0, %%mm6) //11 11 /4 | |
| 301 PAVGB(%%mm1, %%mm4) // 11 /2 | |
| 302 PAVGB(%%mm2, %%mm1) // 11 /2 | |
| 303 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
| 304 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
305 "movq (%%eax), %%mm5 \n\t" // 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
306 "movq %%mm6, (%%eax) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
307 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
308 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
| 96 | 309 PAVGB(%%mm7, %%mm6) // 11 /2 |
| 310 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
| 311 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
| 312 PAVGB(%%mm5, %%mm2) // 11 /2 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
313 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
| 96 | 314 PAVGB(%%mm4, %%mm2) // 112 /4 |
| 315 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
316 "movq %%mm6, (%0, %1, 4) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
317 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
| 96 | 318 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
| 319 PAVGB(%%mm4, %%mm5) // 11 /2 | |
| 320 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
321 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 96 | 322 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 323 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
324 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
325 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 96 | 326 PAVGB((%%ebx), %%mm2) // 112 4 /8 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
327 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 96 | 328 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 329 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
| 330 PAVGB(%%mm2, %%mm6) // 1122424 /4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
331 "movq %%mm6, (%%ebx) \n\t" // X |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
332 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 96 | 333 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 334 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
335 |
| 96 | 336 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 337 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
338 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 140 | 339 "subl %1, %0 \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
340 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
341 : |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
342 : "r" (src), "r" (stride) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
343 : "%eax", "%ebx" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
344 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
345 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
346 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
347 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
348 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
349 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
350 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
351 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
352 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
353 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
354 const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
355 int x; |
| 111 | 356 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
357 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
358 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
359 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
360 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
361 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
362 int sums[9]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
363 sums[0] = first + src[l1]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
364 sums[1] = src[l1] + src[l2]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
365 sums[2] = src[l2] + src[l3]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
366 sums[3] = src[l3] + src[l4]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
367 sums[4] = src[l4] + src[l5]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
368 sums[5] = src[l5] + src[l6]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
369 sums[6] = src[l6] + src[l7]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
370 sums[7] = src[l7] + src[l8]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
371 sums[8] = src[l8] + last; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
372 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
373 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
374 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
375 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
376 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
377 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
378 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
379 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
380 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
381 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
382 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
383 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
384 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
385 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
386 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
387 |
| 96 | 388 /** |
| 389 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 390 * values are correctly clipped (MMX2) | |
| 391 * values are wraparound (C) | |
| 392 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
| 393 0 8 16 24 | |
| 394 x = 8 | |
| 395 x/2 = 4 | |
| 396 x/8 = 1 | |
| 397 1 12 12 23 | |
| 398 */ | |
| 169 | 399 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
| 96 | 400 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
401 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 402 src+= stride*3; |
| 96 | 403 // FIXME rounding |
| 404 asm volatile( | |
| 405 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 210 | 406 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 96 | 407 "leal (%0, %1), %%eax \n\t" |
| 408 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 409 // 0 1 2 3 4 5 6 7 8 9 | |
| 410 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 210 | 411 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 96 | 412 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
| 210 | 413 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
| 96 | 414 "psrlw $2, %%mm0 \n\t" |
| 210 | 415 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
| 96 | 416 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
| 417 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 418 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
| 419 "movq %%mm2, %%mm4 \n\t" // line 4 | |
| 420 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 421 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
422 PAVGB(%%mm3, %%mm5) |
| 96 | 423 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
| 424 "psubusb %%mm3, %%mm4 \n\t" | |
| 425 "psubusb %%mm2, %%mm3 \n\t" | |
| 426 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 427 "psubusb %%mm0, %%mm4 \n\t" | |
| 428 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 429 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 430 | |
| 431 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 432 "paddb %%mm5, %%mm2 \n\t" | |
| 433 // "psubb %%mm6, %%mm2 \n\t" | |
| 434 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 435 | |
| 436 "movq (%%ebx), %%mm2 \n\t" | |
| 437 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | |
| 438 "psubb %%mm5, %%mm2 \n\t" | |
| 439 // "psubb %%mm6, %%mm2 \n\t" | |
| 440 "movq %%mm2, (%%ebx) \n\t" | |
| 441 | |
| 442 "paddb %%mm6, %%mm5 \n\t" | |
| 443 "psrlw $2, %%mm5 \n\t" | |
| 210 | 444 "pand "MANGLE(b3F)", %%mm5 \n\t" |
| 445 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 | |
| 96 | 446 |
| 447 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 448 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 449 "paddsb %%mm5, %%mm2 \n\t" | |
| 450 "psubb %%mm6, %%mm2 \n\t" | |
| 451 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 452 | |
| 453 "movq (%%ebx, %1), %%mm2 \n\t" | |
| 454 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | |
| 455 "psubsb %%mm5, %%mm2 \n\t" | |
| 456 "psubb %%mm6, %%mm2 \n\t" | |
| 457 "movq %%mm2, (%%ebx, %1) \n\t" | |
| 458 | |
| 459 : | |
| 460 : "r" (src), "r" (stride) | |
| 461 : "%eax", "%ebx" | |
| 462 ); | |
| 463 #else | |
| 464 const int l1= stride; | |
| 465 const int l2= stride + l1; | |
| 466 const int l3= stride + l2; | |
| 467 const int l4= stride + l3; | |
| 468 const int l5= stride + l4; | |
| 469 const int l6= stride + l5; | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
470 // const int l7= stride + l6; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
471 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
472 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
473 int x; |
| 141 | 474 const int QP15= QP + (QP>>2); |
| 111 | 475 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
476 for(x=0; x<BLOCK_SIZE; x++) |
| 96 | 477 { |
| 141 | 478 const int v = (src[x+l5] - src[x+l4]); |
| 479 if(ABS(v) < QP15) | |
| 96 | 480 { |
| 141 | 481 src[x+l3] +=v>>3; |
| 482 src[x+l4] +=v>>1; | |
| 483 src[x+l5] -=v>>1; | |
| 484 src[x+l6] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
485 |
| 96 | 486 } |
| 487 } | |
| 488 | |
| 489 #endif | |
| 490 } | |
| 491 | |
| 492 /** | |
| 493 * Experimental Filter 1 | |
| 99 | 494 * will not damage linear gradients |
| 495 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
496 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
497 * MMX2 version does correct clipping C version doesnt |
| 96 | 498 */ |
| 169 | 499 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, int QP) |
| 96 | 500 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 111 | 502 src+= stride*3; |
| 503 | |
| 96 | 504 asm volatile( |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
505 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 210 | 506 // "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
507 "leal (%0, %1), %%eax \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
508 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
509 // 0 1 2 3 4 5 6 7 8 9 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
510 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
512 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
513 "movq %%mm1, %%mm2 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
514 "psubusb %%mm0, %%mm1 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
515 "psubusb %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
516 "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
517 "movq (%%ebx), %%mm3 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
518 "movq (%%ebx, %1), %%mm4 \n\t" // line 6 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
519 "movq %%mm3, %%mm5 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
520 "psubusb %%mm4, %%mm3 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
521 "psubusb %%mm5, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
522 "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
523 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
524 "movq %%mm2, %%mm1 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
525 "psubusb %%mm5, %%mm2 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
526 "movq %%mm2, %%mm4 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
527 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
528 "psubusb %%mm1, %%mm5 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
529 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
530 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
531 "movq %%mm4, %%mm3 \n\t" // d |
| 210 | 532 "psubusb "MANGLE(pQPb)", %%mm4 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
533 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 210 | 534 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
535 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
536 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
537 PAVGB(%%mm7, %%mm3) // d/2 |
| 99 | 538 "movq %%mm3, %%mm1 \n\t" // d/2 |
| 539 PAVGB(%%mm7, %%mm3) // d/4 | |
| 540 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
541 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
542 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
543 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
544 "psubusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
545 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
546 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
547 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
548 "movq (%%ebx), %%mm0 \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
549 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
550 "paddusb %%mm3, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
551 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
552 "movq %%mm0, (%%ebx) \n\t" // line 5 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
553 |
| 99 | 554 PAVGB(%%mm7, %%mm1) // d/4 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
555 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
556 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
557 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 99 | 558 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
559 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
560 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
561 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
562 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
563 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 99 | 564 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
565 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
566 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
567 |
| 99 | 568 PAVGB(%%mm7, %%mm1) // d/8 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
569 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
570 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
571 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
| 99 | 572 "psubusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
573 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
574 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
575 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
576 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
577 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
| 99 | 578 "paddusb %%mm1, %%mm0 \n\t" |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
579 "pxor %%mm2, %%mm0 \n\t" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
580 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 |
| 96 | 581 |
| 582 : | |
| 583 : "r" (src), "r" (stride) | |
| 584 : "%eax", "%ebx" | |
| 585 ); | |
| 586 #else | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
587 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
588 const int l1= stride; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
589 const int l2= stride + l1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
590 const int l3= stride + l2; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
591 const int l4= stride + l3; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
592 const int l5= stride + l4; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
593 const int l6= stride + l5; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
594 const int l7= stride + l6; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
595 // const int l8= stride + l7; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
596 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
597 int x; |
| 111 | 598 |
| 599 src+= stride*3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
600 for(x=0; x<BLOCK_SIZE; x++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
601 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
602 int a= src[l3] - src[l4]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
603 int b= src[l4] - src[l5]; |
| 99 | 604 int c= src[l5] - src[l6]; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
605 |
| 141 | 606 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); |
| 607 d= MAX(d, 0); | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
608 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
609 if(d < QP) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
610 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
611 int v = d * SIGN(-b); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
612 |
| 141 | 613 src[l2] +=v>>3; |
| 614 src[l3] +=v>>2; | |
| 615 src[l4] +=(3*v)>>3; | |
| 616 src[l5] -=(3*v)>>3; | |
| 617 src[l6] -=v>>2; | |
| 618 src[l7] -=v>>3; | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
619 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
620 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
621 src++; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
622 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
623 /* |
| 96 | 624 const int l1= stride; |
| 625 const int l2= stride + l1; | |
| 626 const int l3= stride + l2; | |
| 627 const int l4= stride + l3; | |
| 628 const int l5= stride + l4; | |
| 629 const int l6= stride + l5; | |
| 630 const int l7= stride + l6; | |
| 631 const int l8= stride + l7; | |
| 632 const int l9= stride + l8; | |
| 633 for(int x=0; x<BLOCK_SIZE; x++) | |
| 634 { | |
| 635 int v2= src[l2]; | |
| 636 int v3= src[l3]; | |
| 637 int v4= src[l4]; | |
| 638 int v5= src[l5]; | |
| 639 int v6= src[l6]; | |
| 640 int v7= src[l7]; | |
| 641 | |
| 642 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) | |
| 643 { | |
| 644 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; | |
| 645 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; | |
| 646 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | |
| 647 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | |
| 648 } | |
| 649 src++; | |
| 650 } | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
651 */ |
| 96 | 652 #endif |
| 653 } | |
| 654 | |
| 169 | 655 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, int QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
656 { |
| 163 | 657 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 658 /* | |
| 659 uint8_t tmp[16]; | |
| 660 const int l1= stride; | |
| 661 const int l2= stride + l1; | |
| 662 const int l3= stride + l2; | |
| 663 const int l4= (int)tmp - (int)src - stride*3; | |
| 664 const int l5= (int)tmp - (int)src - stride*3 + 8; | |
| 665 const int l6= stride*3 + l3; | |
| 666 const int l7= stride + l6; | |
| 667 const int l8= stride + l7; | |
| 668 | |
| 669 memcpy(tmp, src+stride*7, 8); | |
| 670 memcpy(tmp+8, src+stride*8, 8); | |
| 671 */ | |
| 111 | 672 src+= stride*4; |
| 163 | 673 asm volatile( |
| 674 | |
| 675 | #if 0 //slightly more accurate and slightly slower | |
| 676 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 677 "leal (%0, %1), %%eax \n\t" | |
| 678 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 679 // 0 1 2 3 4 5 6 7 | |
| 680 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |
| 681 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |
| 682 | |
| 683 | |
| 684 "movq (%0, %1, 2), %%mm0 \n\t" // l2 | |
| 685 "movq (%0), %%mm1 \n\t" // l0 | |
| 686 "movq %%mm0, %%mm2 \n\t" // l2 | |
| 687 PAVGB(%%mm7, %%mm0) // ~l2/2 | |
| 688 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 | |
| 689 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 | |
| 690 | |
| 691 "movq (%%eax), %%mm1 \n\t" // l1 | |
| 692 "movq (%%eax, %1, 2), %%mm3 \n\t" // l3 | |
| 693 "movq %%mm1, %%mm4 \n\t" // l1 | |
| 694 PAVGB(%%mm7, %%mm1) // ~l1/2 | |
| 695 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 | |
| 696 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 | |
| 697 | |
| 698 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 | |
| 699 "psubusb %%mm1, %%mm0 \n\t" | |
| 700 "psubusb %%mm4, %%mm1 \n\t" | |
| 701 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 | |
| 702 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 | |
| 703 | |
| 704 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 705 "movq %%mm0, %%mm4 \n\t" // l4 | |
| 706 PAVGB(%%mm7, %%mm0) // ~l4/2 | |
| 707 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 | |
| 708 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 | |
| 709 | |
| 710 "movq (%%ebx), %%mm2 \n\t" // l5 | |
| 711 "movq %%mm3, %%mm5 \n\t" // l3 | |
| 712 PAVGB(%%mm7, %%mm3) // ~l3/2 | |
| 713 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 | |
| 714 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 | |
| 715 | |
| 716 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 | |
| 717 "psubusb %%mm3, %%mm0 \n\t" | |
| 718 "psubusb %%mm6, %%mm3 \n\t" | |
| 719 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 | |
| 720 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) | |
| 721 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 | |
| 722 | |
| 723 "movq (%%ebx, %1), %%mm6 \n\t" // l6 | |
| 724 "movq %%mm6, %%mm5 \n\t" // l6 | |
| 725 PAVGB(%%mm7, %%mm6) // ~l6/2 | |
| 726 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 | |
| 727 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 | |
| 728 | |
| 729 "movq (%%ebx, %1, 2), %%mm5 \n\t" // l7 | |
| 730 "movq %%mm2, %%mm4 \n\t" // l5 | |
| 731 PAVGB(%%mm7, %%mm2) // ~l5/2 | |
| 732 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 | |
| 733 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 | |
| 734 | |
| 735 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 | |
| 736 "psubusb %%mm2, %%mm6 \n\t" | |
| 737 "psubusb %%mm4, %%mm2 \n\t" | |
| 738 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | |
| 739 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | |
| 740 | |
| 741 | |
| 742 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | |
| 210 | 743 "movq "MANGLE(pQPb)", %%mm4 \n\t" // QP //FIXME QP+1 ? |
| 744 "paddusb "MANGLE(b01)", %%mm4 \n\t" | |
| 163 | 745 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
| 746 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | |
| 747 "pand %%mm4, %%mm3 \n\t" | |
| 748 | |
| 749 "movq %%mm3, %%mm1 \n\t" | |
| 210 | 750 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 751 PAVGB(%%mm7, %%mm3) |
| 752 PAVGB(%%mm7, %%mm3) | |
| 753 "paddusb %%mm1, %%mm3 \n\t" | |
| 210 | 754 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 755 |
| 756 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | |
| 757 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | |
| 758 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | |
| 759 "psubusb %%mm6, %%mm5 \n\t" | |
| 760 "psubusb %%mm4, %%mm6 \n\t" | |
| 761 "por %%mm6, %%mm5 \n\t" // |l3-l4| | |
| 762 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | |
| 763 "pxor %%mm6, %%mm0 \n\t" | |
| 764 "pand %%mm0, %%mm3 \n\t" | |
| 765 PMINUB(%%mm5, %%mm3, %%mm0) | |
| 766 | |
| 210 | 767 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 163 | 768 PAVGB(%%mm7, %%mm3) |
| 769 | |
| 770 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 771 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 772 "pxor %%mm6, %%mm0 \n\t" | |
| 773 "pxor %%mm6, %%mm2 \n\t" | |
| 774 "psubb %%mm3, %%mm0 \n\t" | |
| 775 "paddb %%mm3, %%mm2 \n\t" | |
| 776 "pxor %%mm6, %%mm0 \n\t" | |
| 777 "pxor %%mm6, %%mm2 \n\t" | |
| 778 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 779 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 780 #endif | |
| 781 | |
| 782 "leal (%0, %1), %%eax \n\t" | |
| 783 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 | |
| 784 // 0 1 2 3 4 5 6 7 | |
| 785 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |
| 786 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |
| 787 | |
| 788 | |
| 789 "movq (%%eax, %1, 2), %%mm1 \n\t" // l3 | |
| 790 "movq (%0, %1, 4), %%mm0 \n\t" // l4 | |
| 791 "pxor %%mm6, %%mm1 \n\t" // -l3-1 | |
| 792 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 | |
| 793 // mm1=-l3-1, mm0=128-q | |
| 794 | |
| 795 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | |
| 796 "movq (%%eax, %1), %%mm3 \n\t" // l2 | |
| 797 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | |
| 798 "movq %%mm2, %%mm5 \n\t" // -l5-1 | |
| 210 | 799 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
| 163 | 800 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 801 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | |
| 802 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | |
| 803 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | |
| 804 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | |
| 805 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 | |
| 806 | |
| 807 "movq (%%eax), %%mm2 \n\t" // l1 | |
| 808 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | |
| 809 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | |
| 810 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | |
| 210 | 811 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
| 163 | 812 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
| 813 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | |
| 814 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | |
| 815 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | |
| 816 | |
| 817 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 | |
| 818 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 | |
| 819 "pxor %%mm6, %%mm1 \n\t" // -l7-1 | |
| 820 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | |
| 210 | 821 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
| 163 | 822 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
| 823 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | |
| 824 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | |
| 825 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | |
| 826 | |
| 210 | 827 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
| 828 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 | |
| 163 | 829 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
| 830 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | |
| 831 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | |
| 832 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | |
| 833 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | |
| 834 | |
| 835 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | |
| 836 | |
| 210 | 837 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
| 838 "movq "MANGLE(pQPb)", %%mm2 \n\t" // QP | |
| 163 | 839 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
| 840 "psubb %%mm6, %%mm2 \n\t" | |
| 841 | |
| 842 "movq %%mm4, %%mm1 \n\t" | |
| 843 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | |
| 844 "pxor %%mm1, %%mm4 \n\t" | |
| 845 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 | |
| 846 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | |
| 847 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | |
| 848 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | |
| 849 | |
| 850 "movq %%mm4, %%mm3 \n\t" // d | |
| 210 | 851 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
| 163 | 852 PAVGB(%%mm7, %%mm4) // d/32 |
| 853 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | |
| 854 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | |
| 855 "pand %%mm2, %%mm4 \n\t" | |
| 856 | |
| 210 | 857 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
| 163 | 858 "psubb %%mm0, %%mm5 \n\t" // q |
| 859 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | |
| 860 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | |
| 861 "pxor %%mm7, %%mm5 \n\t" | |
| 862 | |
| 863 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) | |
| 864 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) | |
| 865 | |
| 866 "pand %%mm7, %%mm4 \n\t" | |
| 867 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 868 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 869 "pxor %%mm1, %%mm0 \n\t" | |
| 870 "pxor %%mm1, %%mm2 \n\t" | |
| 871 "paddb %%mm4, %%mm0 \n\t" | |
| 872 "psubb %%mm4, %%mm2 \n\t" | |
| 873 "pxor %%mm1, %%mm0 \n\t" | |
| 874 "pxor %%mm1, %%mm2 \n\t" | |
| 875 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 876 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 877 | |
| 878 : | |
| 879 : "r" (src), "r" (stride) | |
| 880 : "%eax", "%ebx" | |
| 881 ); | |
| 882 | |
| 883 /* | |
| 884 { | |
| 885 int x; | |
| 886 src-= stride; | |
| 887 for(x=0; x<BLOCK_SIZE; x++) | |
| 888 { | |
| 889 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
| 890 if(ABS(middleEnergy)< 8*QP) | |
| 891 { | |
| 892 const int q=(src[l4] - src[l5])/2; | |
| 893 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
| 894 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
| 895 | |
| 896 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 897 d= MAX(d, 0); | |
| 898 | |
| 899 d= (5*d + 32) >> 6; | |
| 900 d*= SIGN(-middleEnergy); | |
| 901 | |
| 902 if(q>0) | |
| 903 { | |
| 904 d= d<0 ? 0 : d; | |
| 905 d= d>q ? q : d; | |
| 906 } | |
| 907 else | |
| 908 { | |
| 909 d= d>0 ? 0 : d; | |
| 910 d= d<q ? q : d; | |
| 911 } | |
| 912 | |
| 913 src[l4]-= d; | |
| 914 src[l5]+= d; | |
| 915 } | |
| 916 src++; | |
| 917 } | |
| 918 src-=8; | |
| 919 for(x=0; x<8; x++) | |
| 920 { | |
| 921 int y; | |
| 922 for(y=4; y<6; y++) | |
| 923 { | |
| 924 int d= src[x+y*stride] - tmp[x+(y-4)*8]; | |
| 925 int ad= ABS(d); | |
| 926 static int max=0; | |
| 927 static int sum=0; | |
| 928 static int num=0; | |
| 929 static int bias=0; | |
| 930 | |
| 931 if(max<ad) max=ad; | |
| 932 sum+= ad>3 ? 1 : 0; | |
| 933 if(ad>3) | |
| 934 { | |
| 935 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; | |
| 936 } | |
| 937 if(y==4) bias+=d; | |
| 938 num++; | |
| 939 if(num%1000000 == 0) | |
| 940 { | |
| 941 printf(" %d %d %d %d\n", num, sum, max, bias); | |
| 942 } | |
| 943 } | |
| 944 } | |
| 945 } | |
| 946 */ | |
| 947 #elif defined (HAVE_MMX) | |
| 948 src+= stride*4; | |
| 949 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
950 asm volatile( |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
951 "pxor %%mm7, %%mm7 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
952 "leal (%0, %1), %%eax \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
953 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
954 // 0 1 2 3 4 5 6 7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
955 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
956 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
957 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
958 "movq (%0), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
959 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
960 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
961 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
962 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
963 "movq (%%eax), %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
964 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
965 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
966 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
967 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
968 "movq (%%eax, %1), %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
969 "movq %%mm4, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
970 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
971 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
972 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
973 "paddw %%mm0, %%mm0 \n\t" // 2L0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
974 "paddw %%mm1, %%mm1 \n\t" // 2H0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
975 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
976 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
977 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
978 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
979 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
980 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
981 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
982 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
983 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
984 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
985 "movq (%%eax, %1, 2), %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
986 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
987 "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
988 "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
989 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
990 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
991 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
992 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
993 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 210 | 994 "movq %%mm0, "MANGLE(temp0)" \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 995 "movq %%mm1, "MANGLE(temp1)" \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
996 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
997 "movq (%0, %1, 4), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
998 "movq %%mm0, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
999 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1000 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1001 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1002 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1003 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
| 210 | 1004 "movq %%mm2, "MANGLE(temp2)" \n\t" // L3 - L4 |
| 1005 "movq %%mm3, "MANGLE(temp3)" \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1006 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1007 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1008 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1009 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1010 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1011 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1012 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1013 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1014 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1015 //50 opcodes so far |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1016 "movq (%%ebx), %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1017 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1018 "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1019 "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1020 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1021 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1022 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1023 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1024 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1025 "movq (%%ebx, %1), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1026 "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1027 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1028 "movq (%%ebx, %1), %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1029 "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1030 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1031 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1032 "paddw %%mm0, %%mm0 \n\t" // 2L4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1033 "paddw %%mm1, %%mm1 \n\t" // 2H4 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1034 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1035 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1036 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1037 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1038 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1039 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1040 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1041 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1042 "movq (%%ebx, %1, 2), %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1043 "movq %%mm2, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1044 "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1045 "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1046 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1047 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1048 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1049 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1050 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1051 |
| 210 | 1052 "movq "MANGLE(temp0)", %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 1053 "movq "MANGLE(temp1)", %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 140 | 1054 |
| 1055 #ifdef HAVE_MMX2 | |
| 1056 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 1057 "psubw %%mm0, %%mm6 \n\t" | |
| 1058 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 1059 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 1060 "psubw %%mm1, %%mm6 \n\t" | |
| 1061 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 1062 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 1063 "psubw %%mm2, %%mm6 \n\t" | |
| 1064 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 1065 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 1066 "psubw %%mm3, %%mm6 \n\t" | |
| 1067 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 1068 #else | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1069 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1070 "pcmpgtw %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1071 "pxor %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1072 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1073 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1074 "pcmpgtw %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1075 "pxor %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1076 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1077 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1078 "pcmpgtw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1079 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1080 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1081 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1082 "pcmpgtw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1083 "pxor %%mm6, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1084 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 140 | 1085 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1086 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1087 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1088 "pminsw %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1089 "pminsw %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1090 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1091 "movq %%mm0, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1092 "psubusw %%mm2, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1093 "psubw %%mm6, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1094 "movq %%mm1, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1095 "psubusw %%mm3, %%mm6 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1096 "psubw %%mm6, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1097 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1098 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1099 "movq %%mm7, %%mm6 \n\t" // 0 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1100 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1101 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1102 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1103 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1104 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1105 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1106 // 100 opcodes |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1107 "movd %2, %%mm2 \n\t" // QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1108 "punpcklwd %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1109 "punpcklwd %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1110 "psllw $3, %%mm2 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1111 "movq %%mm2, %%mm3 \n\t" // 8QP |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1112 "pcmpgtw %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1113 "pcmpgtw %%mm5, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1114 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1115 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1116 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1117 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1118 "psubusw %%mm0, %%mm4 \n\t" // hd |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1119 "psubusw %%mm1, %%mm5 \n\t" // ld |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1120 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1121 |
| 211 | 1122 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1123 "pmullw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1124 "pmullw %%mm2, %%mm5 \n\t" |
| 211 | 1125 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1126 "paddw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1127 "paddw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1128 "psrlw $6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1129 "psrlw $6, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1130 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1131 /* |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1132 "movq w06, %%mm2 \n\t" // 6 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1133 "paddw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1134 "paddw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1135 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1136 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1137 "pmulhw %%mm2, %%mm4 \n\t" // hd/13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1138 "pmulhw %%mm2, %%mm5 \n\t" // ld/13 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1139 */ |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1140 |
| 210 | 1141 "movq "MANGLE(temp2)", %%mm0 \n\t" // L3 - L4 |
| 1142 "movq "MANGLE(temp3)", %%mm1 \n\t" // H3 - H4 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1143 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1144 "pxor %%mm2, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1145 "pxor %%mm3, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1146 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1147 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1148 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1149 "pxor %%mm2, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1150 "pxor %%mm3, %%mm1 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1151 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1152 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1153 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1154 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1155 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1156 "pxor %%mm6, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1157 "pxor %%mm7, %%mm3 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1158 "pand %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1159 "pand %%mm3, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1160 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1161 #ifdef HAVE_MMX2 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1162 "pminsw %%mm0, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1163 "pminsw %%mm1, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1164 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1165 "movq %%mm4, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1166 "psubusw %%mm0, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1167 "psubw %%mm2, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1168 "movq %%mm5, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1169 "psubusw %%mm1, %%mm2 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1170 "psubw %%mm2, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1171 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1172 "pxor %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1173 "pxor %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1174 "psubw %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1175 "psubw %%mm7, %%mm5 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1176 "packsswb %%mm5, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1177 "movq (%%eax, %1, 2), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1178 "paddb %%mm4, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1179 "movq %%mm0, (%%eax, %1, 2) \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1180 "movq (%0, %1, 4), %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1181 "psubb %%mm4, %%mm0 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1182 "movq %%mm0, (%0, %1, 4) \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1183 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1184 : |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1185 : "r" (src), "r" (stride), "r" (QP) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1186 : "%eax", "%ebx" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1187 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1188 #else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1189 const int l1= stride; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1190 const int l2= stride + l1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1191 const int l3= stride + l2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1192 const int l4= stride + l3; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1193 const int l5= stride + l4; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1194 const int l6= stride + l5; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1195 const int l7= stride + l6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1196 const int l8= stride + l7; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1197 // const int l9= stride + l8; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1198 int x; |
| 111 | 1199 src+= stride*3; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
1200 for(x=0; x<BLOCK_SIZE; x++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1201 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1202 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1203 if(ABS(middleEnergy) < 8*QP) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1204 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1205 const int q=(src[l4] - src[l5])/2; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1206 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1207 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1208 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1209 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1210 d= MAX(d, 0); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1211 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1212 d= (5*d + 32) >> 6; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1213 d*= SIGN(-middleEnergy); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1214 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1215 if(q>0) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1216 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1217 d= d<0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1218 d= d>q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1219 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1220 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1221 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1222 d= d>0 ? 0 : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1223 d= d<q ? q : d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1224 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1225 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1226 src[l4]-= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1227 src[l5]+= d; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1228 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1229 src++; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1230 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1231 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1232 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1233 |
| 169 | 1234 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1235 { |
| 132 | 1236 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1237 asm volatile( |
| 210 | 1238 "movq "MANGLE(pQPb)", %%mm0 \n\t" |
| 130 | 1239 "paddusb %%mm0, %%mm0 \n\t" |
| 210 | 1240 "movq %%mm0, "MANGLE(pQPb2)" \n\t" |
| 130 | 1241 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1242 "leal (%0, %1), %%eax \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1243 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1244 // 0 1 2 3 4 5 6 7 8 9 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1245 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1246 |
| 167 | 1247 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 1248 "pxor %%mm6, %%mm6 \n\t" | |
| 169 | 1249 #undef FIND_MIN_MAX |
| 132 | 1250 #ifdef HAVE_MMX2 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1251 #define FIND_MIN_MAX(addr)\ |
| 130 | 1252 "movq " #addr ", %%mm0 \n\t"\ |
| 167 | 1253 "pminub %%mm0, %%mm7 \n\t"\ |
| 1254 "pmaxub %%mm0, %%mm6 \n\t" | |
| 132 | 1255 #else |
| 1256 #define FIND_MIN_MAX(addr)\ | |
| 1257 "movq " #addr ", %%mm0 \n\t"\ | |
| 167 | 1258 "movq %%mm7, %%mm1 \n\t"\ |
| 1259 "psubusb %%mm0, %%mm6 \n\t"\ | |
| 1260 "paddb %%mm0, %%mm6 \n\t"\ | |
| 132 | 1261 "psubusb %%mm0, %%mm1 \n\t"\ |
| 167 | 1262 "psubb %%mm1, %%mm7 \n\t" |
| 132 | 1263 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1264 |
| 130 | 1265 FIND_MIN_MAX((%%eax)) |
| 1266 FIND_MIN_MAX((%%eax, %1)) | |
| 1267 FIND_MIN_MAX((%%eax, %1, 2)) | |
| 1268 FIND_MIN_MAX((%0, %1, 4)) | |
| 1269 FIND_MIN_MAX((%%ebx)) | |
| 1270 FIND_MIN_MAX((%%ebx, %1)) | |
| 1271 FIND_MIN_MAX((%%ebx, %1, 2)) | |
| 1272 FIND_MIN_MAX((%0, %1, 8)) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1273 |
| 167 | 1274 "movq %%mm7, %%mm4 \n\t" |
| 1275 "psrlq $8, %%mm7 \n\t" | |
| 1276 #ifdef HAVE_MMX2 | |
| 1277 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1278 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1279 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1280 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1281 "pminub %%mm4, %%mm7 \n\t" | |
| 1282 #else | |
| 1283 "movq %%mm7, %%mm1 \n\t" | |
| 1284 "psubusb %%mm4, %%mm1 \n\t" | |
| 1285 "psubb %%mm1, %%mm7 \n\t" | |
| 1286 "movq %%mm7, %%mm4 \n\t" | |
| 1287 "psrlq $16, %%mm7 \n\t" | |
| 1288 "movq %%mm7, %%mm1 \n\t" | |
| 1289 "psubusb %%mm4, %%mm1 \n\t" | |
| 1290 "psubb %%mm1, %%mm7 \n\t" | |
| 1291 "movq %%mm7, %%mm4 \n\t" | |
| 1292 "psrlq $32, %%mm7 \n\t" | |
| 1293 "movq %%mm7, %%mm1 \n\t" | |
| 1294 "psubusb %%mm4, %%mm1 \n\t" | |
| 1295 "psubb %%mm1, %%mm7 \n\t" | |
| 1296 #endif | |
| 1297 | |
| 1298 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1299 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1300 "psrlq $8, %%mm6 \n\t" |
| 132 | 1301 #ifdef HAVE_MMX2 |
| 167 | 1302 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1303 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 167 | 1304 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1305 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 167 | 1306 "pmaxub %%mm4, %%mm6 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1307 #else |
| 167 | 1308 "psubusb %%mm4, %%mm6 \n\t" |
| 1309 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1310 "movq %%mm6, %%mm4 \n\t" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1311 "psrlq $16, %%mm6 \n\t" |
| 167 | 1312 "psubusb %%mm4, %%mm6 \n\t" |
| 1313 "paddb %%mm4, %%mm6 \n\t" | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1314 "movq %%mm6, %%mm4 \n\t" |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1315 "psrlq $32, %%mm6 \n\t" |
| 167 | 1316 "psubusb %%mm4, %%mm6 \n\t" |
| 1317 "paddb %%mm4, %%mm6 \n\t" | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1318 #endif |
| 167 | 1319 "movq %%mm6, %%mm0 \n\t" // max |
| 1320 "psubb %%mm7, %%mm6 \n\t" // max - min | |
| 1321 "movd %%mm6, %%ecx \n\t" | |
| 210 | 1322 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
| 167 | 1323 " jb 1f \n\t" |
| 1324 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1325 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1326 "punpcklbw %%mm7, %%mm7 \n\t" |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1327 "punpcklbw %%mm7, %%mm7 \n\t" |
| 210 | 1328 "movq %%mm7, "MANGLE(temp0)" \n\t" |
| 130 | 1329 |
| 1330 "movq (%0), %%mm0 \n\t" // L10 | |
| 1331 "movq %%mm0, %%mm1 \n\t" // L10 | |
| 1332 "movq %%mm0, %%mm2 \n\t" // L10 | |
| 1333 "psllq $8, %%mm1 \n\t" | |
| 1334 "psrlq $8, %%mm2 \n\t" | |
| 1335 "movd -4(%0), %%mm3 \n\t" | |
| 1336 "movd 8(%0), %%mm4 \n\t" | |
| 1337 "psrlq $24, %%mm3 \n\t" | |
| 1338 "psllq $56, %%mm4 \n\t" | |
| 1339 "por %%mm3, %%mm1 \n\t" // L00 | |
| 1340 "por %%mm4, %%mm2 \n\t" // L20 | |
| 1341 "movq %%mm1, %%mm3 \n\t" // L00 | |
| 1342 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
| 1343 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
| 1344 "psubusb %%mm7, %%mm0 \n\t" | |
| 1345 "psubusb %%mm7, %%mm2 \n\t" | |
| 1346 "psubusb %%mm7, %%mm3 \n\t" | |
| 210 | 1347 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
| 1348 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 | |
| 1349 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 | |
| 130 | 1350 "paddb %%mm2, %%mm0 \n\t" |
| 1351 "paddb %%mm3, %%mm0 \n\t" | |
| 1352 | |
| 1353 "movq (%%eax), %%mm2 \n\t" // L11 | |
| 1354 "movq %%mm2, %%mm3 \n\t" // L11 | |
| 1355 "movq %%mm2, %%mm4 \n\t" // L11 | |
| 1356 "psllq $8, %%mm3 \n\t" | |
| 1357 "psrlq $8, %%mm4 \n\t" | |
| 1358 "movd -4(%%eax), %%mm5 \n\t" | |
| 1359 "movd 8(%%eax), %%mm6 \n\t" | |
| 1360 "psrlq $24, %%mm5 \n\t" | |
| 1361 "psllq $56, %%mm6 \n\t" | |
| 1362 "por %%mm5, %%mm3 \n\t" // L01 | |
| 1363 "por %%mm6, %%mm4 \n\t" // L21 | |
| 1364 "movq %%mm3, %%mm5 \n\t" // L01 | |
| 1365 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
| 1366 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
| 1367 "psubusb %%mm7, %%mm2 \n\t" | |
| 1368 "psubusb %%mm7, %%mm4 \n\t" | |
| 1369 "psubusb %%mm7, %%mm5 \n\t" | |
| 210 | 1370 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
| 1371 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 | |
| 1372 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 | |
| 130 | 1373 "paddb %%mm4, %%mm2 \n\t" |
| 1374 "paddb %%mm5, %%mm2 \n\t" | |
| 1375 // 0, 2, 3, 1 | |
| 1376 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
| 1377 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
| 1378 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
| 1379 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
| 1380 "psllq $8, " #lx " \n\t"\ | |
| 1381 "psrlq $8, " #t0 " \n\t"\ | |
| 1382 "movd -4" #src ", " #t1 " \n\t"\ | |
| 1383 "psrlq $24, " #t1 " \n\t"\ | |
| 1384 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
| 1385 "movd 8" #src ", " #t1 " \n\t"\ | |
| 1386 "psllq $56, " #t1 " \n\t"\ | |
| 1387 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
| 1388 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
| 1389 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
| 1390 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
| 135 | 1391 PAVGB(lx, pplx) \ |
| 210 | 1392 "movq " #lx ", "MANGLE(temp1)" \n\t"\ |
| 1393 "movq "MANGLE(temp0)", " #lx " \n\t"\ | |
| 140 | 1394 "psubusb " #lx ", " #t1 " \n\t"\ |
| 1395 "psubusb " #lx ", " #t0 " \n\t"\ | |
| 1396 "psubusb " #lx ", " #sx " \n\t"\ | |
| 210 | 1397 "movq "MANGLE(b00)", " #lx " \n\t"\ |
| 140 | 1398 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
| 1399 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
| 1400 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
| 130 | 1401 "paddb " #t1 ", " #t0 " \n\t"\ |
| 1402 "paddb " #t0 ", " #sx " \n\t"\ | |
| 1403 \ | |
| 1404 PAVGB(plx, pplx) /* filtered */\ | |
| 1405 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
| 134 | 1406 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
| 210 | 1407 "psubusb "MANGLE(pQPb2)", " #t0 " \n\t"\ |
| 1408 "paddusb "MANGLE(pQPb2)", " #t1 " \n\t"\ | |
| 134 | 1409 PMAXUB(t0, pplx)\ |
| 1410 PMINUB(t1, pplx, t0)\ | |
| 130 | 1411 "paddb " #sx ", " #ppsx " \n\t"\ |
| 1412 "paddb " #psx ", " #ppsx " \n\t"\ | |
| 210 | 1413 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
| 1414 "pand "MANGLE(b08)", " #ppsx " \n\t"\ | |
| 140 | 1415 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
| 134 | 1416 "pand " #ppsx ", " #pplx " \n\t"\ |
| 130 | 1417 "pandn " #dst ", " #ppsx " \n\t"\ |
| 140 | 1418 "por " #pplx ", " #ppsx " \n\t"\ |
| 135 | 1419 "movq " #ppsx ", " #dst " \n\t"\ |
| 210 | 1420 "movq "MANGLE(temp1)", " #lx " \n\t" |
| 134 | 1421 |
| 130 | 1422 /* |
| 1423 0000000 | |
| 1424 1111111 | |
| 1425 | |
| 1426 1111110 | |
| 1427 1111101 | |
| 1428 1111100 | |
| 1429 1111011 | |
| 1430 1111010 | |
| 1431 1111001 | |
| 1432 | |
| 1433 1111000 | |
| 1434 1110111 | |
| 1435 | |
| 1436 */ | |
| 1437 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
| 1438 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1439 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1440 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1441 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1442 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1443 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1444 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1445 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1446 |
| 167 | 1447 "1: \n\t" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1448 : : "r" (src), "r" (stride), "r" (QP) |
| 167 | 1449 : "%eax", "%ebx", "%ecx" |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1450 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1451 #else |
| 134 | 1452 int y; |
| 1453 int min=255; | |
| 1454 int max=0; | |
| 1455 int avg; | |
| 1456 uint8_t *p; | |
| 1457 int s[10]; | |
| 1458 | |
| 1459 for(y=1; y<9; y++) | |
| 1460 { | |
| 1461 int x; | |
| 1462 p= src + stride*y; | |
| 1463 for(x=1; x<9; x++) | |
| 1464 { | |
| 1465 p++; | |
| 1466 if(*p > max) max= *p; | |
| 1467 if(*p < min) min= *p; | |
| 1468 } | |
| 1469 } | |
| 1470 avg= (min + max + 1)/2; | |
| 1471 | |
| 167 | 1472 if(max - min <deringThreshold) return; |
| 1473 | |
| 134 | 1474 for(y=0; y<10; y++) |
| 1475 { | |
| 1476 int x; | |
| 1477 int t = 0; | |
| 1478 p= src + stride*y; | |
| 1479 for(x=0; x<10; x++) | |
| 1480 { | |
| 1481 if(*p > avg) t |= (1<<x); | |
| 1482 p++; | |
| 1483 } | |
| 1484 t |= (~t)<<16; | |
| 1485 t &= (t<<1) & (t>>1); | |
| 1486 s[y] = t; | |
| 1487 } | |
| 1488 | |
| 1489 for(y=1; y<9; y++) | |
| 1490 { | |
| 1491 int x; | |
| 1492 int t = s[y-1] & s[y] & s[y+1]; | |
| 1493 t|= t>>16; | |
| 1494 | |
| 1495 p= src + stride*y; | |
| 1496 for(x=1; x<9; x++) | |
| 1497 { | |
| 1498 p++; | |
| 1499 if(t & (1<<x)) | |
| 1500 { | |
| 1501 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
| 1502 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
| 1503 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
| 1504 f= (f + 8)>>4; | |
| 1505 | |
| 167 | 1506 #ifdef DEBUG_DERING_THRESHOLD |
| 1507 asm volatile("emms\n\t":); | |
| 1508 { | |
| 1509 static long long numPixels=0; | |
| 1510 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1511 // if((max-min)<20 || (max-min)*QP<200) | |
| 1512 // if((max-min)*QP < 500) | |
| 1513 // if(max-min<QP/2) | |
| 1514 if(max-min < 20) | |
| 1515 { | |
| 1516 static int numSkiped=0; | |
| 1517 static int errorSum=0; | |
| 1518 static int worstQP=0; | |
| 1519 static int worstRange=0; | |
| 1520 static int worstDiff=0; | |
| 1521 int diff= (f - *p); | |
| 1522 int absDiff= ABS(diff); | |
| 1523 int error= diff*diff; | |
| 1524 | |
| 1525 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1526 | |
| 1527 numSkiped++; | |
| 1528 if(absDiff > worstDiff) | |
| 1529 { | |
| 1530 worstDiff= absDiff; | |
| 1531 worstQP= QP; | |
| 1532 worstRange= max-min; | |
| 1533 } | |
| 1534 errorSum+= error; | |
| 1535 | |
| 1536 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1537 { | |
| 1538 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1539 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1540 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1541 worstDiff, (float)numSkiped/numPixels); | |
| 1542 } | |
| 1543 } | |
| 1544 } | |
| 1545 #endif | |
| 134 | 1546 if (*p + 2*QP < f) *p= *p + 2*QP; |
| 1547 else if(*p - 2*QP > f) *p= *p - 2*QP; | |
| 1548 else *p=f; | |
| 1549 } | |
| 1550 } | |
| 1551 } | |
| 167 | 1552 #ifdef DEBUG_DERING_THRESHOLD |
| 1553 if(max-min < 20) | |
| 1554 { | |
| 1555 for(y=1; y<9; y++) | |
| 1556 { | |
| 1557 int x; | |
| 1558 int t = 0; | |
| 1559 p= src + stride*y; | |
| 1560 for(x=1; x<9; x++) | |
| 1561 { | |
| 1562 p++; | |
| 1563 *p = MIN(*p + 20, 255); | |
| 1564 } | |
| 1565 } | |
| 1566 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1567 } | |
| 1568 #endif | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1569 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1570 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
1571 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
/**
 * Deinterlaces the given block by linear interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter reads lines 4,6,8,10,12 and overwrites lines 5,7,9,11 with the
 * rounded average of the line directly above and the line directly below
 * @param src    pointer to line 0 of the block (8 pixels are processed per line)
 * @param stride offset in bytes from one line to the next
 */
static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t"
		"movq (%%eax, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm1 \n\t"
		PAVGB(%%mm1, %%mm0)
		"movq %%mm0, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm0 \n\t"
		PAVGB(%%mm0, %%mm1)
		"movq %%mm1, (%%ebx, %1, 2) \n\t"

		: : "r" (src), "r" (stride)
		// "memory": the block is written through the input pointer, which
		// the compiler cannot deduce from the operand list alone
		: "%eax", "%ebx", "memory"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		// the +1 rounds halves up so that this fallback produces the same
		// pixels as the PAVGB based path above (pavgb / pavgusb round up)
		src[stride  ] = (src[0       ] + src[stride*2] + 1)>>1;
		src[stride*3] = (src[stride*2] + src[stride*4] + 1)>>1;
		src[stride*5] = (src[stride*4] + src[stride*6] + 1)>>1;
		src[stride*7] = (src[stride*6] + src[stride*8] + 1)>>1;
		src++;
	}
#endif
}
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1618 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
/**
 * Deinterlaces the given block by cubic interpolation
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter reads lines 3,5,7,...,15 and overwrites lines 6,8,10,12 with
 * (-a + 9b + 9d - e)/16 where b,d are the lines directly above / below the
 * written line and a,e the lines 3 above / 3 below it
 * the result is limited to 0..255 in both versions (packuswb in the SIMD
 * version, an explicit clamp in the C version)
 * @param src    pointer to line 0 of the block (8 pixels are processed per line)
 * @param stride offset in bytes from one line to the next
 */
static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
		"leal (%%ebx, %1, 4), %%ecx \n\t"
		"addl %1, %%ecx \n\t"
		"pxor %%mm7, %%mm7 \n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1	ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0 \n\t"\
		"movq " #b ", %%mm1 \n\t"\
		"movq " #d ", %%mm2 \n\t"\
		"movq " #e ", %%mm3 \n\t"\
		PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
		"movq %%mm0, %%mm2 \n\t"\
		"punpcklbw %%mm7, %%mm0 \n\t"\
		"punpckhbw %%mm7, %%mm2 \n\t"\
		"movq %%mm1, %%mm3 \n\t"\
		"punpcklbw %%mm7, %%mm1 \n\t"\
		"punpckhbw %%mm7, %%mm3 \n\t"\
		"psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1 \n\t"\
		"movq %%mm1, " #c " \n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		// "memory": the block is written through the input pointer, which
		// the compiler cannot deduce from the operand list alone
		: "%eax", "%ebx", "%ecx", "memory"
	);
#else
	int x;
	src+= stride*3;
	for(x=0; x<8; x++)
	{
		int y;
		// only odd lines are written and only even lines are read, so the
		// order in which the 4 lines are produced does not matter
		for(y=3; y<=9; y+=2)
		{
			int v= 9*(src[stride*(y-1)] + src[stride*(y+1)])
				- src[stride*(y-3)] - src[stride*(y+3)];

			// the cubic kernel over/undershoots at sharp edges; clamp
			// instead of letting the store into uint8_t wrap around
			if(v < 0) v= 0;
			else
			{
				v>>= 4;
				if(v > 255) v= 255;
			}
			src[stride*y]= v;
		}
		src++;
	}
#endif
}
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1683 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
/**
 * Deinterlaces the given block by blending each line with its 2 lower neighbours
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * will shift the image up by 1 line (FIXME if this is a problem)
 * this filter will read lines 4-13 and write 4-11
 * every written line L becomes (L + 2*(L+1) + (L+2))/4 (the SIMD version
 * builds this from 2 nested PAVGB averages)
 * @param src    pointer to line 0 of the block (8 pixels are processed per line)
 * @param stride offset in bytes from one line to the next
 */
static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax \n\t"
		"leal (%%eax, %1, 4), %%ebx \n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0 \n\t" // L0
		"movq (%%eax, %1), %%mm1 \n\t" // L2
		PAVGB(%%mm1, %%mm0) // L0+L2
		"movq (%%eax), %%mm2 \n\t" // L1
		PAVGB(%%mm2, %%mm0)
		"movq %%mm0, (%0) \n\t"
		"movq (%%eax, %1, 2), %%mm0 \n\t" // L3
		PAVGB(%%mm0, %%mm2) // L1+L3
		PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
		"movq %%mm2, (%%eax) \n\t"
		"movq (%0, %1, 4), %%mm2 \n\t" // L4
		PAVGB(%%mm2, %%mm1) // L2+L4
		PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1) \n\t"
		"movq (%%ebx), %%mm1 \n\t" // L5
		PAVGB(%%mm1, %%mm0) // L3+L5
		PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2) \n\t"
		"movq (%%ebx, %1), %%mm0 \n\t" // L6
		PAVGB(%%mm0, %%mm2) // L4+L6
		PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4) \n\t"
		"movq (%%ebx, %1, 2), %%mm2 \n\t" // L7
		PAVGB(%%mm2, %%mm1) // L5+L7
		PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
		"movq %%mm1, (%%ebx) \n\t"
		"movq (%0, %1, 8), %%mm1 \n\t" // L8
		PAVGB(%%mm1, %%mm0) // L6+L8
		PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1) \n\t"
		"movq (%%ebx, %1, 4), %%mm0 \n\t" // L9
		PAVGB(%%mm0, %%mm2) // L7+L9
		PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2) \n\t"


		: : "r" (src), "r" (stride)
		// "memory": the block is written through the input pointer, which
		// the compiler cannot deduce from the operand list alone
		: "%eax", "%ebx", "memory"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		int y;
		// must run top to bottom: a line is blended with the 2 lines below
		// it, which therefore still have to hold their unfiltered values
		for(y=0; y<8; y++)
		{
			uint8_t *p= src + stride*y;
			p[0]= (p[0] + 2*p[stride] + p[stride*2])>>2;
		}
		src++;
	}
#endif
}
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1758 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1759 /** |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1760 * Deinterlaces the given block |
| 142 | 1761 * will be called for every 8x8 block and can read & write from line 4-15, |
| 1762 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 1763 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1764 */ |
| 169 | 1765 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1766 { |
| 107 | 1767 #ifdef HAVE_MMX |
| 142 | 1768 src+= 4*stride; |
| 107 | 1769 #ifdef HAVE_MMX2 |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1770 asm volatile( |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1771 "leal (%0, %1), %%eax \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1772 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1773 // 0 1 2 3 4 5 6 7 8 9 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1774 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1775 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1776 "movq (%0), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1777 "movq (%%eax, %1), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1778 "movq (%%eax), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1779 "movq %%mm0, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1780 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1781 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1782 "pmaxub %%mm2, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1783 "pminub %%mm1, %%mm0 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1784 "movq %%mm0, (%%eax) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1785 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1786 "movq (%0, %1, 4), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1787 "movq (%%eax, %1, 2), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1788 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1789 "pmaxub %%mm1, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1790 "pminub %%mm3, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1791 "pmaxub %%mm0, %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1792 "pminub %%mm1, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1793 "movq %%mm2, (%%eax, %1, 2) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1794 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1795 "movq (%%ebx), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1796 "movq (%%ebx, %1), %%mm1 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1797 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1798 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1799 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1800 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1801 "pminub %%mm0, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1802 "movq %%mm2, (%%ebx) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1803 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1804 "movq (%%ebx, %1, 2), %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1805 "movq (%0, %1, 8), %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1806 "movq %%mm2, %%mm3 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1807 "pmaxub %%mm0, %%mm2 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1808 "pminub %%mm3, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1809 "pmaxub %%mm1, %%mm0 \n\t" // |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1810 "pminub %%mm0, %%mm2 \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1811 "movq %%mm2, (%%ebx, %1, 2) \n\t" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1812 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1813 |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1814 : : "r" (src), "r" (stride) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1815 : "%eax", "%ebx" |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1816 ); |
| 107 | 1817 |
| 1818 #else // MMX without MMX2 | |
| 1819 asm volatile( | |
| 1820 "leal (%0, %1), %%eax \n\t" | |
| 1821 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 1822 // 0 1 2 3 4 5 6 7 8 9 | |
| 1823 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 1824 "pxor %%mm7, %%mm7 \n\t" | |
| 1825 | |
| 1826 #define MEDIAN(a,b,c)\ | |
| 1827 "movq " #a ", %%mm0 \n\t"\ | |
| 1828 "movq " #b ", %%mm2 \n\t"\ | |
| 1829 "movq " #c ", %%mm1 \n\t"\ | |
| 1830 "movq %%mm0, %%mm3 \n\t"\ | |
| 1831 "movq %%mm1, %%mm4 \n\t"\ | |
| 1832 "movq %%mm2, %%mm5 \n\t"\ | |
| 1833 "psubusb %%mm1, %%mm3 \n\t"\ | |
| 1834 "psubusb %%mm2, %%mm4 \n\t"\ | |
| 1835 "psubusb %%mm0, %%mm5 \n\t"\ | |
| 1836 "pcmpeqb %%mm7, %%mm3 \n\t"\ | |
| 1837 "pcmpeqb %%mm7, %%mm4 \n\t"\ | |
| 1838 "pcmpeqb %%mm7, %%mm5 \n\t"\ | |
| 1839 "movq %%mm3, %%mm6 \n\t"\ | |
| 1840 "pxor %%mm4, %%mm3 \n\t"\ | |
| 1841 "pxor %%mm5, %%mm4 \n\t"\ | |
| 1842 "pxor %%mm6, %%mm5 \n\t"\ | |
| 1843 "por %%mm3, %%mm1 \n\t"\ | |
| 1844 "por %%mm4, %%mm2 \n\t"\ | |
| 1845 "por %%mm5, %%mm0 \n\t"\ | |
| 1846 "pand %%mm2, %%mm0 \n\t"\ | |
| 1847 "pand %%mm1, %%mm0 \n\t"\ | |
| 1848 "movq %%mm0, " #b " \n\t" | |
| 1849 | |
| 1850 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
| 1851 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
| 1852 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) | |
| 1853 MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8)) | |
| 1854 | |
| 1855 : : "r" (src), "r" (stride) | |
| 1856 : "%eax", "%ebx" | |
| 1857 ); | |
| 1858 #endif // MMX | |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1859 #else |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
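1860 //FIXME: this C fallback computes a (1,2,1)/4 vertical blend of neighbouring lines, not the per-pixel median of three lines computed by the MMX/MMX2 code above |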
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1861 int x; |
| 142 | 1862 src+= 4*stride; |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1863 for(x=0; x<8; x++) |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1864 { |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1865 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1866 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1867 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1868 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1869 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1870 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1871 src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1872 src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1873 src++; |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1874 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1875 #endif |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1876 } |
|
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
1877 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
1878 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1879 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1880 * Transposes and shifts the given 8x8 block into dst1 and dst2 (the transposed rows are stored 16 bytes apart) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1881 */ |
| 169 | 1882 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1883 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1884 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1885 "leal (%0, %1), %%eax \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1886 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1887 // 0 1 2 3 4 5 6 7 8 9 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1888 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1889 "movq (%0), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1890 "movq (%%eax), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1891 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1892 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1893 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1894 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1895 "movq (%%eax, %1), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1896 "movq (%%eax, %1, 2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1897 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1898 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1899 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1900 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1901 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1902 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1903 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1904 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1905 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1906 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1907 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1908 "movd %%mm0, 128(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1909 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1910 "movd %%mm0, 144(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1911 "movd %%mm3, 160(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1912 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1913 "movd %%mm3, 176(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1914 "movd %%mm3, 48(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1915 "movd %%mm2, 192(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1916 "movd %%mm2, 64(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1917 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1918 "movd %%mm2, 80(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1919 "movd %%mm1, 96(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1920 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1921 "movd %%mm1, 112(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1922 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1923 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1924 "movq (%%ebx), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1925 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1926 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1927 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1928 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1929 "movq (%%ebx, %1), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1930 "movq (%%ebx, %1, 2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1931 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1932 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1933 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1934 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1935 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1936 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1937 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1938 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1939 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1940 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1941 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1942 "movd %%mm0, 132(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1943 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1944 "movd %%mm0, 148(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1945 "movd %%mm3, 164(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1946 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1947 "movd %%mm3, 180(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1948 "movd %%mm3, 52(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1949 "movd %%mm2, 196(%2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1950 "movd %%mm2, 68(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1951 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1952 "movd %%mm2, 84(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1953 "movd %%mm1, 100(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1954 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1955 "movd %%mm1, 116(%3) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1956 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1957 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1958 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1959 : "%eax", "%ebx" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1960 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1961 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1962 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1963 /** |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1964 * Transposes the given 8x8 block; src rows are read 16 bytes apart, dst rows are written dstStride bytes apart |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1965 */ |
| 169 | 1966 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1967 { |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1968 asm( |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1969 "leal (%0, %1), %%eax \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1970 "leal (%%eax, %1, 4), %%ebx \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1971 // 0 1 2 3 4 5 6 7 8 9 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1972 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1973 "movq (%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1974 "movq 16(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1975 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1976 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1977 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1978 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1979 "movq 32(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1980 "movq 48(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1981 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1982 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1983 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1984 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1985 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1986 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1987 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1988 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1989 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1990 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1991 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1992 "movd %%mm0, (%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1993 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1994 "movd %%mm0, (%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1995 "movd %%mm3, (%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1996 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1997 "movd %%mm3, (%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1998 "movd %%mm2, (%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
1999 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2000 "movd %%mm2, (%%ebx) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2001 "movd %%mm1, (%%ebx, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2002 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2003 "movd %%mm1, (%%ebx, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2004 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2005 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2006 "movq 64(%2), %%mm0 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2007 "movq 80(%2), %%mm1 \n\t" // abcdefgh |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2008 "movq %%mm0, %%mm2 \n\t" // 12345678 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2009 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2010 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2011 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2012 "movq 96(%2), %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2013 "movq 112(%2), %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2014 "movq %%mm1, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2015 "punpcklbw %%mm3, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2016 "punpckhbw %%mm3, %%mm4 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2017 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2018 "movq %%mm0, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2019 "punpcklwd %%mm1, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2020 "punpckhwd %%mm1, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2021 "movq %%mm2, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2022 "punpcklwd %%mm4, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2023 "punpckhwd %%mm4, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2024 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2025 "movd %%mm0, 4(%0) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2026 "psrlq $32, %%mm0 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2027 "movd %%mm0, 4(%%eax) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2028 "movd %%mm3, 4(%%eax, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2029 "psrlq $32, %%mm3 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2030 "movd %%mm3, 4(%%eax, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2031 "movd %%mm2, 4(%0, %1, 4) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2032 "psrlq $32, %%mm2 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2033 "movd %%mm2, 4(%%ebx) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2034 "movd %%mm1, 4(%%ebx, %1) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2035 "psrlq $32, %%mm1 \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2036 "movd %%mm1, 4(%%ebx, %1, 2) \n\t" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2037 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2038 :: "r" (dst), "r" (dstStride), "r" (src) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2039 : "%eax", "%ebx" |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2040 ); |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2041 } |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2042 #endif |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2043 //static int test=0; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2044 |
| 169 | 2045 static void inline RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 158 | 2046 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
| 156 | 2047 { |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2048 #define FAST_L2_DIFF |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2049 //#define L1_DIFF //you should change the thresholds too if you try that one |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2050 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2051 asm volatile( |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2052 "leal (%2, %2, 2), %%eax \n\t" // 3*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2053 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2054 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2055 // 0 1 2 3 4 5 6 7 8 9 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2056 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2057 //FIXME reorder? |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2058 #ifdef L1_DIFF //needs mmx2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2059 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2060 "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2061 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2062 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2063 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2064 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2065 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2066 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2067 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2068 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2069 "paddw %%mm1, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2070 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2071 "movq (%0, %%ebx), %%mm5 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2072 "paddw %%mm2, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2073 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2074 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2075 "paddw %%mm3, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2076 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2077 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2078 "paddw %%mm4, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2079 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2080 "paddw %%mm5, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2081 "paddw %%mm7, %%mm6 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2082 "paddw %%mm6, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2083 #elif defined (FAST_L2_DIFF) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2084 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 210 | 2085 "movq "MANGLE(b80)", %%mm6 \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2086 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2087 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2088 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2089 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2090 "pxor %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2091 PAVGB(%%mm2, %%mm5)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2092 "paddb %%mm6, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2093 "movq %%mm5, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2094 "psllw $8, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2095 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2096 "pmaddwd %%mm2, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2097 "paddd %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2098 "psrld $14, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2099 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2100 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2101 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2102 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2103 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2104 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2105 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2106 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2107 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2108 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2109 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2110 #else |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2111 "pxor %%mm7, %%mm7 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2112 "pxor %%mm0, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2113 #define L2_DIFF_CORE(a, b)\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2114 "movq " #a ", %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2115 "movq " #b ", %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2116 "movq %%mm5, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2117 "movq %%mm2, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2118 "punpcklbw %%mm7, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2119 "punpckhbw %%mm7, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2120 "punpcklbw %%mm7, %%mm2 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2121 "punpckhbw %%mm7, %%mm3 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2122 "psubw %%mm2, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2123 "psubw %%mm3, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2124 "pmaddwd %%mm5, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2125 "pmaddwd %%mm1, %%mm1 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2126 "paddd %%mm1, %%mm5 \n\t"\ |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2127 "paddd %%mm5, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2128 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2129 L2_DIFF_CORE((%0), (%1)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2130 L2_DIFF_CORE((%0, %2), (%1, %2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2131 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2132 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2133 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2134 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2135 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2136 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2137 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2138 #endif |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2139 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2140 "movq %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2141 "psrlq $32, %%mm0 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2142 "paddd %%mm0, %%mm4 \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2143 "movd %%mm4, %%ecx \n\t" |
| 158 | 2144 "shll $2, %%ecx \n\t" |
| 2145 "movl %3, %%ebx \n\t" | |
| 2146 "addl -4(%%ebx), %%ecx \n\t" | |
| 2147 "addl 4(%%ebx), %%ecx \n\t" | |
| 2148 "addl -1024(%%ebx), %%ecx \n\t" | |
| 2149 "addl $4, %%ecx \n\t" | |
| 2150 "addl 1024(%%ebx), %%ecx \n\t" | |
| 2151 "shrl $3, %%ecx \n\t" | |
| 2152 "movl %%ecx, (%%ebx) \n\t" | |
| 2153 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride | |
| 2154 | |
| 210 | 2155 // "movl %3, %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2156 // "movl %%ecx, test \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2157 // "jmp 4f \n\t" |
| 210 | 2158 "cmpl 4+"MANGLE(maxTmpNoise)", %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2159 " jb 2f \n\t" |
| 210 | 2160 "cmpl 8+"MANGLE(maxTmpNoise)", %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2161 " jb 1f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2162 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2163 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2164 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2165 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2166 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2167 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2168 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2169 "movq (%0, %%ebx), %%mm5 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2170 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2171 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2172 "movq %%mm0, (%1) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2173 "movq %%mm1, (%1, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2174 "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2175 "movq %%mm3, (%1, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2176 "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2177 "movq %%mm5, (%1, %%ebx) \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2178 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2179 "movq %%mm7, (%1, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2180 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2181 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2182 "1: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2183 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2184 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2185 "pavgb (%1), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2186 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2187 "pavgb (%1, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2188 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2189 "pavgb (%1, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2190 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2191 "pavgb (%1, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2192 "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2193 "pavgb (%1, %2, 4), %%mm4 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2194 "movq (%0, %%ebx), %%mm5 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2195 "pavgb (%1, %%ebx), %%mm5 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2196 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2197 "pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2198 "movq (%0, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2199 "pavgb (%1, %%ecx), %%mm7 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2200 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2201 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2202 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2203 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2204 "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2205 "movq %%mm5, (%1, %%ebx) \n\t" // R5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2206 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2207 "movq %%mm7, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2208 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2209 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2210 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2211 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2212 "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2213 "movq %%mm5, (%0, %%ebx) \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2214 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2215 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2216 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2217 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2218 "2: \n\t" |
| 210 | 2219 "cmpl "MANGLE(maxTmpNoise)", %%ecx \n\t" |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2220 " jb 3f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2221 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2222 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2223 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2224 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2225 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2226 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2227 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2228 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2229 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2230 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2231 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2232 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2233 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2234 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2235 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2236 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2237 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2238 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2239 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2240 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2241 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2242 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2243 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2244 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2245 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2246 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2247 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2248 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2249 "movq (%0, %%ebx), %%mm1 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2250 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2251 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2252 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2253 "movq (%1, %%ebx), %%mm5 \n\t" // R5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2254 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2255 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2256 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2257 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2258 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2259 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2260 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2261 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2262 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2263 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2264 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2265 "movq %%mm1, (%1, %%ebx) \n\t" // R5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2266 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2267 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2268 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2269 "movq %%mm1, (%0, %%ebx) \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2270 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2271 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2272 "jmp 4f \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2273 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2274 "3: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2275 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2276 "movq (%0), %%mm0 \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2277 "movq (%0, %2), %%mm1 \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2278 "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2279 "movq (%0, %%eax), %%mm3 \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2280 "movq (%1), %%mm4 \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2281 "movq (%1, %2), %%mm5 \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2282 "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2283 "movq (%1, %%eax), %%mm7 \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2284 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2285 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2286 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2287 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2288 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2289 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2290 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2291 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2292 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2293 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2294 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2295 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2296 "movq %%mm0, (%1) \n\t" // R0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2297 "movq %%mm1, (%1, %2) \n\t" // R1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2298 "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2299 "movq %%mm3, (%1, %%eax) \n\t" // R3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2300 "movq %%mm0, (%0) \n\t" // L0 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2301 "movq %%mm1, (%0, %2) \n\t" // L1 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2302 "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2303 "movq %%mm3, (%0, %%eax) \n\t" // L3 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2304 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2305 "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2306 "movq (%0, %%ebx), %%mm1 \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2307 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2308 "movq (%0, %%ecx), %%mm3 \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2309 "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2310 "movq (%1, %%ebx), %%mm5 \n\t" // R5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2311 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2312 "movq (%1, %%ecx), %%mm7 \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2313 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2314 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2315 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2316 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2317 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2318 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2319 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2320 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2321 PAVGB(%%mm4, %%mm0) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2322 PAVGB(%%mm5, %%mm1) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2323 PAVGB(%%mm6, %%mm2) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2324 PAVGB(%%mm7, %%mm3) |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2325 "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2326 "movq %%mm1, (%1, %%ebx) \n\t" // R5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2327 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2328 "movq %%mm3, (%1, %%ecx) \n\t" // R7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2329 "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2330 "movq %%mm1, (%0, %%ebx) \n\t" // L5 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2331 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2332 "movq %%mm3, (%0, %%ecx) \n\t" // L7 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2333 |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2334 "4: \n\t" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2335 |
| 158 | 2336 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2337 : "%eax", "%ebx", "%ecx", "memory" |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2338 ); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2339 //printf("%d\n", test); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2340 #else |
| 156 | 2341 int y; |
| 2342 int d=0; | |
| 2343 int sysd=0; | |
| 158 | 2344 int i; |
| 156 | 2345 |
| 2346 for(y=0; y<8; y++) | |
| 2347 { | |
| 2348 int x; | |
| 2349 for(x=0; x<8; x++) | |
| 2350 { | |
| 2351 int ref= tempBlured[ x + y*stride ]; | |
| 2352 int cur= src[ x + y*stride ]; | |
| 2353 int d1=ref - cur; | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2354 // if(x==0 || x==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2355 // if(y==0 || y==7) d1+= d1>>1; |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2356 // d+= ABS(d1); |
|
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2357 d+= d1*d1; |
| 156 | 2358 sysd+= d1; |
| 2359 } | |
| 2360 } | |
| 158 | 2361 i=d; |
| 2362 d= ( | |
| 2363 4*d | |
| 2364 +(*(tempBluredPast-256)) | |
| 2365 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 2366 +(*(tempBluredPast+256)) | |
| 2367 +4)>>3; | |
| 2368 *tempBluredPast=i; | |
| 2369 // ((*tempBluredPast)*3 + d + 2)>>2; | |
| 2370 | |
| 156 | 2371 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); |
| 2372 /* | |
| 2373 Switch between | |
| 2374 1 0 0 0 0 0 0 (0) | |
| 2375 64 32 16 8 4 2 1 (1) | |
| 2376 64 48 36 27 20 15 11 (33) (approx) | |
| 2377 64 56 49 43 37 33 29 (200) (approx) | |
| 2378 */ | |
| 2379 if(d > maxNoise[1]) | |
| 2380 { | |
| 2381 if(d < maxNoise[2]) | |
| 2382 { | |
| 2383 for(y=0; y<8; y++) | |
| 2384 { | |
| 2385 int x; | |
| 2386 for(x=0; x<8; x++) | |
| 2387 { | |
| 2388 int ref= tempBlured[ x + y*stride ]; | |
| 2389 int cur= src[ x + y*stride ]; | |
| 2390 tempBlured[ x + y*stride ]= | |
| 2391 src[ x + y*stride ]= | |
| 2392 (ref + cur + 1)>>1; | |
| 2393 } | |
| 2394 } | |
| 2395 } | |
| 2396 else | |
| 2397 { | |
| 2398 for(y=0; y<8; y++) | |
| 2399 { | |
| 2400 int x; | |
| 2401 for(x=0; x<8; x++) | |
| 2402 { | |
| 2403 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
| 2404 } | |
| 2405 } | |
| 2406 } | |
| 2407 } | |
| 2408 else | |
| 2409 { | |
| 2410 if(d < maxNoise[0]) | |
| 2411 { | |
| 2412 for(y=0; y<8; y++) | |
| 2413 { | |
| 2414 int x; | |
| 2415 for(x=0; x<8; x++) | |
| 2416 { | |
| 2417 int ref= tempBlured[ x + y*stride ]; | |
| 2418 int cur= src[ x + y*stride ]; | |
| 2419 tempBlured[ x + y*stride ]= | |
| 2420 src[ x + y*stride ]= | |
| 2421 (ref*7 + cur + 4)>>3; | |
| 2422 } | |
| 2423 } | |
| 2424 } | |
| 2425 else | |
| 2426 { | |
| 2427 for(y=0; y<8; y++) | |
| 2428 { | |
| 2429 int x; | |
| 2430 for(x=0; x<8; x++) | |
| 2431 { | |
| 2432 int ref= tempBlured[ x + y*stride ]; | |
| 2433 int cur= src[ x + y*stride ]; | |
| 2434 tempBlured[ x + y*stride ]= | |
| 2435 src[ x + y*stride ]= | |
| 2436 (ref*3 + cur + 2)>>2; | |
| 2437 } | |
| 2438 } | |
| 2439 } | |
| 2440 } | |
|
157
bc12fd7e6153
temp denoiser changes: (a-b)^2 instead of |a-b| and MMX2/3DNOW version
michael
parents:
156
diff
changeset
|
2441 #endif |
| 156 | 2442 } |
| 2443 | |
| 169 | 2444 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 156 | 2445 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); |
| 96 | 2446 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2447 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2448 * Copies a block from src to dst and fixes the blacklevel |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2449 * levelFix == 0 -> don't touch the brightness & contrast |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2450 */ |
| 169 | 2451 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 164 | 2452 int levelFix) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2453 { |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2454 #ifndef HAVE_MMX |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2455 int i; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2456 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2457 if(levelFix) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2458 { |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2459 #ifdef HAVE_MMX |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2460 asm volatile( |
| 166 | 2461 "leal (%0,%2), %%eax \n\t" |
| 2462 "leal (%1,%3), %%ebx \n\t" | |
| 210 | 2463 "movq "MANGLE(packedYOffset)", %%mm2\n\t" |
| 2464 "movq "MANGLE(packedYScale)", %%mm3\n\t" | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2465 "pxor %%mm4, %%mm4 \n\t" |
| 173 | 2466 #ifdef HAVE_MMX2 |
| 2467 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
| 2468 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2469 "movq " #src1 ", %%mm5 \n\t"\ | |
| 2470 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2471 "movq " #src2 ", %%mm6 \n\t"\ | |
| 2472 "punpcklbw %%mm0, %%mm0 \n\t"\ | |
| 2473 "punpckhbw %%mm5, %%mm5 \n\t"\ | |
| 2474 "punpcklbw %%mm1, %%mm1 \n\t"\ | |
| 2475 "punpckhbw %%mm6, %%mm6 \n\t"\ | |
| 2476 "pmulhuw %%mm3, %%mm0 \n\t"\ | |
| 2477 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 2478 "pmulhuw %%mm3, %%mm1 \n\t"\ | |
| 2479 "pmulhuw %%mm3, %%mm6 \n\t"\ | |
| 2480 "psubw %%mm2, %%mm0 \n\t"\ | |
| 2481 "psubw %%mm2, %%mm5 \n\t"\ | |
| 2482 "psubw %%mm2, %%mm1 \n\t"\ | |
| 2483 "psubw %%mm2, %%mm6 \n\t"\ | |
| 2484 "packuswb %%mm5, %%mm0 \n\t"\ | |
| 2485 "packuswb %%mm6, %%mm1 \n\t"\ | |
| 2486 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2487 "movq %%mm1, " #dst2 " \n\t"\ | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2488 |
| 173 | 2489 #else //HAVE_MMX2 |
| 166 | 2490 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
| 2491 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2492 "movq " #src1 ", %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2493 "punpcklbw %%mm4, %%mm0 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2494 "punpckhbw %%mm4, %%mm5 \n\t"\ |
| 117 | 2495 "psubw %%mm2, %%mm0 \n\t"\ |
| 2496 "psubw %%mm2, %%mm5 \n\t"\ | |
| 166 | 2497 "movq " #src2 ", %%mm1 \n\t"\ |
| 117 | 2498 "psllw $6, %%mm0 \n\t"\ |
| 2499 "psllw $6, %%mm5 \n\t"\ | |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2500 "pmulhw %%mm3, %%mm0 \n\t"\ |
| 166 | 2501 "movq " #src2 ", %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2502 "pmulhw %%mm3, %%mm5 \n\t"\ |
|
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2503 "punpcklbw %%mm4, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2504 "punpckhbw %%mm4, %%mm6 \n\t"\ |
| 117 | 2505 "psubw %%mm2, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2506 "psubw %%mm2, %%mm6 \n\t"\ |
| 117 | 2507 "psllw $6, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2508 "psllw $6, %%mm6 \n\t"\ |
|
101
fcf4e8fcb34b
fixed a sig4 bug an non mmx2 cpus (in case of more sig4 errors please send me a "disassemble $eip-16 $eip+16" from gdb)
michael
parents:
100
diff
changeset
|
2509 "pmulhw %%mm3, %%mm1 \n\t"\ |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2510 "pmulhw %%mm3, %%mm6 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2511 "packuswb %%mm5, %%mm0 \n\t"\ |
|
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2512 "packuswb %%mm6, %%mm1 \n\t"\ |
| 166 | 2513 "movq %%mm0, " #dst1 " \n\t"\ |
| 2514 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2515 | |
| 173 | 2516 #endif //!HAVE_MMX2 |
| 2517 | |
| 166 | 2518 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
| 2519 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | |
| 2520 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | |
| 2521 "leal (%%eax,%2,4), %%eax \n\t" | |
| 2522 "leal (%%ebx,%3,4), %%ebx \n\t" | |
| 2523 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | |
| 2524 | |
| 2525 | |
| 2526 : : "r"(src), | |
| 2527 "r"(dst), | |
| 2528 "r" (srcStride), | |
|
118
3dd1950ac98d
brightness / contrast fix/copy optimizations +2% speedup
michael
parents:
117
diff
changeset
|
2529 "r" (dstStride) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2530 : "%eax", "%ebx" |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2531 ); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2532 #else |
| 164 | 2533 for(i=0; i<8; i++) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2534 memcpy( &(dst[dstStride*i]), |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2535 &(src[srcStride*i]), BLOCK_SIZE); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2536 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2537 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2538 else |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2539 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2540 #ifdef HAVE_MMX |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2541 asm volatile( |
| 166 | 2542 "leal (%0,%2), %%eax \n\t" |
| 2543 "leal (%1,%3), %%ebx \n\t" | |
| 2544 | |
| 2545 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
| 2546 "movq " #src1 ", %%mm0 \n\t"\ | |
| 2547 "movq " #src2 ", %%mm1 \n\t"\ | |
| 2548 "movq %%mm0, " #dst1 " \n\t"\ | |
| 2549 "movq %%mm1, " #dst2 " \n\t"\ | |
| 2550 | |
| 2551 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 2552 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | |
| 2553 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | |
| 2554 "leal (%%eax,%2,4), %%eax \n\t" | |
| 2555 "leal (%%ebx,%3,4), %%ebx \n\t" | |
| 2556 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | |
| 2557 | |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2558 : : "r" (src), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2559 "r" (dst), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2560 "r" (srcStride), |
| 164 | 2561 "r" (dstStride) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2562 : "%eax", "%ebx" |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2563 ); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2564 #else |
| 164 | 2565 for(i=0; i<8; i++) |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2566 memcpy( &(dst[dstStride*i]), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2567 &(src[srcStride*i]), BLOCK_SIZE); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2568 #endif |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2569 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2570 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2571 |
| 224 | 2572 /** |
| 2573 * Duplicates the given 8 src pixels 3 times upward (to src - stride, src - 2*stride and src - 3*stride) | |
| 2574 */ | |
| 2575 static inline void RENAME(duplicate)(uint8_t src[], int stride) | |
| 2576 { | |
| 2577 #ifdef HAVE_MMX | |
| 2578 asm volatile( | |
| 2579 "movq (%0), %%mm0 \n\t" | |
| 2580 "addl %1, %0 \n\t" | |
| 2581 "movq %%mm0, (%0) \n\t" | |
| 2582 "movq %%mm0, (%0, %1) \n\t" | |
| 2583 "movq %%mm0, (%0, %1, 2) \n\t" | |
| 2584 : "+r" (src) | |
| 2585 : "r" (-stride) | |
| 2586 ); | |
| 2587 #else | |
| 2588 int i; | |
| 2589 uint8_t *p=src; | |
| 2590 for(i=0; i<3; i++) | |
| 2591 { | |
| 2592 p-= stride; | |
| 2593 memcpy(p, src, 8); | |
| 2594 } | |
| 2595 #endif | |
| 2596 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2597 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2598 /** |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2599 * Filters array of bytes (Y or U or V values) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2600 */ |
| 169 | 2601 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 156 | 2602 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2603 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2604 int x,y; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2605 #ifdef COMPILE_TIME_MODE |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2606 const int mode= COMPILE_TIME_MODE; |
|
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2607 #else |
| 156 | 2608 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; |
|
172
a0efaf471d6b
compiletime pp-mode support (luminance = chrominance filters though) 1-2% faster with -benchmark -vo null -nosound
michael
parents:
169
diff
changeset
|
2609 #endif |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2610 /* we need 64bit here otherwise we´ll going to have a problem |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2611 after watching a black picture for 5 hours*/ |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2612 static uint64_t *yHistogram= NULL; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2613 int black=0, white=255; // blackest black and whitest white in the picture |
| 223 | 2614 int QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2615 |
| 111 | 2616 /* Temporary buffers for handling the last row(s) */ |
| 2617 static uint8_t *tempDst= NULL; | |
| 2618 static uint8_t *tempSrc= NULL; | |
| 2619 | |
| 112 | 2620 /* Temporary buffers for handling the last block */ |
| 2621 static uint8_t *tempDstBlock= NULL; | |
| 2622 static uint8_t *tempSrcBlock= NULL; | |
| 2623 | |
| 156 | 2624 /* Temporal noise reducing buffers */ |
| 2625 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; | |
| 158 | 2626 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; |
| 156 | 2627 |
| 164 | 2628 int copyAhead; |
| 2629 | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2630 #ifdef PP_FUNNY_STRIDE |
| 112 | 2631 uint8_t *dstBlockPtrBackup; |
| 2632 uint8_t *srcBlockPtrBackup; | |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2633 #endif |
| 112 | 2634 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2635 #ifdef MORE_TIMING |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2636 long long T0, T1, diffTime=0; |
|
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2637 #endif |
| 107 | 2638 #ifdef TIMING |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
2639 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2640 sumTime= rdtsc(); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2641 #endif |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2642 dcOffset= ppMode->maxDcDiff; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2643 dcThreshold= ppMode->maxDcDiff*2 + 1; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2644 |
| 158 | 2645 #ifdef HAVE_MMX |
| 2646 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; | |
| 2647 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; | |
| 2648 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; | |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2649 |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2650 mmxDCOffset= 0x7F - dcOffset; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2651 mmxDCThreshold= 0x7F - dcThreshold; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2652 |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2653 mmxDCOffset*= 0x0101010101010101LL; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2654 mmxDCThreshold*= 0x0101010101010101LL; |
| 158 | 2655 #endif |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2656 |
| 164 | 2657 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
| 2658 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; | |
| 2659 else if( (mode & V_DEBLOCK) | |
| 2660 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
| 2661 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
| 2662 else if(mode & V_X1_FILTER) copyAhead=11; | |
| 2663 else if(mode & V_RK1_FILTER) copyAhead=10; | |
| 2664 else if(mode & DERING) copyAhead=9; | |
| 2665 else copyAhead=8; | |
| 2666 | |
| 2667 copyAhead-= 8; | |
| 2668 | |
| 111 | 2669 if(tempDst==NULL) |
| 2670 { | |
| 2671 tempDst= (uint8_t*)memalign(8, 1024*24); | |
| 2672 tempSrc= (uint8_t*)memalign(8, 1024*24); | |
| 112 | 2673 tempDstBlock= (uint8_t*)memalign(8, 1024*24); |
| 2674 tempSrcBlock= (uint8_t*)memalign(8, 1024*24); | |
| 111 | 2675 } |
| 2676 | |
| 156 | 2677 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER)) |
| 2678 { | |
| 2679 // printf("%d %d %d\n", isColor, dstStride, height); | |
| 2680 //FIXME works only as long as the size doesn't increase | |
| 2681 //Note: the +17*1024 is just there so I don't have to worry about r/w over the end | |
| 2682 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024); | |
| 158 | 2683 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024); |
| 156 | 2684 |
| 2685 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024); | |
| 158 | 2686 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024); |
| 156 | 2687 } |
| 2688 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2689 if(!yHistogram) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2690 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2691 int i; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2692 yHistogram= (uint64_t*)malloc(8*256); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2693 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; |
| 112 | 2694 |
| 2695 if(mode & FULL_Y_RANGE) | |
| 2696 { | |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2697 ppMode->maxAllowedY=255; |
|
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2698 ppMode->minAllowedY=0; |
| 112 | 2699 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2700 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2701 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2702 if(!isColor) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2703 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2704 uint64_t sum= 0; |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2705 int i; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2706 static int framenum= -1; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2707 uint64_t maxClipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2708 uint64_t clipped; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2709 double scale; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2710 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2711 framenum++; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2712 if(framenum == 1) yHistogram[0]= width*height/64*15/256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2713 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2714 for(i=0; i<256; i++) |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2715 { |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2716 sum+= yHistogram[i]; |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2717 // printf("%d ", yHistogram[i]); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2718 } |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2719 // printf("\n\n"); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2720 |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2721 /* we always get a completely black picture first */ |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2722 maxClipped= (uint64_t)(sum * maxClippedThreshold); |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2723 |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2724 clipped= sum; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2725 for(black=255; black>0; black--) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2726 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2727 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2728 clipped-= yHistogram[black]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2729 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2730 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2731 clipped= sum; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2732 for(white=0; white<256; white++) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2733 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2734 if(clipped < maxClipped) break; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2735 clipped-= yHistogram[white]; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2736 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2737 |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2738 scale= (double)(ppMode->maxAllowedY - ppMode->minAllowedY) / (double)(white-black); |
| 173 | 2739 |
| 2740 #ifdef HAVE_MMX2 | |
| 2741 packedYScale= (uint16_t)(scale*256.0 + 0.5); | |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2742 packedYOffset= (((black*packedYScale)>>8) - ppMode->minAllowedY) & 0xFFFF; |
| 173 | 2743 #else |
| 2744 packedYScale= (uint16_t)(scale*1024.0 + 0.5); | |
|
182
3ccd74a91074
minor brightness/contrast bugfix / moved some global vars into ppMode
michael
parents:
181
diff
changeset
|
2745 packedYOffset= (black - ppMode->minAllowedY) & 0xFFFF; |
| 173 | 2746 #endif |
| 2747 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2748 packedYOffset|= packedYOffset<<32; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2749 packedYOffset|= packedYOffset<<16; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2750 |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2751 packedYScale|= packedYScale<<32; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2752 packedYScale|= packedYScale<<16; |
| 223 | 2753 |
| 2754 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); | |
| 2755 else QPCorrecture= 256*256; | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2756 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2757 else |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2758 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2759 packedYScale= 0x0100010001000100LL; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2760 packedYOffset= 0; |
| 223 | 2761 QPCorrecture= 256*256; |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2762 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2763 |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
2764 /* copy & deinterlace first row of blocks */ |
| 142 | 2765 y=-BLOCK_SIZE; |
| 2766 { | |
| 2767 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 224 | 2768 uint8_t *dstBlock= tempDst + dstStride; |
| 142 | 2769 |
| 2770 // From this point on it is guaranteed that we can read and write 16 lines downward | |
| 2771 // finish 1 block before the next, otherwise we might have a problem | |
| 2772 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 2773 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 2774 { | |
| 2775 | |
| 2776 #ifdef HAVE_MMX2 | |
| 2777 /* | |
| 2778 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 2779 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 2780 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 2781 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 2782 */ | |
| 2783 | |
| 2784 asm( | |
| 2785 "movl %4, %%eax \n\t" | |
| 2786 "shrl $2, %%eax \n\t" | |
| 2787 "andl $6, %%eax \n\t" | |
| 164 | 2788 "addl %5, %%eax \n\t" |
| 142 | 2789 "movl %%eax, %%ebx \n\t" |
| 2790 "imul %1, %%eax \n\t" | |
| 2791 "imul %3, %%ebx \n\t" | |
| 2792 "prefetchnta 32(%%eax, %0) \n\t" | |
| 2793 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 2794 "addl %1, %%eax \n\t" | |
| 2795 "addl %3, %%ebx \n\t" | |
| 2796 "prefetchnta 32(%%eax, %0) \n\t" | |
| 2797 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 2798 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
| 164 | 2799 "m" (x), "m" (copyAhead) |
| 142 | 2800 : "%eax", "%ebx" |
| 2801 ); | |
| 2802 | |
| 2803 #elif defined(HAVE_3DNOW) | |
| 2804 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 2805 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 2806 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 2807 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 2808 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 2809 */ | |
| 2810 #endif | |
| 2811 | |
| 224 | 2812 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
| 2813 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX); | |
| 2814 | |
| 2815 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); | |
| 142 | 2816 |
| 2817 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 169 | 2818 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 142 | 2819 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 169 | 2820 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
| 142 | 2821 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 2822 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 142 | 2823 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 2824 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 142 | 2825 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 2826 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
| 142 | 2827 */ |
| 2828 dstBlock+=8; | |
| 2829 srcBlock+=8; | |
| 2830 } | |
| 224 | 2831 memcpy(dst, tempDst + 9*dstStride, copyAhead*dstStride ); |
| 142 | 2832 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2833 |
| 111 | 2834 for(y=0; y<height; y+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2835 { |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2836 //1% speedup if these are here instead of the inner loop |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2837 uint8_t *srcBlock= &(src[y*srcStride]); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2838 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 169 | 2839 #ifdef HAVE_MMX |
| 2840 uint8_t *tempBlock1= tempBlocks; | |
| 2841 uint8_t *tempBlock2= tempBlocks + 8; | |
| 2842 #endif | |
| 126 | 2843 #ifdef ARCH_X86 |
| 2844 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | |
| 223 | 2845 int QPDelta= isColor ? (-1) : 1<<31; |
| 2846 int QPFrac= 1<<30; | |
| 126 | 2847 #endif |
| 156 | 2848 int QP=0; |
| 130 | 2849 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 2850 if not, then use a temporary buffer */ | |
| 111 | 2851 if(y+15 >= height) |
| 2852 { | |
| 156 | 2853 int i; |
| 164 | 2854 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
| 111 | 2855 blockcopy to dst later */ |
| 164 | 2856 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
| 2857 srcStride*MAX(height-y-copyAhead, 0) ); | |
| 2858 | |
| 2859 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ | |
| 2860 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
| 156 | 2861 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
| 2862 | |
| 164 | 2863 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
| 2864 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
| 2865 | |
| 2866 /* duplicate last line of dst to fill the void up to line (copyAhead) */ | |
| 2867 for(i=height-y+1; i<=copyAhead; i++) | |
| 156 | 2868 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
| 2869 | |
| 130 | 2870 dstBlock= tempDst + dstStride; |
| 111 | 2871 srcBlock= tempSrc; |
| 2872 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2873 |
| 112 | 2874 // From this point on it is guaranteed that we can read and write 16 lines downward |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2875 // finish 1 block before the next, otherwise we might have a problem |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2876 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2877 for(x=0; x<width; x+=BLOCK_SIZE) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2878 { |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
2879 const int stride= dstStride; |
| 169 | 2880 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
2881 uint8_t *tmpXchg; |
| 169 | 2882 #endif |
| 126 | 2883 #ifdef ARCH_X86 |
| 156 | 2884 QP= *QPptr; |
| 126 | 2885 asm volatile( |
| 2886 "addl %2, %1 \n\t" | |
| 2887 "sbbl %%eax, %%eax \n\t" | |
| 2888 "shll $2, %%eax \n\t" | |
| 2889 "subl %%eax, %0 \n\t" | |
| 2890 : "+r" (QPptr), "+m" (QPFrac) | |
| 2891 : "r" (QPDelta) | |
| 2892 : "%eax" | |
| 2893 ); | |
| 2894 #else | |
| 156 | 2895 QP= isColor ? |
| 126 | 2896 QPs[(y>>3)*QPStride + (x>>3)]: |
| 2897 QPs[(y>>4)*QPStride + (x>>4)]; | |
| 2898 #endif | |
| 2899 if(!isColor) | |
| 121 | 2900 { |
| 223 | 2901 QP= (QP* QPCorrecture + 256*128)>>16; |
|
148
1cfc4d567c0a
minor changes (fixed some warnings, added attribute aligned(8) stuff)
michael
parents:
142
diff
changeset
|
2902 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
| 121 | 2903 } |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2904 #ifdef HAVE_MMX |
| 111 | 2905 asm volatile( |
| 2906 "movd %0, %%mm7 \n\t" | |
| 2907 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | |
| 2908 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 2909 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 210 | 2910 "movq %%mm7, "MANGLE(pQPb)" \n\t" |
| 111 | 2911 : : "r" (QP) |
| 2912 ); | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2913 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2914 |
| 107 | 2915 #ifdef MORE_TIMING |
| 111 | 2916 T0= rdtsc(); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2917 #endif |
| 96 | 2918 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2919 #ifdef HAVE_MMX2 |
| 126 | 2920 /* |
| 2921 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 2922 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 2923 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 2924 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 2925 */ | |
| 2926 | |
| 2927 asm( | |
| 2928 "movl %4, %%eax \n\t" | |
| 2929 "shrl $2, %%eax \n\t" | |
| 2930 "andl $6, %%eax \n\t" | |
| 164 | 2931 "addl %5, %%eax \n\t" |
| 126 | 2932 "movl %%eax, %%ebx \n\t" |
| 2933 "imul %1, %%eax \n\t" | |
| 2934 "imul %3, %%ebx \n\t" | |
| 2935 "prefetchnta 32(%%eax, %0) \n\t" | |
| 2936 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 2937 "addl %1, %%eax \n\t" | |
| 2938 "addl %3, %%ebx \n\t" | |
| 2939 "prefetchnta 32(%%eax, %0) \n\t" | |
| 2940 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 2941 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
| 164 | 2942 "m" (x), "m" (copyAhead) |
| 126 | 2943 : "%eax", "%ebx" |
| 2944 ); | |
| 2945 | |
| 96 | 2946 #elif defined(HAVE_3DNOW) |
| 2947 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 111 | 2948 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 2949 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 2950 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 2951 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 96 | 2952 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2953 #endif |
| 111 | 2954 |
| 113 | 2955 #ifdef PP_FUNNY_STRIDE |
| 112 | 2956 //can we mess with a 8x16 block, if not use a temp buffer, yes again |
| 2957 if(x+7 >= width) | |
| 2958 { | |
| 2959 int i; | |
| 2960 dstBlockPtrBackup= dstBlock; | |
| 2961 srcBlockPtrBackup= srcBlock; | |
| 2962 | |
| 2963 for(i=0;i<BLOCK_SIZE*2; i++) | |
| 2964 { | |
| 2965 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x); | |
| 2966 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x); | |
| 2967 } | |
| 2968 | |
| 2969 dstBlock= tempDstBlock; | |
| 2970 srcBlock= tempSrcBlock; | |
| 2971 } | |
| 113 | 2972 #endif |
| 112 | 2973 |
| 169 | 2974 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
| 164 | 2975 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2976 |
| 111 | 2977 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 169 | 2978 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
| 111 | 2979 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 169 | 2980 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride); |
| 111 | 2981 else if(mode & MEDIAN_DEINT_FILTER) |
| 169 | 2982 RENAME(deInterlaceMedian)(dstBlock, dstStride); |
| 111 | 2983 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 169 | 2984 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
| 111 | 2985 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| 169 | 2986 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
|
106
389391a6d0bf
rewrote the horizontal lowpass filter to fix a bug which caused a blocky look
michael
parents:
105
diff
changeset
|
2987 */ |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2988 |
| 111 | 2989 /* only deblock if we have 2 blocks */ |
| 2990 if(y + 8 < height) | |
| 2991 { | |
| 107 | 2992 #ifdef MORE_TIMING |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2993 T1= rdtsc(); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2994 memcpyTime+= T1-T0; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2995 T0=T1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
2996 #endif |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
2997 if(mode & V_RK1_FILTER) |
| 169 | 2998 RENAME(vertRK1Filter)(dstBlock, stride, QP); |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
2999 else if(mode & V_X1_FILTER) |
| 169 | 3000 RENAME(vertX1Filter)(dstBlock, stride, QP); |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3001 else if(mode & V_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3002 { |
| 169 | 3003 if( RENAME(isVertDC)(dstBlock, stride)) |
| 96 | 3004 { |
| 169 | 3005 if(RENAME(isVertMinMaxOk)(dstBlock, stride, QP)) |
| 3006 RENAME(doVertLowPass)(dstBlock, stride, QP); | |
| 96 | 3007 } |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3008 else |
| 169 | 3009 RENAME(doVertDefFilter)(dstBlock, stride, QP); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3010 } |
| 107 | 3011 #ifdef MORE_TIMING |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3012 T1= rdtsc(); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3013 vertTime+= T1-T0; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3014 T0=T1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3015 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3016 } |
| 130 | 3017 |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3018 #ifdef HAVE_MMX |
| 169 | 3019 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3020 #endif |
| 111 | 3021 /* check if we have a previous block to deblock it with dstBlock */ |
| 112 | 3022 if(x - 8 >= 0) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3023 { |
| 107 | 3024 #ifdef MORE_TIMING |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3025 T0= rdtsc(); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3026 #endif |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3027 #ifdef HAVE_MMX |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3028 if(mode & H_RK1_FILTER) |
| 169 | 3029 RENAME(vertRK1Filter)(tempBlock1, 16, QP); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3030 else if(mode & H_X1_FILTER) |
| 169 | 3031 RENAME(vertX1Filter)(tempBlock1, 16, QP); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3032 else if(mode & H_DEBLOCK) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3033 { |
| 169 | 3034 if( RENAME(isVertDC)(tempBlock1, 16) ) |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3035 { |
| 169 | 3036 if(RENAME(isVertMinMaxOk)(tempBlock1, 16, QP)) |
| 3037 RENAME(doVertLowPass)(tempBlock1, 16, QP); | |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3038 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3039 else |
| 169 | 3040 RENAME(doVertDefFilter)(tempBlock1, 16, QP); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3041 } |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3042 |
| 169 | 3043 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3044 |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3045 #else |
|
115
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3046 if(mode & H_X1_FILTER) |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3047 horizX1Filter(dstBlock-4, stride, QP); |
|
4514b8e7f0f1
more logic behavior if the altenative deblock filters are used (turning a alt filter on without turning the deblock filter on uses the alt filter instead of using no filter now)
michael
parents:
113
diff
changeset
|
3048 else if(mode & H_DEBLOCK) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3049 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3050 if( isHorizDC(dstBlock-4, stride)) |
| 96 | 3051 { |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3052 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3053 doHorizLowPass(dstBlock-4, stride, QP); |
| 96 | 3054 } |
|
115
4514b8e7f0f1
more logical behavior if the alternative deblock filters are used (turning an alt filter on without turning the deblock filter on now uses the alt filter instead of using no filter)
michael
parents:
113
diff
changeset
|
3055 else |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3056 doHorizDefFilter(dstBlock-4, stride, QP); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3057 } |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3058 #endif |
| 107 | 3059 #ifdef MORE_TIMING |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3060 T1= rdtsc(); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3061 horizTime+= T1-T0; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3062 T0=T1; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3063 #endif |
| 130 | 3064 if(mode & DERING) |
| 3065 { | |
| 3066 //FIXME filter first line | |
| 169 | 3067 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, QP); |
| 130 | 3068 } |
| 156 | 3069 |
| 3070 if(mode & TEMP_NOISE_FILTER) | |
| 3071 { | |
| 169 | 3072 RENAME(tempNoiseReducer)(dstBlock-8, stride, |
| 156 | 3073 tempBlured[isColor] + y*dstStride + x, |
| 158 | 3074 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
| 156 | 3075 ppMode->maxTmpNoise); |
| 3076 } | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3077 } |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3078 |
| 113 | 3079 #ifdef PP_FUNNY_STRIDE |
| 112 | 3080 /* did we use a tmp-block buffer */ |
| 3081 if(x+7 >= width) | |
| 3082 { | |
| 3083 int i; | |
| 3084 dstBlock= dstBlockPtrBackup; | |
| 3085 srcBlock= srcBlockPtrBackup; | |
| 3086 | |
| 3087 for(i=0;i<BLOCK_SIZE*2; i++) | |
| 3088 { | |
| 3089 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x); | |
| 3090 } | |
| 3091 } | |
| 113 | 3092 #endif |
| 112 | 3093 |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3094 dstBlock+=8; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3095 srcBlock+=8; |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3096 |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3097 #ifdef HAVE_MMX |
|
128
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3098 tmpXchg= tempBlock1; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3099 tempBlock1= tempBlock2; |
|
e5266b8e79be
much better horizontal filters (transpose & use the vertical ones) :)
michael
parents:
126
diff
changeset
|
3100 tempBlock2 = tmpXchg; |
|
129
be35346e27c1
fixed difference with -vo md5 between doVertDefFilter() C and MMX / MMX2 versions
michael
parents:
128
diff
changeset
|
3101 #endif |
| 111 | 3102 } |
| 3103 | |
| 156 | 3104 if(mode & DERING) |
| 3105 { | |
| 169 | 3106 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, QP); |
| 156 | 3107 } |
| 3108 | |
| 3109 if((mode & TEMP_NOISE_FILTER)) | |
| 3110 { | |
| 169 | 3111 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
| 156 | 3112 tempBlured[isColor] + y*dstStride + x, |
| 158 | 3113 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), |
| 156 | 3114 ppMode->maxTmpNoise); |
| 3115 } | |
| 3116 | |
| 142 | 3117 /* did we use a tmp buffer for the last lines*/ |
| 112 | 3118 if(y+15 >= height) |
| 111 | 3119 { |
| 3120 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 130 | 3121 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3122 } |
| 163 | 3123 /* |
| 3124 for(x=0; x<width; x+=32) | |
| 3125 { | |
| 164 | 3126 volatile int i; |
| 163 | 3127 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
| 3128 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
| 164 | 3129 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 3130 // + dstBlock[x +13*dstStride] | |
| 3131 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
| 3132 }*/ | |
| 3133 } | |
| 96 | 3134 #ifdef HAVE_3DNOW |
| 3135 asm volatile("femms"); | |
| 3136 #elif defined (HAVE_MMX) | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3137 asm volatile("emms"); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3138 #endif |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3139 |
| 107 | 3140 #ifdef TIMING |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3141 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3142 sumTime= rdtsc() - sumTime; |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3143 if(!isColor) |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3144 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", |
|
97
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3145 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), |
|
e57b1d38d71f
bugfixes: last 3 lines not brightness/contrast corrected
michael
parents:
96
diff
changeset
|
3146 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3147 , black, white); |
|
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3148 #endif |
| 163 | 3149 #ifdef DEBUG_BRIGHTNESS |
| 3150 if(!isColor) | |
| 3151 { | |
| 3152 int max=1; | |
| 3153 int i; | |
| 3154 for(i=0; i<256; i++) | |
| 3155 if(yHistogram[i] > max) max=yHistogram[i]; | |
| 3156 | |
| 3157 for(i=1; i<256; i++) | |
| 3158 { | |
| 3159 int x; | |
| 3160 int start=yHistogram[i-1]/(max/256+1); | |
| 3161 int end=yHistogram[i]/(max/256+1); | |
| 3162 int inc= end > start ? 1 : -1; | |
| 3163 for(x=start; x!=end+inc; x+=inc) | |
| 3164 dst[ i*dstStride + x]+=128; | |
| 3165 } | |
| 3166 | |
| 3167 for(i=0; i<100; i+=2) | |
| 3168 { | |
| 3169 dst[ (white)*dstStride + i]+=128; | |
| 3170 dst[ (black)*dstStride + i]+=128; | |
| 3171 } | |
| 3172 | |
| 3173 } | |
| 3174 #endif | |
| 3175 | |
|
95
8bce253b537c
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
arpi
parents:
diff
changeset
|
3176 } |
