Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 95:8bce253b537c libavcodec
new postprocess code by Michael Niedermayer (michaelni@gmx.at)
| author | arpi |
|---|---|
| date | Wed, 10 Oct 2001 22:13:27 +0000 |
| parents | |
| children | 29ac11dc53d3 |
comparison
equal
deleted
inserted
replaced
| 94:7e263a256a6f | 95:8bce253b537c |
|---|---|
| 1 /* | |
| 2 Copyright (C) 2001 Michael Niedermayer (michaelni@gmx.at) | |
| 3 | |
| 4 This program is free software; you can redistribute it and/or modify | |
| 5 it under the terms of the GNU General Public License as published by | |
| 6 the Free Software Foundation; either version 2 of the License, or | |
| 7 (at your option) any later version. | |
| 8 | |
| 9 This program is distributed in the hope that it will be useful, | |
| 10 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 GNU General Public License for more details. | |
| 13 | |
| 14 You should have received a copy of the GNU General Public License | |
| 15 along with this program; if not, write to the Free Software | |
| 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
| 17 */ | |
| 18 | |
| 19 /* | |
| 20 C MMX MMX2 | |
| 21 isVertDC Ec Ec | |
| 22 isVertMinMaxOk Ec Ec | |
| 23 doVertLowPass E e | |
| 24 doVertDefFilter Ec Ec Ec | |
| 25 isHorizDC Ec Ec | |
| 26 isHorizMinMaxOk a | |
| 27 doHorizLowPass E a | |
| 28 doHorizDefFilter E a | |
| 29 deRing | |
| 30 | |
| 31 E = Exact implementation | |
| 32 e = almost exact implementation | |
| 33 a = alternative / approximate impl | |
| 34 c = checked against the other implementations (-vo md5) | |
| 35 */ | |
| 36 | |
| 37 /* | |
| 38 TODO: | |
| 39 verify that everything works as it should | |
| 40 reduce the time wasted on the mem transfer | |
| 41 implement dering | |
| 42 implement everything in C at least | |
| 43 figure range of QP out (assuming <256 for now) | |
| 44 unroll stuff if instructions depend too much on the prior one | |
| 45 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | |
| 46 move YScale thing to the end instead of fixing QP | |
| 47 ... | |
| 48 | |
| 49 Notes: | |
| 50 | |
| 51 */ | |
| 52 | |
| 53 | |
| 54 #include <inttypes.h> | |
| 55 #include <stdio.h> | |
| 56 #include "../config.h" | |
| 57 #include "postprocess.h" | |
| 58 //#undef HAVE_MMX2 | |
| 59 //#undef HAVE_MMX | |
| 60 | |
| 61 | |
| 62 | |
| 63 /* | |
| * Packed 64-bit constants, scratch slots and tunables. | |
| * The inline-assembly blocks in this file refer to several of these by symbol name | |
| * (b7E, b7C, b00, bFF, w05, w20, pQPb, temp0..temp3, tempBlock). | |
| * Naming, as far as the values show: | |
| * wNN - the 16-bit value 0xNN repeated in all four words | |
| * bNN - the byte value 0xNN repeated in all eight bytes | |
| * bmDDDDDDDD - byte mask; each digit stands for one byte, most significant byte first, | |
| * and a 1 means that byte is 0xFF | |
| */ static uint64_t packedYOffset= 0x0000000000000000LL; | |
| 64 static uint64_t packedYScale= 0x0100010001000100LL; | |
| 65 static uint64_t w05= 0x0005000500050005LL; | |
| 66 static uint64_t w20= 0x0020002000200020LL; | |
| 67 static uint64_t w1400= 0x1400140014001400LL; | |
| 68 static uint64_t bm00000001= 0x00000000000000FFLL; | |
| 69 static uint64_t bm00010000= 0x000000FF00000000LL; | |
| 70 static uint64_t bm00001000= 0x00000000FF000000LL; | |
| 71 static uint64_t bm10000000= 0xFF00000000000000LL; | |
| 72 static uint64_t bm10000001= 0xFF000000000000FFLL; | |
| 73 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; | |
| 74 static uint64_t bm00011000= 0x000000FFFF000000LL; | |
| 75 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; | |
| 76 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; | |
| 77 static uint64_t b00= 0x0000000000000000LL; | |
| 78 static uint64_t b02= 0x0202020202020202LL; | |
| 79 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; | |
| 80 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; | |
| 81 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; | |
| 82 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; | |
| 83 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; | |
| 84 /* temp0..temp3 are used as spill slots by the MMX code of doVertDefFilter */ static uint64_t temp0=0; | |
| 85 static uint64_t temp1=0; | |
| 86 static uint64_t temp2=0; | |
| 87 static uint64_t temp3=0; | |
| 88 static uint64_t temp4=0; | |
| 89 static uint64_t temp5=0; | |
| 90 /* read by the asm as "QP,..., QP"; presumably QP replicated in every byte - it is not written in this part of the file, TODO confirm */ static uint64_t pQPb=0; | |
| 91 /* scratch copy of the current block, filled by isHorizDCAndCopy2Temp */ static uint8_t tempBlock[16*16]; | |
| 92 | |
| 93 /* isHorizDCAndCopy2Temp reports "flat" when its count of near-equal neighbours exceeds this */ int hFlatnessThreshold= 56 - 16; | |
| 94 /* isVertDC reports "flat" when its count of near-equal neighbours exceeds this */ int vFlatnessThreshold= 56 - 16; | |
| 95 | |
| 96 // amount of "black" you are willing to lose to get a brightness-corrected picture | |
| 97 double maxClippedThreshold= 0.01; | |
| 98 | |
| 99 int maxAllowedY=255; | |
| 100 //FIXME can never make a movie's black brighter (does anyone need that?) | |
| 101 int minAllowedY=0; | |
| 102 | |
| 103 | |
/**
 * Read the CPU time-stamp counter with the x86 RDTSC instruction.
 *
 * RDTSC delivers the counter in EDX:EAX. The two halves are captured in
 * separate outputs and combined in C, because the "=A" constraint only
 * denotes the EDX:EAX pair on 32-bit x86; on x86-64 it would select a single
 * register and drop the upper half. __asm__/__volatile__ are used instead of
 * asm/volatile so the function also builds in strict ISO modes (-std=c99/c11).
 *
 * @return the 64-bit counter value
 */
static inline long long rdtsc(void)
{
	unsigned int lo;
	unsigned int hi;

	__asm__ __volatile__( "rdtsc\n\t"
		: "=a" (lo), "=d" (hi)
		);
	return (long long)(((unsigned long long)hi << 32) | lo);
}
| 113 | |
/**
 * Issue an x86 PREFETCHNTA (non-temporal) prefetch hint for the address p.
 *
 * The instruction only hints at a future access; nothing is read into or
 * written from program-visible state. __asm__/__volatile__ keep the function
 * buildable in strict ISO modes, and the pointer is const-qualified because
 * the pointed-to data is never modified.
 *
 * @param p address to prefetch
 */
static inline void prefetchnta(const void *p)
{
	__asm__ __volatile__( "prefetchnta (%0)\n\t"
		: : "r" (p)
		);
}
| 120 | |
/**
 * Issue an x86 PREFETCHT0 prefetch hint for the address p.
 *
 * The instruction only hints at a future access; nothing is read into or
 * written from program-visible state. __asm__/__volatile__ keep the function
 * buildable in strict ISO modes, and the pointer is const-qualified because
 * the pointed-to data is never modified.
 *
 * @param p address to prefetch
 */
static inline void prefetcht0(const void *p)
{
	__asm__ __volatile__( "prefetcht0 (%0)\n\t"
		: : "r" (p)
		);
}
| 127 | |
/**
 * Issue an x86 PREFETCHT1 prefetch hint for the address p.
 *
 * The instruction only hints at a future access; nothing is read into or
 * written from program-visible state. __asm__/__volatile__ keep the function
 * buildable in strict ISO modes, and the pointer is const-qualified because
 * the pointed-to data is never modified.
 *
 * @param p address to prefetch
 */
static inline void prefetcht1(const void *p)
{
	__asm__ __volatile__( "prefetcht1 (%0)\n\t"
		: : "r" (p)
		);
}
| 134 | |
/**
 * Issue an x86 PREFETCHT2 prefetch hint for the address p.
 *
 * The instruction only hints at a future access; nothing is read into or
 * written from program-visible state. __asm__/__volatile__ keep the function
 * buildable in strict ISO modes, and the pointer is const-qualified because
 * the pointed-to data is never modified.
 *
 * @param p address to prefetch
 */
static inline void prefetcht2(const void *p)
{
	__asm__ __volatile__( "prefetcht2 (%0)\n\t"
		: : "r" (p)
		);
}
| 141 | |
| 142 //FIXME? |255-0| = 1 (shouldn't be a problem ...) | |
| 143 /** | |
| 144 * Check if the middle 8x8 Block in the given 8x10 block is flat | |
| * | |
| * Counts vertically adjacent pixel pairs that are nearly equal, starting one row below src, | |
| * and reports whether the count exceeds vFlatnessThreshold. | |
| * - C path: a pair counts when the upper pixel minus the lower pixel is -1, 0 or +1; | |
| * BLOCK_SIZE-1 row pairs of 8 columns are examined. | |
| * - MMX path: 7 row pairs are examined with wrapping byte arithmetic (diff + 0x7E > 0x7C, | |
| * signed), so a 255 vs 0 pair also counts as nearly equal (see the FIXME above). | |
| * | |
| * NOTE(review): the MMX block advances %1, which is declared as an input operand, and | |
| * saves/restores it with pushl/popl; it declares no clobbers. Confirm this is safe with | |
| * the compiler in use. | |
| * | |
| * src: pointer to the row above the 8x8 block; stride: distance between rows in bytes. | |
| * Returns true when numEq > vFlatnessThreshold. | |
| 145 */ | |
| 146 static inline bool isVertDC(uint8_t src[], int stride){ | |
| 147 // return true; | |
| 148 int numEq= 0; | |
| 149 src+= stride; // src points to begin of the 8x8 Block | |
| 150 #ifdef HAVE_MMX | |
| 151 asm volatile( | |
| 152 // "int $3 \n\t" | |
| 153 "pushl %1\n\t" /* save the src pointer; it is advanced below and restored by popl */ | |
| 154 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte (b7E) | |
| 155 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte (b7C) | |
| 156 "movq (%1), %%mm0 \n\t" | |
| 157 "addl %2, %1 \n\t" | |
| 158 "movq (%1), %%mm1 \n\t" | |
| 159 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | |
| 160 "paddb %%mm7, %%mm0 \n\t" | |
| 161 "pcmpgtb %%mm6, %%mm0 \n\t" /* byte becomes 0xFF where the difference is -1, 0 or +1 */ | |
| 162 | |
| 163 "addl %2, %1 \n\t" | |
| 164 "movq (%1), %%mm2 \n\t" | |
| 165 "psubb %%mm2, %%mm1 \n\t" | |
| 166 "paddb %%mm7, %%mm1 \n\t" | |
| 167 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 168 "paddb %%mm1, %%mm0 \n\t" /* accumulate the per-column match masks in mm0 */ | |
| 169 | |
| 170 "addl %2, %1 \n\t" | |
| 171 "movq (%1), %%mm1 \n\t" | |
| 172 "psubb %%mm1, %%mm2 \n\t" | |
| 173 "paddb %%mm7, %%mm2 \n\t" | |
| 174 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 175 "paddb %%mm2, %%mm0 \n\t" | |
| 176 | |
| 177 "addl %2, %1 \n\t" | |
| 178 "movq (%1), %%mm2 \n\t" | |
| 179 "psubb %%mm2, %%mm1 \n\t" | |
| 180 "paddb %%mm7, %%mm1 \n\t" | |
| 181 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 182 "paddb %%mm1, %%mm0 \n\t" | |
| 183 | |
| 184 "addl %2, %1 \n\t" | |
| 185 "movq (%1), %%mm1 \n\t" | |
| 186 "psubb %%mm1, %%mm2 \n\t" | |
| 187 "paddb %%mm7, %%mm2 \n\t" | |
| 188 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 189 "paddb %%mm2, %%mm0 \n\t" | |
| 190 | |
| 191 "addl %2, %1 \n\t" | |
| 192 "movq (%1), %%mm2 \n\t" | |
| 193 "psubb %%mm2, %%mm1 \n\t" | |
| 194 "paddb %%mm7, %%mm1 \n\t" | |
| 195 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 196 "paddb %%mm1, %%mm0 \n\t" | |
| 197 | |
| 198 "addl %2, %1 \n\t" | |
| 199 "movq (%1), %%mm1 \n\t" | |
| 200 "psubb %%mm1, %%mm2 \n\t" | |
| 201 "paddb %%mm7, %%mm2 \n\t" | |
| 202 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 203 "paddb %%mm2, %%mm0 \n\t" | |
| 204 | |
| 205 " \n\t" | |
| 206 "movq %%mm0, %%mm1 \n\t" /* from here: fold the eight per-column byte sums into the low byte of mm0 */ | |
| 207 "psrlw $8, %%mm0 \n\t" | |
| 208 "paddb %%mm1, %%mm0 \n\t" | |
| 209 "movq %%mm0, %%mm1 \n\t" | |
| 210 "psrlq $16, %%mm0 \n\t" | |
| 211 "paddb %%mm1, %%mm0 \n\t" | |
| 212 "movq %%mm0, %%mm1 \n\t" | |
| 213 "psrlq $32, %%mm0 \n\t" | |
| 214 "paddb %%mm1, %%mm0 \n\t" | |
| 215 "popl %1\n\t" | |
| 216 "movd %%mm0, %0 \n\t" | |
| 217 : "=r" (numEq) | |
| 218 : "r" (src), "r" (stride) | |
| 219 ); | |
| 220 // printf("%d\n", numEq); | |
| 221 /* every match added 0xFF (-1) to the byte sum, so negate the low byte to get the count */ numEq= (256 - (numEq & 0xFF)) &0xFF; | |
| 222 | |
| 223 // int asmEq= numEq; | |
| 224 // numEq=0; | |
| 225 // uint8_t *temp= src; | |
| 226 | |
| 227 #else | |
| 228 for(int y=0; y<BLOCK_SIZE-1; y++) | |
| 229 { | |
| 230 /* (a - b + 1) is 0, 1 or 2 exactly when a - b is -1, 0 or +1; negative values become large after the mask */ if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 231 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 232 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 233 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 234 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 235 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 236 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 237 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 238 src+= stride; | |
| 239 } | |
| 240 #endif | |
| 241 /* if(abs(numEq - asmEq) > 0) | |
| 242 { | |
| 243 printf("\nasm:%d c:%d\n", asmEq, numEq); | |
| 244 for(int y=0; y<8; y++) | |
| 245 { | |
| 246 for(int x=0; x<8; x++) | |
| 247 { | |
| 248 printf("%d ", temp[x + y*stride]); | |
| 249 } | |
| 250 printf("\n"); | |
| 251 } | |
| 252 } | |
| 253 */ | |
| 254 return numEq > vFlatnessThreshold; | |
| 255 } | |
| 256 | |
| 257 /** | |
| * Check that the row at src + stride and the row at src + 8*stride differ by at most 2*QP. | |
| * - MMX path: works on 8 bytes at once and takes the threshold from the global pQPb | |
| * (doubled with saturation); the QP argument is not used by the asm. The result is | |
| * the low 32 bits of mm0, i.e. all-ones or zero. | |
| * - C path: returns false as soon as any of the BLOCK_SIZE columns has an absolute | |
| * difference greater than 2*QP (the loop keeps running, only the flag is cleared). | |
| * src: top row of the 8x10 block; stride: distance between rows in bytes. | |
| */ static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) | |
| 258 { | |
| 259 #ifdef HAVE_MMX | |
| 260 int isOk; | |
| 261 asm volatile( | |
| 262 // "int $3 \n\t" | |
| 263 "movq (%1, %2), %%mm0 \n\t" | |
| 264 "movq (%1, %2, 8), %%mm1 \n\t" | |
| 265 "movq %%mm0, %%mm2 \n\t" | |
| 266 "psubusb %%mm1, %%mm0 \n\t" | |
| 267 "psubusb %%mm2, %%mm1 \n\t" | |
| 268 "por %%mm1, %%mm0 \n\t" // ABS Diff | |
| 269 | |
| 270 "movq pQPb, %%mm7 \n\t" // QP,..., QP | |
| 271 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
| 272 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |
| 273 "pcmpeqd b00, %%mm0 \n\t" /* each 32-bit half becomes all-ones if all four of its bytes passed */ | |
| 274 "psrlq $16, %%mm0 \n\t" /* bring 16 bits of each half into the low 32 bits */ | |
| 275 "pcmpeqd bFF, %%mm0 \n\t" /* low 32 bits are all-ones only if both halves passed */ | |
| 276 // "movd %%mm0, (%1, %2, 4)\n\t" | |
| 277 "movd %%mm0, %0 \n\t" | |
| 278 : "=r" (isOk) | |
| 279 : "r" (src), "r" (stride) | |
| 280 ); | |
| 281 return isOk; | |
| 282 #else | |
| 283 | |
| 284 int isOk2= true; | |
| 285 for(int x=0; x<BLOCK_SIZE; x++) | |
| 286 { | |
| 287 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false; | |
| 288 } | |
| 289 /* if(isOk && !isOk2 || !isOk && isOk2) | |
| 290 { | |
| 291 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); | |
| 292 for(int y=0; y<9; y++) | |
| 293 { | |
| 294 for(int x=0; x<8; x++) | |
| 295 { | |
| 296 printf("%d ", src[x + y*stride]); | |
| 297 } | |
| 298 printf("\n"); | |
| 299 } | |
| 300 } */ | |
| 301 | |
| 302 return isOk2; | |
| 303 #endif | |
| 304 | |
| 305 } | |
| 306 | |
| 307 /** | |
| 308 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle) | |
| 309 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | |
| * | |
| * src points to the top row of the 8x10 block; rows src+stride .. src+8*stride are rewritten in place. | |
| * - MMX2 path: processes all 8 columns at once, building the weighted sums from chained | |
| * pavgb (rounding byte average) operations; the threshold comes from the global pQPb, | |
| * the QP argument is not used by the asm. eax and ebx are clobbered. | |
| * - C path: one column per loop iteration using the partial sums in sums[]. | |
| * The row above (below) the block is only used as the outer tap when it is close to the | |
| * first (last) row of the block, otherwise that first (last) row is repeated. | |
| * | |
| * NOTE(review): the MMX2 comments describe the closeness test as diff <= QP, the C path | |
| * tests ABS(...) < QP - confirm which is intended. | |
| * NOTE(review): in the C path, binary + binds tighter than <<, so an expression such as | |
| * (first + sums[0] + sums[3]<<1) is evaluated as (first + sums[0] + sums[3])<<1. | |
| * NOTE(review): the asm modifies %0, an input operand, and restores it with pushl/popl. | |
| 310 */ | |
| 311 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | |
| 312 { | |
| 313 // QP= 64; | |
| 314 | |
| 315 #ifdef HAVE_MMX2 | |
| 316 asm volatile( //"movv %0 %1 %2\n\t" | |
| 317 "pushl %0 \n\t" | |
| 318 "movq pQPb, %%mm0 \n\t" // QP,..., QP | |
| 319 // "movq bFF , %%mm0 \n\t" // QP,..., QP | |
| 320 | |
| 321 "movq (%0), %%mm6 \n\t" | |
| 322 "movq (%0, %1), %%mm5 \n\t" | |
| 323 "movq %%mm5, %%mm1 \n\t" | |
| 324 "movq %%mm6, %%mm2 \n\t" | |
| 325 "psubusb %%mm6, %%mm5 \n\t" | |
| 326 "psubusb %%mm1, %%mm2 \n\t" | |
| 327 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 328 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
| 329 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | |
| 330 | |
| 331 "pand %%mm2, %%mm6 \n\t" | |
| 332 "pandn %%mm1, %%mm2 \n\t" | |
| 333 "por %%mm2, %%mm6 \n\t"// First Line to Filter | |
| 334 | |
| 335 "movq (%0, %1, 8), %%mm5 \n\t" | |
| 336 "leal (%0, %1, 4), %%eax \n\t" | |
| 337 "leal (%0, %1, 8), %%ebx \n\t" | |
| 338 "subl %1, %%ebx \n\t" | |
| 339 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | |
| 340 "movq (%0, %1, 8), %%mm7 \n\t" | |
| 341 "movq %%mm5, %%mm1 \n\t" | |
| 342 "movq %%mm7, %%mm2 \n\t" | |
| 343 "psubusb %%mm7, %%mm5 \n\t" | |
| 344 "psubusb %%mm1, %%mm2 \n\t" | |
| 345 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 346 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
| 347 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | |
| 348 | |
| 349 "pand %%mm2, %%mm7 \n\t" | |
| 350 "pandn %%mm1, %%mm2 \n\t" | |
| 351 "por %%mm2, %%mm7 \n\t" // Last Line to Filter | |
| 352 | |
| 353 | |
| 354 // 1 2 3 4 5 6 7 8 | |
| 355 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 | |
| 356 // 6 4 2 2 1 1 | |
| 357 // 6 4 4 2 | |
| 358 // 6 8 2 | |
| 359 /* | |
| 360 "movq %%mm6, %%mm2 \n\t" //1 | |
| 361 "movq %%mm6, %%mm3 \n\t" //1 | |
| 362 "paddusb b02, %%mm3 \n\t" | |
| 363 "psrlw $2, %%mm3 \n\t" //1 /4 | |
| 364 "pand b3F, %%mm3 \n\t" | |
| 365 "psubb %%mm3, %%mm2 \n\t" | |
| 366 "movq (%0, %1), %%mm0 \n\t" // 1 | |
| 367 "movq %%mm0, %%mm1 \n\t" // 1 | |
| 368 "paddusb b02, %%mm0 \n\t" | |
| 369 "psrlw $2, %%mm0 \n\t" // 1 /4 | |
| 370 "pand b3F, %%mm0 \n\t" | |
| 371 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 | |
| 372 */ | |
| 373 "movq (%0, %1), %%mm0 \n\t" // 1 | |
| 374 "movq %%mm0, %%mm1 \n\t" // 1 | |
| 375 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | |
| 376 "pavgb %%mm6, %%mm0 \n\t" //3 1 /4 | |
| 377 | |
| 378 "movq (%0, %1, 4), %%mm2 \n\t" // 1 | |
| 379 "movq %%mm2, %%mm5 \n\t" // 1 | |
| 380 "pavgb (%%eax), %%mm2 \n\t" // 11 /2 | |
| 381 "pavgb (%0, %1, 2), %%mm2 \n\t" // 211 /4 | |
| 382 "movq %%mm2, %%mm3 \n\t" // 211 /4 | |
| 383 "movq (%0), %%mm4 \n\t" // 1 | |
| 384 "pavgb %%mm4, %%mm3 \n\t" // 4 211 /8 | |
| 385 "pavgb %%mm0, %%mm3 \n\t" //642211 /16 | |
| 386 "movq %%mm3, (%0) \n\t" // X | |
| 387 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | |
| 388 "movq %%mm1, %%mm0 \n\t" // 1 | |
| 389 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | |
| 390 "movq %%mm4, %%mm3 \n\t" // 1 | |
| 391 "pavgb (%0,%1,2), %%mm3 \n\t" // 1 1 /2 | |
| 392 "pavgb (%%eax,%1,2), %%mm5 \n\t" // 11 /2 | |
| 393 "pavgb (%%eax), %%mm5 \n\t" // 211 /4 | |
| 394 "pavgb %%mm5, %%mm3 \n\t" // 2 2211 /8 | |
| 395 "pavgb %%mm0, %%mm3 \n\t" //4242211 /16 | |
| 396 "movq %%mm3, (%0,%1) \n\t" // X | |
| 397 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | |
| 398 "pavgb %%mm4, %%mm6 \n\t" //11 /2 | |
| 399 "movq (%%ebx), %%mm0 \n\t" // 1 | |
| 400 "pavgb (%%eax, %1, 2), %%mm0 \n\t" // 11/2 | |
| 401 "movq %%mm0, %%mm3 \n\t" // 11/2 | |
| 402 "pavgb %%mm1, %%mm0 \n\t" // 2 11/4 | |
| 403 "pavgb %%mm6, %%mm0 \n\t" //222 11/8 | |
| 404 "pavgb %%mm2, %%mm0 \n\t" //22242211/16 | |
| 405 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | |
| 406 "movq %%mm0, (%0, %1, 2) \n\t" // X | |
| 407 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | |
| 408 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | |
| 409 "pavgb (%%ebx), %%mm0 \n\t" // 11 /2 | |
| 410 "pavgb %%mm0, %%mm6 \n\t" //11 11 /4 | |
| 411 "pavgb %%mm1, %%mm4 \n\t" // 11 /2 | |
| 412 "pavgb %%mm2, %%mm1 \n\t" // 11 /2 | |
| 413 "pavgb %%mm1, %%mm6 \n\t" //1122 11 /8 | |
| 414 "pavgb %%mm5, %%mm6 \n\t" //112242211 /16 | |
| 415 "movq (%%eax), %%mm5 \n\t" // 1 | |
| 416 "movq %%mm6, (%%eax) \n\t" // X | |
| 417 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | |
| 418 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 | |
| 419 "pavgb %%mm7, %%mm6 \n\t" // 11 /2 | |
| 420 "pavgb %%mm4, %%mm6 \n\t" // 11 11 /4 | |
| 421 "pavgb %%mm3, %%mm6 \n\t" // 11 2211 /8 | |
| 422 "pavgb %%mm5, %%mm2 \n\t" // 11 /2 | |
| 423 "movq (%0, %1, 4), %%mm4 \n\t" // 1 | |
| 424 "pavgb %%mm4, %%mm2 \n\t" // 112 /4 | |
| 425 "pavgb %%mm2, %%mm6 \n\t" // 112242211 /16 | |
| 426 "movq %%mm6, (%0, %1, 4) \n\t" // X | |
| 427 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | |
| 428 "pavgb %%mm7, %%mm1 \n\t" // 11 2 /4 | |
| 429 "pavgb %%mm4, %%mm5 \n\t" // 11 /2 | |
| 430 "pavgb %%mm5, %%mm0 \n\t" // 11 11 /4 | |
| 431 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | |
| 432 "pavgb %%mm6, %%mm1 \n\t" // 11 4 2 /8 | |
| 433 "pavgb %%mm0, %%mm1 \n\t" // 11224222 /16 | |
| 434 // "pxor %%mm1, %%mm1 \n\t" | |
| 435 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | |
| 436 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | |
| 437 "pavgb (%%ebx), %%mm2 \n\t" // 112 4 /8 | |
| 438 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | |
| 439 "pavgb %%mm0, %%mm6 \n\t" // 1 1 /2 | |
| 440 "pavgb %%mm7, %%mm6 \n\t" // 1 12 /4 | |
| 441 "pavgb %%mm2, %%mm6 \n\t" // 1122424 /4 | |
| 442 // "pxor %%mm6, %%mm6 \n\t" | |
| 443 "movq %%mm6, (%%ebx) \n\t" // X | |
| 444 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | |
| 445 "pavgb %%mm7, %%mm5 \n\t" // 11 2 /4 | |
| 446 "pavgb %%mm7, %%mm5 \n\t" // 11 6 /8 | |
| 447 | |
| 448 "pavgb %%mm3, %%mm0 \n\t" // 112 /4 | |
| 449 "pavgb %%mm0, %%mm5 \n\t" // 112246 /16 | |
| 450 // "pxor %%mm5, %%mm5 \n\t" | |
| 451 // "movq pQPb, %%mm5 \n\t" | |
| 452 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | |
| 453 "popl %0\n\t" | |
| 454 | |
| 455 : | |
| 456 : "r" (src), "r" (stride) | |
| 457 : "%eax", "%ebx" | |
| 458 ); | |
| 459 | |
| 460 #else | |
| 461 /* l1..l9 are the byte offsets of rows 1..9 relative to src */ const int l1= stride; | |
| 462 const int l2= stride + l1; | |
| 463 const int l3= stride + l2; | |
| 464 const int l4= stride + l3; | |
| 465 const int l5= stride + l4; | |
| 466 const int l6= stride + l5; | |
| 467 const int l7= stride + l6; | |
| 468 const int l8= stride + l7; | |
| 469 const int l9= stride + l8; | |
| 470 | |
| 471 for(int x=0; x<BLOCK_SIZE; x++) | |
| 472 { | |
| 473 /* outer tap above the block: row 0 if it is within QP of row 1, else row 1 itself */ const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | |
| 474 /* outer tap below the block: row 9 if it is within QP of row 8, else row 8 itself */ const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | |
| 475 | |
| 476 /* sums[i] is the sum of two vertically adjacent input pixels, taken before any row is overwritten */ int sums[9]; | |
| 477 sums[0] = first + src[l1]; | |
| 478 sums[1] = src[l1] + src[l2]; | |
| 479 sums[2] = src[l2] + src[l3]; | |
| 480 sums[3] = src[l3] + src[l4]; | |
| 481 sums[4] = src[l4] + src[l5]; | |
| 482 sums[5] = src[l5] + src[l6]; | |
| 483 sums[6] = src[l6] + src[l7]; | |
| 484 sums[7] = src[l7] + src[l8]; | |
| 485 sums[8] = src[l8] + last; | |
| 486 | |
| 487 /* each output is a weighted sum plus 8 for rounding, divided by 16 */ src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | |
| 488 src[l2]= ((src[l2]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | |
| 489 src[l3]= ((src[l3]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4; | |
| 490 src[l4]= ((src[l4]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4; | |
| 491 src[l5]= ((src[l5]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4; | |
| 492 src[l6]= ((src[l6]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4; | |
| 493 src[l7]= ((last + src[l7]<<2) + (src[l8] + sums[5]<<1) + sums[3] + 8)>>4; | |
| 494 src[l8]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | |
| 495 | |
| 496 src++; | |
| 497 } | |
| 498 | |
| 499 #endif | |
| 500 } | |
| 501 | |
| 502 /** | |
| * Default vertical deblocking filter for an 8x10 block: adjusts only the two rows on | |
| * either side of the horizontal block edge (rows 4 and 5 counted from src). | |
| * | |
| * For each column the C path computes | |
| * middleEnergy = 5*(row5 - row4) + 2*(row3 - row6) | |
| * and, only when ABS(middleEnergy) < 8*QP, a correction | |
| * d = max(ABS(middleEnergy) - min(ABS(leftEnergy), ABS(rightEnergy)), 0), | |
| * scaled by (5*d + 32) >> 6, signed opposite to middleEnergy and limited to the range | |
| * between 0 and q = (row4 - row5)/2; then row4 -= d and row5 += d. | |
| * | |
| * The MMX path first advances src by one row, so its "line 0..7" are rows 1..8 of the | |
| * C path; it processes the low and high four columns as 16-bit words (L*/H* in the | |
| * comments), spills intermediate results to the globals temp0..temp3 and clobbers | |
| * eax and ebx. With HAVE_MMX2, pminsw replaces the psubusw/psubw minimum emulation. | |
| * | |
| * src: top row of the 8x10 block; stride: distance between rows in bytes; QP: quantiser, | |
| * scales the threshold. | |
| */ static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | |
| 503 { | |
| 504 #ifdef HAVE_MMX | |
| 505 src+= stride; | |
| 506 //FIXME try pmul for *5 stuff | |
| 507 // src[0]=0; | |
| 508 asm volatile( | |
| 509 "pxor %%mm7, %%mm7 \n\t" | |
| 510 "leal (%0, %1), %%eax \n\t" | |
| 511 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 512 // 0 1 2 3 4 5 6 7 | |
| 513 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ebx+%1 ebx+2%1 | |
| 514 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 | |
| 515 | |
| 516 "movq (%0), %%mm0 \n\t" | |
| 517 "movq %%mm0, %%mm1 \n\t" | |
| 518 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | |
| 519 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | |
| 520 | |
| 521 "movq (%%eax), %%mm2 \n\t" | |
| 522 "movq %%mm2, %%mm3 \n\t" | |
| 523 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | |
| 524 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | |
| 525 | |
| 526 "movq (%%eax, %1), %%mm4 \n\t" | |
| 527 "movq %%mm4, %%mm5 \n\t" | |
| 528 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | |
| 529 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | |
| 530 | |
| 531 "paddw %%mm0, %%mm0 \n\t" // 2L0 | |
| 532 "paddw %%mm1, %%mm1 \n\t" // 2H0 | |
| 533 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 | |
| 534 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 | |
| 535 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 | |
| 536 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 | |
| 537 | |
| 538 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | |
| 539 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | |
| 540 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | |
| 541 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | |
| 542 | |
| 543 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 544 "movq %%mm2, %%mm3 \n\t" | |
| 545 "punpcklbw %%mm7, %%mm2 \n\t" // L3 | |
| 546 "punpckhbw %%mm7, %%mm3 \n\t" // H3 | |
| 547 | |
| 548 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | |
| 549 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | |
| 550 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 551 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 552 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 553 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 554 | |
| 555 "movq (%0, %1, 4), %%mm0 \n\t" | |
| 556 "movq %%mm0, %%mm1 \n\t" | |
| 557 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | |
| 558 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | |
| 559 | |
| 560 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | |
| 561 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | |
| 562 "movq %%mm2, temp2 \n\t" // L3 - L4 | |
| 563 "movq %%mm3, temp3 \n\t" // H3 - H4 | |
| 564 "paddw %%mm4, %%mm4 \n\t" // 2L2 | |
| 565 "paddw %%mm5, %%mm5 \n\t" // 2H2 | |
| 566 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | |
| 567 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | |
| 568 | |
| 569 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | |
| 570 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | |
| 571 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | |
| 572 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | |
| 573 //50 opcodes so far | |
| 574 "movq (%%ebx), %%mm2 \n\t" | |
| 575 "movq %%mm2, %%mm3 \n\t" | |
| 576 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | |
| 577 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | |
| 578 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | |
| 579 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | |
| 580 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | |
| 581 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | |
| 582 | |
| 583 "movq (%%ebx, %1), %%mm6 \n\t" | |
| 584 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | |
| 585 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | |
| 586 "movq (%%ebx, %1), %%mm6 \n\t" | |
| 587 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | |
| 588 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | |
| 589 | |
| 590 "paddw %%mm0, %%mm0 \n\t" // 2L4 | |
| 591 "paddw %%mm1, %%mm1 \n\t" // 2H4 | |
| 592 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 | |
| 593 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 | |
| 594 | |
| 595 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | |
| 596 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | |
| 597 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | |
| 598 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | |
| 599 | |
| 600 "movq (%%ebx, %1, 2), %%mm2 \n\t" | |
| 601 "movq %%mm2, %%mm3 \n\t" | |
| 602 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | |
| 603 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | |
| 604 | |
| 605 "paddw %%mm2, %%mm2 \n\t" // 2L7 | |
| 606 "paddw %%mm3, %%mm3 \n\t" // 2H7 | |
| 607 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | |
| 608 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | |
| 609 | |
| 610 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 611 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 612 //FIXME pxor, psubw, pmax for abs | |
| 613 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 614 "pcmpgtw %%mm0, %%mm6 \n\t" | |
| 615 "pxor %%mm6, %%mm0 \n\t" | |
| 616 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 617 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 618 "pcmpgtw %%mm1, %%mm6 \n\t" | |
| 619 "pxor %%mm6, %%mm1 \n\t" | |
| 620 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 621 | |
| 622 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 623 "pcmpgtw %%mm2, %%mm6 \n\t" | |
| 624 "pxor %%mm6, %%mm2 \n\t" | |
| 625 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 626 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 627 "pcmpgtw %%mm3, %%mm6 \n\t" | |
| 628 "pxor %%mm6, %%mm3 \n\t" | |
| 629 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 630 | |
| 631 #ifdef HAVE_MMX2 | |
| 632 "pminsw %%mm2, %%mm0 \n\t" | |
| 633 "pminsw %%mm3, %%mm1 \n\t" | |
| 634 #else | |
| 635 "movq %%mm0, %%mm6 \n\t" | |
| 636 "psubusw %%mm2, %%mm6 \n\t" | |
| 637 "psubw %%mm6, %%mm0 \n\t" | |
| 638 "movq %%mm1, %%mm6 \n\t" | |
| 639 "psubusw %%mm3, %%mm6 \n\t" | |
| 640 "psubw %%mm6, %%mm1 \n\t" | |
| 641 #endif | |
| 642 | |
| 643 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 644 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | |
| 645 "pxor %%mm6, %%mm4 \n\t" | |
| 646 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | |
| 647 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | |
| 648 "pxor %%mm7, %%mm5 \n\t" | |
| 649 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | |
| 650 // 100 opcodes | |
| 651 "movd %2, %%mm2 \n\t" // QP | |
| 652 //"pcmpeqb %%mm2, %%mm2\n\t" | |
| 653 "punpcklwd %%mm2, %%mm2 \n\t" | |
| 654 "punpcklwd %%mm2, %%mm2 \n\t" | |
| 655 "psllw $3, %%mm2 \n\t" // 8QP | |
| 656 "movq %%mm2, %%mm3 \n\t" // 8QP | |
| 657 "pcmpgtw %%mm4, %%mm2 \n\t" | |
| 658 "pcmpgtw %%mm5, %%mm3 \n\t" | |
| 659 "pand %%mm2, %%mm4 \n\t" | |
| 660 "pand %%mm3, %%mm5 \n\t" | |
| 661 | |
| 662 | |
| 663 "psubusw %%mm0, %%mm4 \n\t" // hd | |
| 664 "psubusw %%mm1, %%mm5 \n\t" // ld | |
| 665 | |
| 666 | |
| 667 "movq w05, %%mm2 \n\t" // 5 | |
| 668 "pmullw %%mm2, %%mm4 \n\t" | |
| 669 "pmullw %%mm2, %%mm5 \n\t" | |
| 670 "movq w20, %%mm2 \n\t" // 32 | |
| 671 "paddw %%mm2, %%mm4 \n\t" | |
| 672 "paddw %%mm2, %%mm5 \n\t" | |
| 673 "psrlw $6, %%mm4 \n\t" | |
| 674 "psrlw $6, %%mm5 \n\t" | |
| 675 | |
| 676 /* | |
| 677 "movq w06, %%mm2 \n\t" // 6 | |
| 678 "paddw %%mm2, %%mm4 \n\t" | |
| 679 "paddw %%mm2, %%mm5 \n\t" | |
| 680 "movq w1400, %%mm2 \n\t" // 1400h = 5120 = 5/64*2^16 | |
| 681 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 | |
| 682 "pmulhw %%mm2, %%mm4 \n\t" // hd/13 | |
| 683 "pmulhw %%mm2, %%mm5 \n\t" // ld/13 | |
| 684 */ | |
| 685 | |
| 686 "movq temp2, %%mm0 \n\t" // L3 - L4 | |
| 687 "movq temp3, %%mm1 \n\t" // H3 - H4 | |
| 688 | |
| 689 "pxor %%mm2, %%mm2 \n\t" | |
| 690 "pxor %%mm3, %%mm3 \n\t" | |
| 691 | |
| 692 // FIXME rounding error | |
| 693 "psraw $1, %%mm0 \n\t" // (L3 - L4)/2 | |
| 694 "psraw $1, %%mm1 \n\t" // (H3 - H4)/2 | |
| 695 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | |
| 696 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) | |
| 697 "pxor %%mm2, %%mm0 \n\t" | |
| 698 "pxor %%mm3, %%mm1 \n\t" | |
| 699 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| | |
| 700 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| | |
| 701 // "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 | |
| 702 // "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 | |
| 703 | |
| 704 "pxor %%mm6, %%mm2 \n\t" | |
| 705 "pxor %%mm7, %%mm3 \n\t" | |
| 706 "pand %%mm2, %%mm4 \n\t" | |
| 707 "pand %%mm3, %%mm5 \n\t" | |
| 708 | |
| 709 #ifdef HAVE_MMX2 | |
| 710 "pminsw %%mm0, %%mm4 \n\t" | |
| 711 "pminsw %%mm1, %%mm5 \n\t" | |
| 712 #else | |
| 713 "movq %%mm4, %%mm2 \n\t" | |
| 714 "psubusw %%mm0, %%mm2 \n\t" | |
| 715 "psubw %%mm2, %%mm4 \n\t" | |
| 716 "movq %%mm5, %%mm2 \n\t" | |
| 717 "psubusw %%mm1, %%mm2 \n\t" | |
| 718 "psubw %%mm2, %%mm5 \n\t" | |
| 719 #endif | |
| 720 "pxor %%mm6, %%mm4 \n\t" | |
| 721 "pxor %%mm7, %%mm5 \n\t" | |
| 722 "psubw %%mm6, %%mm4 \n\t" | |
| 723 "psubw %%mm7, %%mm5 \n\t" | |
| 724 "packsswb %%mm5, %%mm4 \n\t" | |
| 725 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 726 "paddb %%mm4, %%mm0 \n\t" | |
| 727 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 728 "movq (%0, %1, 4), %%mm0 \n\t" | |
| 729 "psubb %%mm4, %%mm0 \n\t" | |
| 730 // "pxor %%mm0, %%mm0 \n\t" | |
| 731 "movq %%mm0, (%0, %1, 4) \n\t" | |
| 732 | |
| 733 : | |
| 734 : "r" (src), "r" (stride), "r" (QP) | |
| 735 : "%eax", "%ebx" | |
| 736 ); | |
| 737 #else | |
| 738 /* l1..l8 are the byte offsets of rows 1..8 relative to src */ const int l1= stride; | |
| 739 const int l2= stride + l1; | |
| 740 const int l3= stride + l2; | |
| 741 const int l4= stride + l3; | |
| 742 const int l5= stride + l4; | |
| 743 const int l6= stride + l5; | |
| 744 const int l7= stride + l6; | |
| 745 const int l8= stride + l7; | |
| 746 // const int l9= stride + l8; | |
| 747 | |
| 748 for(int x=0; x<BLOCK_SIZE; x++) | |
| 749 { | |
| 750 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | |
| 751 /* columns whose edge step is at or above the 8*QP threshold are left untouched */ if(ABS(middleEnergy) < 8*QP) | |
| 752 { | |
| 753 /* q bounds the correction: half the step across the edge, keeping its sign */ const int q=(src[l4] - src[l5])/2; | |
| 754 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); | |
| 755 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); | |
| 756 | |
| 757 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 758 d= MAX(d, 0); | |
| 759 | |
| 760 /* scale by 5/64 with rounding */ d= (5*d + 32) >> 6; | |
| 761 d*= SIGN(-middleEnergy); | |
| 762 | |
| 763 /* clamp d to the range between 0 and q, whichever sign q has */ if(q>0) | |
| 764 { | |
| 765 d= d<0 ? 0 : d; | |
| 766 d= d>q ? q : d; | |
| 767 } | |
| 768 else | |
| 769 { | |
| 770 d= d>0 ? 0 : d; | |
| 771 d= d<q ? q : d; | |
| 772 } | |
| 773 | |
| 774 src[l4]-= d; | |
| 775 src[l5]+= d; | |
| 776 } | |
| 777 src++; | |
| 778 } | |
| 779 #endif | |
| 780 } | |
| 781 | |
| 782 //FIXME? |255-0| = 1 | |
| 783 /** | |
| 784 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. | |
| * | |
| * For each row, horizontally adjacent pixels are compared and the nearly equal pairs are | |
| * counted; the row is also copied into the global tempBlock. Returns true when the count | |
| * exceeds hFlatnessThreshold. | |
| * - C path: 7 neighbour pairs per row over BLOCK_SIZE rows; a pair counts when the left | |
| * pixel minus the right pixel is -1, 0 or +1. Rows are stored TEMP_STRIDE bytes apart. | |
| * - MMX path: each row is assembled from two loads at offsets -4 and +4 (so 4 bytes before | |
| * src are read), compared against itself shifted by one byte using the same wrapping | |
| * test as isVertDC, and stored at tempBlock + 0, 8, ..., 56. eax is clobbered. | |
| * | |
| * NOTE(review): the MMX path stores rows 8 bytes apart while the C path uses TEMP_STRIDE | |
| * (defined outside this part of the file) - confirm the two agree. | |
| * NOTE(review): the asm advances %1, an input operand, and restores it with pushl/popl. | |
| * | |
| * src: first pixel of the 8x8 block; stride: distance between rows in bytes. | |
| 785 */ | |
| 786 static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride) | |
| 787 { | |
| 788 // src++; | |
| 789 int numEq= 0; | |
| 790 #ifdef HAVE_MMX | |
| 791 asm volatile ( | |
| 792 // "int $3 \n\t" | |
| 793 "pushl %1\n\t" | |
| 794 "movq b7E, %%mm7 \n\t" // mm7 = 0x7E in every byte (b7E) | |
| 795 "movq b7C, %%mm6 \n\t" // mm6 = 0x7C in every byte (b7C) | |
| 796 "leal tempBlock, %%eax \n\t" | |
| 797 "pxor %%mm0, %%mm0 \n\t" | |
| 798 | |
| 799 #define HDC_CHECK_AND_CPY(i) \ | |
| 800 "movq -4(%1), %%mm2 \n\t"\ | |
| 801 "psrlq $32, %%mm2 \n\t"\ | |
| 802 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\ | |
| 803 "movq %%mm2, %%mm1 \n\t"\ | |
| 804 "psrlq $8, %%mm2 \n\t"\ | |
| 805 "psubb %%mm1, %%mm2 \n\t"\ | |
| 806 "paddb %%mm7, %%mm2 \n\t"\ | |
| 807 "pcmpgtb %%mm6, %%mm2 \n\t"\ | |
| 808 "paddb %%mm2, %%mm0 \n\t"\ | |
| 809 "movq %%mm1," #i "(%%eax) \n\t" | |
| 810 | |
| 811 HDC_CHECK_AND_CPY(0) | |
| 812 "addl %2, %1 \n\t" | |
| 813 HDC_CHECK_AND_CPY(8) | |
| 814 "addl %2, %1 \n\t" | |
| 815 HDC_CHECK_AND_CPY(16) | |
| 816 "addl %2, %1 \n\t" | |
| 817 HDC_CHECK_AND_CPY(24) | |
| 818 "addl %2, %1 \n\t" | |
| 819 HDC_CHECK_AND_CPY(32) | |
| 820 "addl %2, %1 \n\t" | |
| 821 HDC_CHECK_AND_CPY(40) | |
| 822 "addl %2, %1 \n\t" | |
| 823 HDC_CHECK_AND_CPY(48) | |
| 824 "addl %2, %1 \n\t" | |
| 825 HDC_CHECK_AND_CPY(56) | |
| 826 | |
| 827 "psllq $8, %%mm0 \n\t" // remove dummy value | |
| 828 "movq %%mm0, %%mm1 \n\t" /* from here: fold the per-column byte sums into the low byte of mm0 */ | |
| 829 "psrlw $8, %%mm0 \n\t" | |
| 830 "paddb %%mm1, %%mm0 \n\t" | |
| 831 "movq %%mm0, %%mm1 \n\t" | |
| 832 "psrlq $16, %%mm0 \n\t" | |
| 833 "paddb %%mm1, %%mm0 \n\t" | |
| 834 "movq %%mm0, %%mm1 \n\t" | |
| 835 "psrlq $32, %%mm0 \n\t" | |
| 836 "paddb %%mm1, %%mm0 \n\t" | |
| 837 "popl %1\n\t" | |
| 838 "movd %%mm0, %0 \n\t" | |
| 839 : "=r" (numEq) | |
| 840 : "r" (src), "r" (stride) | |
| 841 : "%eax" | |
| 842 ); | |
| 843 // printf("%d\n", numEq); | |
| 844 /* every match added 0xFF (-1) to the byte sum, so negate the low byte to get the count */ numEq= (256 - (numEq & 0xFF)) &0xFF; | |
| 845 #else | |
| 846 for(int y=0; y<BLOCK_SIZE; y++) | |
| 847 { | |
| 848 /* (a - b + 1) is 0, 1 or 2 exactly when a - b is -1, 0 or +1 */ if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | |
| 849 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; | |
| 850 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | |
| 851 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | |
| 852 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | |
| 853 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | |
| 854 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | |
| 855 /* copy the row into the scratch block */ tempBlock[0 + y*TEMP_STRIDE] = src[0]; | |
| 856 tempBlock[1 + y*TEMP_STRIDE] = src[1]; | |
| 857 tempBlock[2 + y*TEMP_STRIDE] = src[2]; | |
| 858 tempBlock[3 + y*TEMP_STRIDE] = src[3]; | |
| 859 tempBlock[4 + y*TEMP_STRIDE] = src[4]; | |
| 860 tempBlock[5 + y*TEMP_STRIDE] = src[5]; | |
| 861 tempBlock[6 + y*TEMP_STRIDE] = src[6]; | |
| 862 tempBlock[7 + y*TEMP_STRIDE] = src[7]; | |
| 863 src+= stride; | |
| 864 } | |
| 865 #endif | |
| 866 /* if(abs(numEq - asmEq) > 0) | |
| 867 { | |
| 868 // printf("\nasm:%d c:%d\n", asmEq, numEq); | |
| 869 for(int y=0; y<8; y++) | |
| 870 { | |
| 871 for(int x=0; x<8; x++) | |
| 872 { | |
| 873 printf("%d ", src[x + y*stride]); | |
| 874 } | |
| 875 printf("\n"); | |
| 876 } | |
| 877 } | |
| 878 */ | |
| 879 // printf("%d\n", numEq); | |
| 880 return numEq > hFlatnessThreshold; | |
| 881 } | |
| 882 /* C path: reports whether the two outermost pixels of the first row (src[0] and src[7]) differ by at most 2*QP; the MMX variant is guarded by MMX_FIXME. */ | |
| 883 static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) | |
| 884 { | |
| 885 #ifdef MMX_FIXME | |
| 886 FIXME /* bare token: this branch cannot compile as written unless FIXME is defined as a macro elsewhere */ | |
| 887 int isOk; | |
| 888 asm volatile( | |
| 889 // "int $3 \n\t" | |
| 890 "movq (%1, %2), %%mm0 \n\t" | |
| 891 "movq (%1, %2, 8), %%mm1 \n\t" | |
| 892 "movq %%mm0, %%mm2 \n\t" | |
| 893 "psubusb %%mm1, %%mm0 \n\t" | |
| 894 "psubusb %%mm2, %%mm1 \n\t" | |
| 895 "por %%mm1, %%mm0 \n\t" // ABS Diff | |
| 896 | |
| 897 "movq pQPb, %%mm7 \n\t" // QP,..., QP | |
| 898 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
| 899 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |
| 900 "pcmpeqd b00, %%mm0 \n\t" | |
| 901 "psrlq $16, %%mm0 \n\t" | |
| 902 "pcmpeqd bFF, %%mm0 \n\t" | |
| 903 // "movd %%mm0, (%1, %2, 4)\n\t" | |
| 904 "movd %%mm0, %0 \n\t" | |
| 905 : "=r" (isOk) | |
| 906 : "r" (src), "r" (stride) | |
| 907 ); | |
| 908 return isOk; | |
| 909 #else | |
| 910 if(abs(src[0] - src[7]) > 2*QP) return false; | |
| 911 /* stride is not used on this path: only one row is inspected */ | |
| 912 return true; | |
| 913 #endif | |
| 914 } | |
| 915 /* Default horizontal deblocking filter: copies the 8x8 block from tempBlock to dst and, when the energy across the edge between columns 3 and 4 is below 8*QP, moves those two pixels towards each other (C path; the MMX2 path is an alternative/approximate implementation). */ | |
| 916 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | |
| 917 { | |
| 918 #ifdef HAVE_MMX2 | |
| 919 asm volatile( | |
| 920 "pushl %0 \n\t" | |
| 921 "pxor %%mm7, %%mm7 \n\t" | |
| 922 "movq bm00001000, %%mm6 \n\t" | |
| 923 "movd %2, %%mm5 \n\t" // QP | |
| 924 "movq %%mm5, %%mm4 \n\t" | |
| 925 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
| 926 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
| 927 "psllq $24, %%mm4 \n\t" | |
| 928 "pxor %%mm5, %%mm5 \n\t" // 0 | |
| 929 "psubb %%mm4, %%mm5 \n\t" // -QP | |
| 930 "leal tempBlock, %%eax \n\t" | |
| 931 | |
| 932 //FIXME? "unroll by 2" and mix | |
| 933 #define HDF(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 934 "movq %%mm0, %%mm1 \n\t"\ | |
| 935 "movq %%mm0, %%mm2 \n\t"\ | |
| 936 "psrlq $8, %%mm1 \n\t"\ | |
| 937 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 938 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 939 "por %%mm2, %%mm1 \n\t" /* |px - p(x+1)| */\ | |
| 940 "pcmpeqb %%mm7, %%mm2 \n\t" /* sgn[px - p(x+1)] */\ | |
| 941 "pshufw $0xAA, %%mm1, %%mm3 \n\t"\ | |
| 942 "pminub %%mm1, %%mm3 \n\t"\ | |
| 943 "psrlq $16, %%mm3 \n\t"\ | |
| 944 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
| 945 "paddb %%mm5, %%mm1 \n\t"\ | |
| 946 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 947 "psrlw $2, %%mm1 \n\t"\ | |
| 948 "pxor %%mm2, %%mm1 \n\t"\ | |
| 949 "psubb %%mm2, %%mm1 \n\t"\ | |
| 950 "pand %%mm6, %%mm1 \n\t"\ | |
| 951 "psubb %%mm1, %%mm0 \n\t"\ | |
| 952 "psllq $8, %%mm1 \n\t"\ | |
| 953 "paddb %%mm1, %%mm0 \n\t"\ | |
| 954 "movd %%mm0, (%0) \n\t"\ | |
| 955 "psrlq $32, %%mm0 \n\t"\ | |
| 956 "movd %%mm0, 4(%0) \n\t" | |
| 957 | |
| 958 HDF(0) | |
| 959 "addl %1, %0 \n\t" | |
| 960 HDF(8) | |
| 961 "addl %1, %0 \n\t" | |
| 962 HDF(16) | |
| 963 "addl %1, %0 \n\t" | |
| 964 HDF(24) | |
| 965 "addl %1, %0 \n\t" | |
| 966 HDF(32) | |
| 967 "addl %1, %0 \n\t" | |
| 968 HDF(40) | |
| 969 "addl %1, %0 \n\t" | |
| 970 HDF(48) | |
| 971 "addl %1, %0 \n\t" | |
| 972 HDF(56) | |
| 973 "popl %0 \n\t" | |
| 974 : | |
| 975 : "r" (dst), "r" (stride), "r" (QP) | |
| 976 : "%eax" | |
| 977 ); | |
| 978 #else | |
| 979 uint8_t *src= tempBlock; | |
| 980 | |
| 981 for(int y=0; y<BLOCK_SIZE; y++) | |
| 982 { | |
| 983 dst[0] = src[0]; | |
| 984 dst[1] = src[1]; | |
| 985 dst[2] = src[2]; | |
| 986 dst[3] = src[3]; | |
| 987 dst[4] = src[4]; | |
| 988 dst[5] = src[5]; | |
| 989 dst[6] = src[6]; | |
| 990 dst[7] = src[7]; | |
| 991 /* energy across the edge between src[3] and src[4]: 2*p2 - 5*p3 + 5*p4 - 2*p5, the same tap pattern as leftEnergy and rightEnergy below */ | |
| 992 const int middleEnergy= 5*(src[4] - src[3]) + 2*(src[2] - src[5]); | |
| 993 if(ABS(middleEnergy) < 8*QP) | |
| 994 { | |
| 995 const int q=(src[3] - src[4])/2; | |
| 996 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]); | |
| 997 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]); | |
| 998 /* correction size: how far the edge energy exceeds the weaker of the two neighbouring energies */ | |
| 999 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 1000 d= MAX(d, 0); | |
| 1001 | |
| 1002 d= (5*d + 32) >> 6; | |
| 1003 d*= SIGN(-middleEnergy); | |
| 1004 /* clamp d to the range between 0 and q (half the edge step), keeping the sign of q */ | |
| 1005 if(q>0) | |
| 1006 { | |
| 1007 d= d<0 ? 0 : d; | |
| 1008 d= d>q ? q : d; | |
| 1009 } | |
| 1010 else | |
| 1011 { | |
| 1012 d= d>0 ? 0 : d; | |
| 1013 d= d<q ? q : d; | |
| 1014 } | |
| 1015 | |
| 1016 dst[3]-= d; | |
| 1017 dst[4]+= d; | |
| 1018 } | |
| 1019 dst+= stride; | |
| 1020 src+= TEMP_STRIDE; | |
| 1021 } | |
| 1022 #endif | |
| 1023 } | |
| 1024 | |
| 1025 /** | |
| 1026 * Do a horizontal low pass filter on the 8x8 block: reads the block from tempBlock (the C version also reads dst) and stores the result in dst | |
| 1027 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | |
| 1028 * using approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2 version) | |
| 1029 */ | |
| 1030 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | |
| 1031 { | |
| 1032 //return; | |
| 1033 #ifdef HAVE_MMX2 | |
| 1034 asm volatile( //"movv %0 %1 %2\n\t" | |
| 1035 "pushl %0\n\t" | |
| 1036 "pxor %%mm7, %%mm7 \n\t" | |
| 1037 "leal tempBlock, %%eax \n\t" | |
| 1038 /* eax = tempBlock (source rows); %0 = dst, advanced by %1 = stride after each row */ | |
| 1039 #define HLP1 "movq (%0), %%mm0 \n\t"\ | |
| 1040 "movq %%mm0, %%mm1 \n\t"\ | |
| 1041 "psllq $8, %%mm0 \n\t"\ | |
| 1042 "pavgb %%mm1, %%mm0 \n\t"\ | |
| 1043 "psrlw $8, %%mm0 \n\t"\ | |
| 1044 "pxor %%mm1, %%mm1 \n\t"\ | |
| 1045 "packuswb %%mm1, %%mm0 \n\t"\ | |
| 1046 "movq %%mm0, %%mm1 \n\t"\ | |
| 1047 "movq %%mm0, %%mm2 \n\t"\ | |
| 1048 "psllq $32, %%mm0 \n\t"\ | |
| 1049 "paddb %%mm0, %%mm1 \n\t"\ | |
| 1050 "psllq $16, %%mm2 \n\t"\ | |
| 1051 "pavgb %%mm2, %%mm0 \n\t"\ | |
| 1052 "movq %%mm0, %%mm3 \n\t"\ | |
| 1053 "pand bm11001100, %%mm0 \n\t"\ | |
| 1054 "paddusb %%mm0, %%mm3 \n\t"\ | |
| 1055 "psrlq $8, %%mm3 \n\t"\ | |
| 1056 "pavgb %%mm1, %%mm4 \n\t"\ | |
| 1057 "pavgb %%mm3, %%mm2 \n\t"\ | |
| 1058 "psrlq $16, %%mm2 \n\t"\ | |
| 1059 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
| 1060 "movq %%mm2, (%0) \n\t"\ | |
| 1061 | |
| 1062 #define HLP2 "movq (%0), %%mm0 \n\t"\ | |
| 1063 "movq %%mm0, %%mm1 \n\t"\ | |
| 1064 "psllq $8, %%mm0 \n\t"\ | |
| 1065 "pavgb %%mm1, %%mm0 \n\t"\ | |
| 1066 "psrlw $8, %%mm0 \n\t"\ | |
| 1067 "pxor %%mm1, %%mm1 \n\t"\ | |
| 1068 "packuswb %%mm1, %%mm0 \n\t"\ | |
| 1069 "movq %%mm0, %%mm2 \n\t"\ | |
| 1070 "psllq $32, %%mm0 \n\t"\ | |
| 1071 "psllq $16, %%mm2 \n\t"\ | |
| 1072 "pavgb %%mm2, %%mm0 \n\t"\ | |
| 1073 "movq %%mm0, %%mm3 \n\t"\ | |
| 1074 "pand bm11001100, %%mm0 \n\t"\ | |
| 1075 "paddusb %%mm0, %%mm3 \n\t"\ | |
| 1076 "psrlq $8, %%mm3 \n\t"\ | |
| 1077 "pavgb %%mm3, %%mm2 \n\t"\ | |
| 1078 "psrlq $16, %%mm2 \n\t"\ | |
| 1079 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
| 1080 "movq %%mm2, (%0) \n\t"\ | |
| 1081 | |
| 1082 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | |
| 1083 /* | |
| 1084 31 | |
| 1085 121 | |
| 1086 121 | |
| 1087 121 | |
| 1088 121 | |
| 1089 121 | |
| 1090 121 | |
| 1091 13 | |
| 1092 Implemented Exact 7-Tap | |
| 1093 9421 A321 | |
| 1094 36421 64321 | |
| 1095 334321 = | |
| 1096 1234321 = | |
| 1097 1234321 = | |
| 1098 123433 = | |
| 1099 12463 12346 | |
| 1100 1249 123A | |
| 1101 | |
| 1102 */ | |
| 1103 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 1104 "movq %%mm0, %%mm1 \n\t"\ | |
| 1105 "movq %%mm0, %%mm2 \n\t"\ | |
| 1106 "movq %%mm0, %%mm3 \n\t"\ | |
| 1107 "movq %%mm0, %%mm4 \n\t"\ | |
| 1108 "psllq $8, %%mm1 \n\t"\ | |
| 1109 "psrlq $8, %%mm2 \n\t"\ | |
| 1110 "pand bm00000001, %%mm3 \n\t"\ | |
| 1111 "pand bm10000000, %%mm4 \n\t"\ | |
| 1112 "por %%mm3, %%mm1 \n\t"\ | |
| 1113 "por %%mm4, %%mm2 \n\t"\ | |
| 1114 "pavgb %%mm2, %%mm1 \n\t"\ | |
| 1115 "pavgb %%mm1, %%mm0 \n\t"\ | |
| 1116 \ | |
| 1117 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ | |
| 1118 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ | |
| 1119 "pavgb %%mm3, %%mm4 \n\t"\ | |
| 1120 "pavgb %%mm4, %%mm0 \n\t"\ | |
| 1121 "movd %%mm0, (%0) \n\t"\ | |
| 1122 "psrlq $32, %%mm0 \n\t"\ | |
| 1123 "movd %%mm0, 4(%0) \n\t"\ | |
| 1124 | |
| 1125 #define HLP(i) HLP3(i) | |
| 1126 /* only HLP3 is expanded below; HLP1 and HLP2 are defined but not used in the visible code */ | |
| 1127 HLP(0) | |
| 1128 "addl %1, %0 \n\t" | |
| 1129 HLP(8) | |
| 1130 "addl %1, %0 \n\t" | |
| 1131 HLP(16) | |
| 1132 "addl %1, %0 \n\t" | |
| 1133 HLP(24) | |
| 1134 "addl %1, %0 \n\t" | |
| 1135 HLP(32) | |
| 1136 "addl %1, %0 \n\t" | |
| 1137 HLP(40) | |
| 1138 "addl %1, %0 \n\t" | |
| 1139 HLP(48) | |
| 1140 "addl %1, %0 \n\t" | |
| 1141 HLP(56) | |
| 1142 | |
| 1143 "popl %0\n\t" | |
| 1144 : | |
| 1145 : "r" (dst), "r" (stride) | |
| 1146 : "%eax", "%ebx" /* NOTE(review): ebx is listed as clobbered but is not referenced by the asm text above */ | |
| 1147 ); | |
| 1148 | |
| 1149 #else | |
| 1150 uint8_t *temp= tempBlock; | |
| 1151 for(int y=0; y<BLOCK_SIZE; y++) | |
| 1152 { | |
| 1153 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; | |
| 1154 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | |
| 1155 /* sums[k] is the sum of two horizontally adjacent pixels; first and last stand in for the pixels just outside the block */ | |
| 1156 int sums[9]; | |
| 1157 sums[0] = first + temp[0]; | |
| 1158 sums[1] = temp[0] + temp[1]; | |
| 1159 sums[2] = temp[1] + temp[2]; | |
| 1160 sums[3] = temp[2] + temp[3]; | |
| 1161 sums[4] = temp[3] + temp[4]; | |
| 1162 sums[5] = temp[4] + temp[5]; | |
| 1163 sums[6] = temp[5] + temp[6]; | |
| 1164 sums[7] = temp[6] + temp[7]; | |
| 1165 sums[8] = temp[7] + last; | |
| 1166 /* note: '+' binds tighter than '<<', so an unparenthesised (a + b<<1) below evaluates as ((a + b)<<1) */ | |
| 1167 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | |
| 1168 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | |
| 1169 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4; | |
| 1170 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4; | |
| 1171 dst[4]= ((dst[4]<<2) + (sums[3] + sums[6]<<1) + sums[1] + sums[8] + 8)>>4; | |
| 1172 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4; | |
| 1173 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4; | |
| 1174 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | |
| 1175 | |
| 1176 dst+= stride; | |
| 1177 temp+= TEMP_STRIDE; | |
| 1178 } | |
| 1179 #endif | |
| 1180 } | |
| 1181 | |
| 1182 /* Deringing filter stub: the C path is empty, and the asm path (guarded by HAVE_MMX2X) only computes the minimum, the maximum and their average in MMX registers without storing anything. */ | |
| 1183 static inline void dering(uint8_t src[], int stride, int QP) | |
| 1184 { | |
| 1185 //FIXME | |
| 1186 /* NOTE(review): the guard is HAVE_MMX2X, not HAVE_MMX2 as in the neighbouring functions; presumably this keeps the unfinished asm disabled, confirm */ | |
| 1187 #ifdef HAVE_MMX2X | |
| 1188 asm volatile( | |
| 1189 "leal (%0, %1), %%eax \n\t" | |
| 1190 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 1191 // 0 1 2 3 4 5 6 7 8 9 | |
| 1192 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 1193 | |
| 1194 "pcmpeq %%mm6, %%mm6 \n\t" | |
| 1195 "pxor %%mm7, %%mm7 \n\t" | |
| 1196 /* mm6 = running minimum (presumably all ones after pcmpeq), mm7 = running maximum (zeroed); NOTE(review): 'pcmpeq' has no b/w/d size suffix, verify it assembles */ | |
| 1197 #define FIND_MIN_MAX(addr)\ | |
| 1198 "movq (" #addr "), %%mm0, \n\t"\ | |
| 1199 "pminub %%mm0, %%mm6 \n\t"\ | |
| 1200 "pmaxub %%mm0, %%mm7 \n\t" | |
| 1201 /* NOTE(review): the macro takes one parameter but is invoked below with comma-separated operands, and its movq line has a trailing comma after %%mm0; verify before enabling */ | |
| 1202 FIND_MIN_MAX(%0) | |
| 1203 FIND_MIN_MAX(%%eax) | |
| 1204 FIND_MIN_MAX(%%eax, %1) | |
| 1205 FIND_MIN_MAX(%%eax, %1, 2) | |
| 1206 FIND_MIN_MAX(%0, %1, 4) | |
| 1207 FIND_MIN_MAX(%%ebx) | |
| 1208 FIND_MIN_MAX(%%ebx, %1) | |
| 1209 FIND_MIN_MAX(%%ebx, %1, 2) | |
| 1210 FIND_MIN_MAX(%0, %1, 8) | |
| 1211 FIND_MIN_MAX(%%ebx, %1, 2) | |
| 1212 /* NOTE(review): the last invocation repeats (%%ebx, %1, 2), while the row table above maps line 9 to ebx+4%1; looks like a typo, confirm */ | |
| 1213 "movq %%mm6, %%mm4 \n\t" | |
| 1214 "psrlq $32, %%mm6 \n\t" | |
| 1215 "pminub %%mm4, %%mm6 \n\t" | |
| 1216 "movq %%mm6, %%mm4 \n\t" | |
| 1217 "psrlq $16, %%mm6 \n\t" | |
| 1218 "pminub %%mm4, %%mm6 \n\t" | |
| 1219 "movq %%mm6, %%mm4 \n\t" | |
| 1220 "psrlq $8, %%mm6 \n\t" | |
| 1221 "pminub %%mm4, %%mm6 \n\t" // min of pixels | |
| 1222 | |
| 1223 "movq %%mm7, %%mm4 \n\t" | |
| 1224 "psrlq $32, %%mm7 \n\t" | |
| 1225 "pmaxub %%mm4, %%mm7 \n\t" | |
| 1226 "movq %%mm7, %%mm4 \n\t" | |
| 1227 "psrlq $16, %%mm7 \n\t" | |
| 1228 "pmaxub %%mm4, %%mm7 \n\t" | |
| 1229 "movq %%mm7, %%mm4 \n\t" | |
| 1230 "psrlq $8, %%mm7 \n\t" | |
| 1231 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels | |
| 1232 "pavgb %%mm6, %%mm7 \n\t" // (max + min)/2 | |
| 1233 | |
| 1234 | |
| 1235 : : "r" (src), "r" (stride), "r" (QP) | |
| 1236 : "%eax", "%ebx" | |
| 1237 ); | |
| 1238 #else | |
| 1239 | |
| 1240 //FIXME | |
| 1241 #endif | |
| 1242 } | |
| 1243 | |
| 1244 /** | |
| 1245 * C-linkage entry point: filters the luma plane (src[0] to dst[0]) and then both chroma planes (src[1], src[2]) at half width, height and stride; mode is currently not used | |
| 1246 */ | |
| 1247 extern "C"{ | |
| 1248 void postprocess(unsigned char * src[], int src_stride, | |
| 1249 unsigned char * dst[], int dst_stride, | |
| 1250 int horizontal_size, int vertical_size, | |
| 1251 QP_STORE_T *QP_store, int QP_stride, | |
| 1252 int mode) | |
| 1253 { | |
| 1254 /* | |
| 1255 long long T= rdtsc(); | |
| 1256 for(int y=vertical_size-1; y>=0 ; y--) | |
| 1257 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); | |
| 1258 // memcpy(dst[0], src[0],src_stride*vertical_size); | |
| 1259 printf("%4dk\r", (rdtsc()-T)/1000); | |
| 1260 | |
| 1261 return; | |
| 1262 */ | |
| 1263 /* | |
| 1264 long long T= rdtsc(); | |
| 1265 while( (rdtsc() - T)/1000 < 4000); | |
| 1266 | |
| 1267 return; | |
| 1268 */ | |
| 1269 postProcess(src[0], src_stride, | |
| 1270 dst[0], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, false); | |
| 1271 /* the chroma planes are processed at half the luma width, height and stride */ | |
| 1272 horizontal_size >>= 1; | |
| 1273 vertical_size >>= 1; | |
| 1274 src_stride >>= 1; | |
| 1275 dst_stride >>= 1; | |
| 1276 /* the else branch (plain chroma copy) is unreachable while the condition is the constant 1 */ | |
| 1277 if(1) | |
| 1278 { | |
| 1279 postProcess(src[1], src_stride, | |
| 1280 dst[1], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | |
| 1281 postProcess(src[2], src_stride, | |
| 1282 dst[2], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | |
| 1283 } | |
| 1284 else | |
| 1285 { | |
| 1286 memcpy(dst[1], src[1], src_stride*vertical_size); /* bytes in a plane = stride * rows; assumes dst_stride equals src_stride */ | |
| 1287 memcpy(dst[2], src[2], src_stride*vertical_size); | |
| 1288 } | |
| 1289 } | |
| 1290 } | |
| 1291 | |
| 1292 /** | |
| 1293 * Copies a block from src to dst; the MMX path also subtracts packedYOffset and scales by packedYScale (black level fix), the C path is a plain copy | |
| 1294 */ | |
| 1295 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) | |
| 1296 { | |
| 1297 #ifdef HAVE_MMX | |
| 1298 asm volatile( | |
| 1299 "pushl %0 \n\t" | |
| 1300 "pushl %1 \n\t" | |
| 1301 "leal (%2,%2), %%eax \n\t" | |
| 1302 "leal (%3,%3), %%ebx \n\t" | |
| 1303 "movq packedYOffset, %%mm2 \n\t" | |
| 1304 "movq packedYScale, %%mm3 \n\t" | |
| 1305 /* mm2 = packedYOffset, mm3 = packedYScale; eax = 2*srcStride and ebx = 2*dstStride (two rows per step) */ | |
| 1306 #define SIMPLE_CPY \ | |
| 1307 "movq (%0), %%mm0 \n\t"\ | |
| 1308 "movq (%0,%2), %%mm1 \n\t"\ | |
| 1309 "psubusb %%mm2, %%mm0 \n\t"\ | |
| 1310 "psubusb %%mm2, %%mm1 \n\t"\ | |
| 1311 "movq %%mm0, (%1) \n\t"\ | |
| 1312 "movq %%mm1, (%1, %3) \n\t"\ | |
| 1313 | |
| 1314 #define SCALED_CPY \ | |
| 1315 "movq (%0), %%mm0 \n\t"\ | |
| 1316 "movq (%0,%2), %%mm1 \n\t"\ | |
| 1317 "psubusb %%mm2, %%mm0 \n\t"\ | |
| 1318 "psubusb %%mm2, %%mm1 \n\t"\ | |
| 1319 "pxor %%mm4, %%mm4 \n\t"\ | |
| 1320 "pxor %%mm5, %%mm5 \n\t"\ | |
| 1321 "punpcklbw %%mm0, %%mm4 \n\t"\ | |
| 1322 "punpckhbw %%mm0, %%mm5 \n\t"\ | |
| 1323 "pmulhuw %%mm3, %%mm4 \n\t"\ | |
| 1324 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 1325 "packuswb %%mm5, %%mm4 \n\t"\ | |
| 1326 "movq %%mm4, (%1) \n\t"\ | |
| 1327 "pxor %%mm4, %%mm4 \n\t"\ | |
| 1328 "pxor %%mm5, %%mm5 \n\t"\ | |
| 1329 "punpcklbw %%mm1, %%mm4 \n\t"\ | |
| 1330 "punpckhbw %%mm1, %%mm5 \n\t"\ | |
| 1331 "pmulhuw %%mm3, %%mm4 \n\t"\ | |
| 1332 "pmulhuw %%mm3, %%mm5 \n\t"\ | |
| 1333 "packuswb %%mm5, %%mm4 \n\t"\ | |
| 1334 "movq %%mm4, (%1, %3) \n\t"\ | |
| 1335 | |
| 1336 /* each CPY handles two rows, so the four expansions below cover 8 rows; SCALED_CPY puts each pixel in the high byte of a word so that pmulhuw yields pixel*scale/256 */ | |
| 1337 #define CPY SCALED_CPY | |
| 1338 //#define CPY SIMPLE_CPY | |
| 1339 // "prefetchnta 8(%0)\n\t" | |
| 1340 CPY | |
| 1341 "addl %%eax, %0 \n\t" | |
| 1342 "addl %%ebx, %1 \n\t" | |
| 1343 CPY | |
| 1344 "addl %%eax, %0 \n\t" | |
| 1345 "addl %%ebx, %1 \n\t" | |
| 1346 CPY | |
| 1347 "addl %%eax, %0 \n\t" | |
| 1348 "addl %%ebx, %1 \n\t" | |
| 1349 CPY | |
| 1350 "popl %1 \n\t" | |
| 1351 "popl %0 \n\t" | |
| 1352 : : "r" (src), /* %0 */ | |
| 1353 "r" (dst), /* %1 */ | |
| 1354 "r" (srcStride), /* %2 */ | |
| 1355 "r" (dstStride) /* %3 */ | |
| 1356 : "%eax", "%ebx" | |
| 1357 ); | |
| 1358 #else | |
| 1359 for(int i=0; i<BLOCK_SIZE; i++) // plain copy of BLOCK_SIZE rows of BLOCK_SIZE bytes, no offset or scale applied | |
| 1360 memcpy( &(dst[dstStride*i]), | |
| 1361 &(src[srcStride*i]), BLOCK_SIZE); | |
| 1362 #endif | |
| 1363 } | |
| 1364 | |
| 1365 | |
| 1366 /** | |
| 1367 * Filters one plane of bytes (Y or U or V values): copies src to dst block by block and applies the vertical and horizontal deblocking filters; for luma (isColor false) it also derives a black level offset and scale from a running histogram | |
| 1368 */ | |
| 1369 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |
| 1370 QP_STORE_T QPs[], int QPStride, bool isColor) | |
| 1371 { | |
| 1372 | |
| 1373 #ifdef TIMEING | |
| 1374 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | |
| 1375 sumTime= rdtsc(); | |
| 1376 #endif | |
| 1377 /* running luma histogram: allocated and seeded on the first call, kept in a static for later calls, never freed in the visible code */ | |
| 1378 /* we need 64 bits here, otherwise we will have a problem | |
| 1379 after watching a black picture for 5 hours*/ | |
| 1380 static uint64_t *yHistogram= NULL; | |
| 1381 if(!yHistogram) | |
| 1382 { | |
| 1383 yHistogram= new uint64_t[256]; | |
| 1384 for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256; | |
| 1385 } | |
| 1386 | |
| 1387 int black=0, white=255; // blackest black and whitest white in the picture | |
| 1388 if(!isColor) | |
| 1389 { | |
| 1390 uint64_t sum= 0; | |
| 1391 for(int i=0; i<256; i++) | |
| 1392 sum+= yHistogram[i]; | |
| 1393 | |
| 1394 uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold); | |
| 1395 /* black: scan down from 255 until fewer than maxClipped histogram entries lie at or below the level */ | |
| 1396 uint64_t clipped= sum; | |
| 1397 for(black=255; black>0; black--) | |
| 1398 { | |
| 1399 if(clipped < maxClipped) break; | |
| 1400 clipped-= yHistogram[black]; | |
| 1401 } | |
| 1402 /* white: scan up from 0 until fewer than maxClipped histogram entries lie at or above the level */ | |
| 1403 clipped= sum; | |
| 1404 for(white=0; white<256; white++) | |
| 1405 { | |
| 1406 if(clipped < maxClipped) break; | |
| 1407 clipped-= yHistogram[white]; | |
| 1408 } | |
| 1409 | |
| 1410 // we can't handle negative corrections | |
| 1411 packedYOffset= MAX(black - minAllowedY, 0); | |
| 1412 packedYOffset|= packedYOffset<<32; | |
| 1413 packedYOffset|= packedYOffset<<16; | |
| 1414 packedYOffset|= packedYOffset<<8; | |
| 1415 /* NOTE(review): if white == black the division below divides by zero; confirm this cannot occur */ | |
| 1416 // uint64_t scale= (int)(256.0*256.0/(white-black) + 0.5); | |
| 1417 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); | |
| 1418 | |
| 1419 packedYScale= uint16_t(scale*256.0 + 0.5); | |
| 1420 packedYScale|= packedYScale<<32; | |
| 1421 packedYScale|= packedYScale<<16; | |
| 1422 } | |
| 1423 else | |
| 1424 { | |
| 1425 packedYScale= 0x0100010001000100LL; | |
| 1426 packedYOffset= 0; | |
| 1427 } | |
| 1428 /* copy the top row of blocks first; the loop below copies rows further down before filtering them */ | |
| 1429 for(int x=0; x<width; x+=BLOCK_SIZE) | |
| 1430 blockCopy(dst + x, dstStride, src + x, srcStride); | |
| 1431 | |
| 1432 for(int y=0; y<height; y+=BLOCK_SIZE) | |
| 1433 { | |
| 1434 //1% speedup if these are here instead of the inner loop | |
| 1435 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 1436 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 1437 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start | |
| 1438 uint8_t *vertBlock= &(dstBlock[dstStride*3]); | |
| 1439 | |
| 1440 // finish 1 block before the next, otherwise we might have a problem | |
| 1441 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 1442 for(int x=0; x<width; x+=BLOCK_SIZE) | |
| 1443 { | |
| 1444 int QP= isColor ? | |
| 1445 QPs[(y>>3)*QPStride + (x>>3)]: | |
| 1446 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; /* luma QP is rescaled by the low 16 bits of packedYScale (scale*256), hence the >>8 */ | |
| 1447 #ifdef HAVE_MMX | |
| 1448 asm volatile( | |
| 1449 "movd %0, %%mm7 \n\t" | |
| 1450 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | |
| 1451 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 1452 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 1453 "movq %%mm7, pQPb \n\t" | |
| 1454 : : "r" (QP) | |
| 1455 ); | |
| 1456 #endif | |
| 1457 | |
| 1458 | |
| 1459 const int stride= dstStride; | |
| 1460 if(y + 12 < height) | |
| 1461 { | |
| 1462 #ifdef MORE_TIMEING | |
| 1463 T0= rdtsc(); | |
| 1464 #endif | |
| 1465 #ifdef HAVE_MMX2 | |
| 1466 | |
| 1467 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | |
| 1468 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | |
| 1469 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | |
| 1470 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | |
| 1471 #endif | |
| 1472 if(!isColor) yHistogram[ srcBlock[0] ]++; /* one histogram sample per block: its top-left source pixel */ | |
| 1473 | |
| 1474 blockCopy(vertBlock + dstStride*2, dstStride, | |
| 1475 vertSrcBlock + srcStride*2, srcStride); | |
| 1476 | |
| 1477 | |
| 1478 #ifdef MORE_TIMEING | |
| 1479 T1= rdtsc(); | |
| 1480 memcpyTime+= T1-T0; | |
| 1481 T0=T1; | |
| 1482 #endif | |
| 1483 | |
| 1484 if( isVertDC(vertBlock, stride)) | |
| 1485 { | |
| 1486 if(isVertMinMaxOk(vertBlock, stride, QP)) | |
| 1487 doVertLowPass(vertBlock, stride, QP); | |
| 1488 } | |
| 1489 else if(x<width) /* x<width always holds inside this loop */ | |
| 1490 doVertDefFilter(vertBlock, stride, QP); | |
| 1491 | |
| 1492 #ifdef MORE_TIMEING | |
| 1493 T1= rdtsc(); | |
| 1494 vertTime+= T1-T0; | |
| 1495 T0=T1; | |
| 1496 #endif | |
| 1497 } | |
| 1498 else | |
| 1499 { | |
| 1500 for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied already so +2 | |
| 1501 memcpy( &(vertBlock[dstStride*i]), | |
| 1502 &(vertSrcBlock[srcStride*i]), BLOCK_SIZE); | |
| 1503 | |
| 1504 } | |
| 1505 | |
| 1506 if(x - 8 >= 0 && x<width) | |
| 1507 { | |
| 1508 #ifdef MORE_TIMEING | |
| 1509 T0= rdtsc(); | |
| 1510 #endif | |
| 1511 | |
| 1512 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | |
| 1513 { | |
| 1514 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |
| 1515 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |
| 1516 } | |
| 1517 else | |
| 1518 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
| 1519 | |
| 1520 #ifdef MORE_TIMEING | |
| 1521 T1= rdtsc(); | |
| 1522 horizTime+= T1-T0; | |
| 1523 T0=T1; | |
| 1524 #endif | |
| 1525 dering(dstBlock - 9 - stride, stride, QP); | |
| 1526 } | |
| 1527 else if(y!=0) | |
| 1528 dering(dstBlock - stride*9 + width-9, stride, QP); | |
| 1529 //FIXME dering filter will not be applied to last block (bottom right) | |
| 1530 | |
| 1531 | |
| 1532 dstBlock+=8; | |
| 1533 srcBlock+=8; | |
| 1534 vertBlock+=8; | |
| 1535 vertSrcBlock+=8; | |
| 1536 } | |
| 1537 } | |
| 1538 #ifdef HAVE_MMX | |
| 1539 asm volatile("emms"); | |
| 1540 #endif | |
| 1541 | |
| 1542 #ifdef TIMEING | |
| 1543 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | |
| 1544 sumTime= rdtsc() - sumTime; | |
| 1545 if(!isColor) | |
| 1546 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", | |
| 1547 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), | |
| 1548 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) | |
| 1549 , black, white); | |
| 1550 #endif | |
| 1551 } | |
