Mercurial > libavcodec.hg
comparison libpostproc/postprocess.c @ 169:20bcd5b70886 libavcodec
runtime cpu detection
| author | michael |
|---|---|
| date | Sat, 24 Nov 2001 22:16:29 +0000 |
| parents | 712c7a115164 |
| children | fa9734559c98 |
comparison
equal
deleted
inserted
replaced
| 168:712c7a115164 | 169:20bcd5b70886 |
|---|---|
| 60 border remover | 60 border remover |
| 61 optimize c versions | 61 optimize c versions |
| 62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | 62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
| 63 smart blur | 63 smart blur |
| 64 commandline option for the deblock / dering thresholds | 64 commandline option for the deblock / dering thresholds |
| 65 put fastmemcpy back | |
| 66 dont use #ifdef ARCH_X86 for the asm stuff ... cross compilers? (note cpudetect uses ARCH_X86) | |
| 65 ... | 67 ... |
| 66 */ | 68 */ |
| 67 | 69 |
| 68 //Changelog: use the CVS log | 70 //Changelog: use the CVS log |
| 69 | 71 |
| 76 #include <malloc.h> | 78 #include <malloc.h> |
| 77 #endif | 79 #endif |
| 78 //#undef HAVE_MMX2 | 80 //#undef HAVE_MMX2 |
| 79 //#define HAVE_3DNOW | 81 //#define HAVE_3DNOW |
| 80 //#undef HAVE_MMX | 82 //#undef HAVE_MMX |
| 83 //#undef ARCH_X86 | |
| 81 //#define DEBUG_BRIGHTNESS | 84 //#define DEBUG_BRIGHTNESS |
| 82 #include "../libvo/fastmemcpy.h" | 85 //#include "../libvo/fastmemcpy.h" |
| 83 #include "postprocess.h" | 86 #include "postprocess.h" |
| 87 #include "../cpudetect.h" | |
| 84 | 88 |
| 85 #define MIN(a,b) ((a) > (b) ? (b) : (a)) | 89 #define MIN(a,b) ((a) > (b) ? (b) : (a)) |
| 86 #define MAX(a,b) ((a) < (b) ? (b) : (a)) | 90 #define MAX(a,b) ((a) < (b) ? (b) : (a)) |
| 87 #define ABS(a) ((a) > 0 ? (a) : (-(a))) | 91 #define ABS(a) ((a) > 0 ? (a) : (-(a))) |
| 88 #define SIGN(a) ((a) > 0 ? 1 : -1) | 92 #define SIGN(a) ((a) > 0 ? 1 : -1) |
| 89 | 93 |
| 90 #ifdef HAVE_MMX2 | |
| 91 #define PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | |
| 92 #elif defined (HAVE_3DNOW) | |
| 93 #define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | |
| 94 #endif | |
| 95 | |
| 96 #ifdef HAVE_MMX2 | |
| 97 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | |
| 98 #elif defined (HAVE_MMX) | |
| 99 #define PMINUB(b,a,t) \ | |
| 100 "movq " #a ", " #t " \n\t"\ | |
| 101 "psubusb " #b ", " #t " \n\t"\ | |
| 102 "psubb " #t ", " #a " \n\t" | |
| 103 #endif | |
| 104 | |
| 105 #ifdef HAVE_MMX2 | |
| 106 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | |
| 107 #elif defined (HAVE_MMX) | |
| 108 #define PMAXUB(a,b) \ | |
| 109 "psubusb " #a ", " #b " \n\t"\ | |
| 110 "paddb " #a ", " #b " \n\t" | |
| 111 #endif | |
| 112 | |
| 113 | |
| 114 #define GET_MODE_BUFFER_SIZE 500 | 94 #define GET_MODE_BUFFER_SIZE 500 |
| 115 #define OPTIONS_ARRAY_SIZE 10 | 95 #define OPTIONS_ARRAY_SIZE 10 |
| 116 | 96 |
| 117 #ifdef HAVE_MMX | 97 #ifdef ARCH_X86 |
| 98 #define CAN_COMPILE_X86_ASM | |
| 99 #endif | |
| 100 | |
| 101 #ifdef CAN_COMPILE_X86_ASM | |
| 118 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL; | 102 static volatile uint64_t __attribute__((aligned(8))) packedYOffset= 0x0000000000000000LL; |
| 119 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL; | 103 static volatile uint64_t __attribute__((aligned(8))) packedYScale= 0x0100010001000100LL; |
| 120 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; | 104 static uint64_t __attribute__((aligned(8))) w05= 0x0005000500050005LL; |
| 121 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL; | 105 static uint64_t __attribute__((aligned(8))) w20= 0x0020002000200020LL; |
| 122 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL; | 106 static uint64_t __attribute__((aligned(8))) w1400= 0x1400140014001400LL; |
| 155 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code | 139 static uint8_t __attribute__((aligned(8))) tempBlocks[8*16*2]; //used for the horizontal code |
| 156 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4]; | 140 static uint32_t __attribute__((aligned(4))) maxTmpNoise[4]; |
| 157 #else | 141 #else |
| 158 static uint64_t packedYOffset= 0x0000000000000000LL; | 142 static uint64_t packedYOffset= 0x0000000000000000LL; |
| 159 static uint64_t packedYScale= 0x0100010001000100LL; | 143 static uint64_t packedYScale= 0x0100010001000100LL; |
| 160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | |
| 161 #endif | 144 #endif |
| 162 | 145 |
| 163 int hFlatnessThreshold= 56 - 16; | 146 int hFlatnessThreshold= 56 - 16; |
| 164 int vFlatnessThreshold= 56 - 16; | 147 int vFlatnessThreshold= 56 - 16; |
| 165 int deringThreshold= 20; | 148 int deringThreshold= 20; |
| 194 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | 177 "fast", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
| 195 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", | 178 "fa", "x1hdeblock:a,x1vdeblock:a,dering:a,autolevels,tmpnoise:a:150:200:400", |
| 196 NULL //End Marker | 179 NULL //End Marker |
| 197 }; | 180 }; |
| 198 | 181 |
| 199 #ifdef HAVE_MMX | 182 #ifdef CAN_COMPILE_X86_ASM |
| 200 static inline void unusedVariableWarningFixer() | 183 static inline void unusedVariableWarningFixer() |
| 201 { | 184 { |
| 202 if( | 185 if( |
| 203 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000 | 186 packedYOffset + packedYScale + w05 + w20 + w1400 + bm00000001 + bm00010000 |
| 204 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110 | 187 + bm00001000 + bm10000000 + bm10000001 + bm11000011 + bm00000011 + bm11111110 |
| 218 // printf("%d\n", int(l/1000)); | 201 // printf("%d\n", int(l/1000)); |
| 219 return l; | 202 return l; |
| 220 } | 203 } |
| 221 #endif | 204 #endif |
| 222 | 205 |
| 223 #ifdef HAVE_MMX2 | 206 #ifdef CAN_COMPILE_X86_ASM |
| 224 static inline void prefetchnta(void *p) | 207 static inline void prefetchnta(void *p) |
| 225 { | 208 { |
| 226 asm volatile( "prefetchnta (%0)\n\t" | 209 asm volatile( "prefetchnta (%0)\n\t" |
| 227 : : "r" (p) | 210 : : "r" (p) |
| 228 ); | 211 ); |
| 248 : : "r" (p) | 231 : : "r" (p) |
| 249 ); | 232 ); |
| 250 } | 233 } |
| 251 #endif | 234 #endif |
| 252 | 235 |
| 253 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 236 // The horizontal Functions exist only in C cuz the MMX code is faster with vertical filters and transposing |
| 237 | |
| 254 /** | 238 /** |
| 255 * Check if the middle 8x8 Block in the given 8x16 block is flat | 239 * Check if the given 8x8 Block is mostly "flat" |
| 256 */ | 240 */ |
| 257 static inline int isVertDC(uint8_t src[], int stride){ | 241 static inline int isHorizDC(uint8_t src[], int stride) |
| 242 { | |
| 258 int numEq= 0; | 243 int numEq= 0; |
| 259 #ifndef HAVE_MMX | |
| 260 int y; | 244 int y; |
| 261 #endif | 245 for(y=0; y<BLOCK_SIZE; y++) |
| 262 src+= stride*4; // src points to begin of the 8x8 Block | 246 { |
| 263 #ifdef HAVE_MMX | 247 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; |
| 264 asm volatile( | 248 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; |
| 265 "leal (%1, %2), %%eax \n\t" | 249 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; |
| 266 "leal (%%eax, %2, 4), %%ebx \n\t" | 250 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; |
| 267 // 0 1 2 3 4 5 6 7 8 9 | 251 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; |
| 268 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | 252 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; |
| 269 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | 253 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; |
| 270 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | |
| 271 "movq (%1), %%mm0 \n\t" | |
| 272 "movq (%%eax), %%mm1 \n\t" | |
| 273 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece | |
| 274 "paddb %%mm7, %%mm0 \n\t" | |
| 275 "pcmpgtb %%mm6, %%mm0 \n\t" | |
| 276 | |
| 277 "movq (%%eax,%2), %%mm2 \n\t" | |
| 278 "psubb %%mm2, %%mm1 \n\t" | |
| 279 "paddb %%mm7, %%mm1 \n\t" | |
| 280 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 281 "paddb %%mm1, %%mm0 \n\t" | |
| 282 | |
| 283 "movq (%%eax, %2, 2), %%mm1 \n\t" | |
| 284 "psubb %%mm1, %%mm2 \n\t" | |
| 285 "paddb %%mm7, %%mm2 \n\t" | |
| 286 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 287 "paddb %%mm2, %%mm0 \n\t" | |
| 288 | |
| 289 "movq (%1, %2, 4), %%mm2 \n\t" | |
| 290 "psubb %%mm2, %%mm1 \n\t" | |
| 291 "paddb %%mm7, %%mm1 \n\t" | |
| 292 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 293 "paddb %%mm1, %%mm0 \n\t" | |
| 294 | |
| 295 "movq (%%ebx), %%mm1 \n\t" | |
| 296 "psubb %%mm1, %%mm2 \n\t" | |
| 297 "paddb %%mm7, %%mm2 \n\t" | |
| 298 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 299 "paddb %%mm2, %%mm0 \n\t" | |
| 300 | |
| 301 "movq (%%ebx, %2), %%mm2 \n\t" | |
| 302 "psubb %%mm2, %%mm1 \n\t" | |
| 303 "paddb %%mm7, %%mm1 \n\t" | |
| 304 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 305 "paddb %%mm1, %%mm0 \n\t" | |
| 306 | |
| 307 "movq (%%ebx, %2, 2), %%mm1 \n\t" | |
| 308 "psubb %%mm1, %%mm2 \n\t" | |
| 309 "paddb %%mm7, %%mm2 \n\t" | |
| 310 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 311 "paddb %%mm2, %%mm0 \n\t" | |
| 312 | |
| 313 " \n\t" | |
| 314 #ifdef HAVE_MMX2 | |
| 315 "pxor %%mm7, %%mm7 \n\t" | |
| 316 "psadbw %%mm7, %%mm0 \n\t" | |
| 317 #else | |
| 318 "movq %%mm0, %%mm1 \n\t" | |
| 319 "psrlw $8, %%mm0 \n\t" | |
| 320 "paddb %%mm1, %%mm0 \n\t" | |
| 321 "movq %%mm0, %%mm1 \n\t" | |
| 322 "psrlq $16, %%mm0 \n\t" | |
| 323 "paddb %%mm1, %%mm0 \n\t" | |
| 324 "movq %%mm0, %%mm1 \n\t" | |
| 325 "psrlq $32, %%mm0 \n\t" | |
| 326 "paddb %%mm1, %%mm0 \n\t" | |
| 327 #endif | |
| 328 "movd %%mm0, %0 \n\t" | |
| 329 : "=r" (numEq) | |
| 330 : "r" (src), "r" (stride) | |
| 331 : "%ebx" | |
| 332 ); | |
| 333 numEq= (-numEq) &0xFF; | |
| 334 | |
| 335 #else | |
| 336 for(y=0; y<BLOCK_SIZE-1; y++) | |
| 337 { | |
| 338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 339 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 340 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 341 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 342 if(((src[4] - src[4+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 343 if(((src[5] - src[5+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 344 if(((src[6] - src[6+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 345 if(((src[7] - src[7+stride] + 1)&0xFFFF) < 3) numEq++; | |
| 346 src+= stride; | 254 src+= stride; |
| 347 } | 255 } |
| 348 #endif | 256 return numEq > hFlatnessThreshold; |
| 349 /* if(abs(numEq - asmEq) > 0) | 257 } |
| 350 { | 258 |
| 351 printf("\nasm:%d c:%d\n", asmEq, numEq); | 259 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) |
| 352 for(int y=0; y<8; y++) | 260 { |
| 261 if(abs(src[0] - src[7]) > 2*QP) return 0; | |
| 262 | |
| 263 return 1; | |
| 264 } | |
| 265 | |
| 266 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |
| 267 { | |
| 268 int y; | |
| 269 for(y=0; y<BLOCK_SIZE; y++) | |
| 270 { | |
| 271 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |
| 272 | |
| 273 if(ABS(middleEnergy) < 8*QP) | |
| 353 { | 274 { |
| 354 for(int x=0; x<8; x++) | 275 const int q=(dst[3] - dst[4])/2; |
| 355 { | 276 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); |
| 356 printf("%d ", temp[x + y*stride]); | 277 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); |
| 357 } | |
| 358 printf("\n"); | |
| 359 } | |
| 360 } | |
| 361 */ | |
| 362 // for(int i=0; i<numEq/8; i++) src[i]=255; | |
| 363 return (numEq > vFlatnessThreshold) ? 1 : 0; | |
| 364 } | |
| 365 | |
| 366 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) | |
| 367 { | |
| 368 #ifdef HAVE_MMX | |
| 369 int isOk; | |
| 370 src+= stride*3; | |
| 371 asm volatile( | |
| 372 // "int $3 \n\t" | |
| 373 "movq (%1, %2), %%mm0 \n\t" | |
| 374 "movq (%1, %2, 8), %%mm1 \n\t" | |
| 375 "movq %%mm0, %%mm2 \n\t" | |
| 376 "psubusb %%mm1, %%mm0 \n\t" | |
| 377 "psubusb %%mm2, %%mm1 \n\t" | |
| 378 "por %%mm1, %%mm0 \n\t" // ABS Diff | |
| 379 | |
| 380 "movq pQPb, %%mm7 \n\t" // QP,..., QP | |
| 381 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
| 382 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |
| 383 "pcmpeqd b00, %%mm0 \n\t" | |
| 384 "psrlq $16, %%mm0 \n\t" | |
| 385 "pcmpeqd bFF, %%mm0 \n\t" | |
| 386 // "movd %%mm0, (%1, %2, 4)\n\t" | |
| 387 "movd %%mm0, %0 \n\t" | |
| 388 : "=r" (isOk) | |
| 389 : "r" (src), "r" (stride) | |
| 390 ); | |
| 391 return isOk; | |
| 392 #else | |
| 393 | |
| 394 int isOk2= 1; | |
| 395 int x; | |
| 396 src+= stride*3; | |
| 397 for(x=0; x<BLOCK_SIZE; x++) | |
| 398 { | |
| 399 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; | |
| 400 } | |
| 401 /* if(isOk && !isOk2 || !isOk && isOk2) | |
| 402 { | |
| 403 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); | |
| 404 for(int y=0; y<9; y++) | |
| 405 { | |
| 406 for(int x=0; x<8; x++) | |
| 407 { | |
| 408 printf("%d ", src[x + y*stride]); | |
| 409 } | |
| 410 printf("\n"); | |
| 411 } | |
| 412 } */ | |
| 413 | |
| 414 return isOk2; | |
| 415 #endif | |
| 416 | |
| 417 } | |
| 418 | |
| 419 /** | |
| 420 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) | |
| 421 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | |
| 422 */ | |
| 423 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | |
| 424 { | |
| 425 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 426 src+= stride*3; | |
| 427 asm volatile( //"movv %0 %1 %2\n\t" | |
| 428 "movq pQPb, %%mm0 \n\t" // QP,..., QP | |
| 429 | |
| 430 "movq (%0), %%mm6 \n\t" | |
| 431 "movq (%0, %1), %%mm5 \n\t" | |
| 432 "movq %%mm5, %%mm1 \n\t" | |
| 433 "movq %%mm6, %%mm2 \n\t" | |
| 434 "psubusb %%mm6, %%mm5 \n\t" | |
| 435 "psubusb %%mm1, %%mm2 \n\t" | |
| 436 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 437 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
| 438 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | |
| 439 | |
| 440 "pand %%mm2, %%mm6 \n\t" | |
| 441 "pandn %%mm1, %%mm2 \n\t" | |
| 442 "por %%mm2, %%mm6 \n\t"// First Line to Filter | |
| 443 | |
| 444 "movq (%0, %1, 8), %%mm5 \n\t" | |
| 445 "leal (%0, %1, 4), %%eax \n\t" | |
| 446 "leal (%0, %1, 8), %%ebx \n\t" | |
| 447 "subl %1, %%ebx \n\t" | |
| 448 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | |
| 449 "movq (%0, %1, 8), %%mm7 \n\t" | |
| 450 "movq %%mm5, %%mm1 \n\t" | |
| 451 "movq %%mm7, %%mm2 \n\t" | |
| 452 "psubusb %%mm7, %%mm5 \n\t" | |
| 453 "psubusb %%mm1, %%mm2 \n\t" | |
| 454 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 455 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | |
| 456 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | |
| 457 | |
| 458 "pand %%mm2, %%mm7 \n\t" | |
| 459 "pandn %%mm1, %%mm2 \n\t" | |
| 460 "por %%mm2, %%mm7 \n\t" // First Line to Filter | |
| 461 | |
| 462 | |
| 463 // 1 2 3 4 5 6 7 8 | |
| 464 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 | |
| 465 // 6 4 2 2 1 1 | |
| 466 // 6 4 4 2 | |
| 467 // 6 8 2 | |
| 468 | |
| 469 "movq (%0, %1), %%mm0 \n\t" // 1 | |
| 470 "movq %%mm0, %%mm1 \n\t" // 1 | |
| 471 PAVGB(%%mm6, %%mm0) //1 1 /2 | |
| 472 PAVGB(%%mm6, %%mm0) //3 1 /4 | |
| 473 | |
| 474 "movq (%0, %1, 4), %%mm2 \n\t" // 1 | |
| 475 "movq %%mm2, %%mm5 \n\t" // 1 | |
| 476 PAVGB((%%eax), %%mm2) // 11 /2 | |
| 477 PAVGB((%0, %1, 2), %%mm2) // 211 /4 | |
| 478 "movq %%mm2, %%mm3 \n\t" // 211 /4 | |
| 479 "movq (%0), %%mm4 \n\t" // 1 | |
| 480 PAVGB(%%mm4, %%mm3) // 4 211 /8 | |
| 481 PAVGB(%%mm0, %%mm3) //642211 /16 | |
| 482 "movq %%mm3, (%0) \n\t" // X | |
| 483 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | |
| 484 "movq %%mm1, %%mm0 \n\t" // 1 | |
| 485 PAVGB(%%mm6, %%mm0) //1 1 /2 | |
| 486 "movq %%mm4, %%mm3 \n\t" // 1 | |
| 487 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 | |
| 488 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 | |
| 489 PAVGB((%%eax), %%mm5) // 211 /4 | |
| 490 PAVGB(%%mm5, %%mm3) // 2 2211 /8 | |
| 491 PAVGB(%%mm0, %%mm3) //4242211 /16 | |
| 492 "movq %%mm3, (%0,%1) \n\t" // X | |
| 493 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | |
| 494 PAVGB(%%mm4, %%mm6) //11 /2 | |
| 495 "movq (%%ebx), %%mm0 \n\t" // 1 | |
| 496 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 | |
| 497 "movq %%mm0, %%mm3 \n\t" // 11/2 | |
| 498 PAVGB(%%mm1, %%mm0) // 2 11/4 | |
| 499 PAVGB(%%mm6, %%mm0) //222 11/8 | |
| 500 PAVGB(%%mm2, %%mm0) //22242211/16 | |
| 501 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | |
| 502 "movq %%mm0, (%0, %1, 2) \n\t" // X | |
| 503 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | |
| 504 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | |
| 505 PAVGB((%%ebx), %%mm0) // 11 /2 | |
| 506 PAVGB(%%mm0, %%mm6) //11 11 /4 | |
| 507 PAVGB(%%mm1, %%mm4) // 11 /2 | |
| 508 PAVGB(%%mm2, %%mm1) // 11 /2 | |
| 509 PAVGB(%%mm1, %%mm6) //1122 11 /8 | |
| 510 PAVGB(%%mm5, %%mm6) //112242211 /16 | |
| 511 "movq (%%eax), %%mm5 \n\t" // 1 | |
| 512 "movq %%mm6, (%%eax) \n\t" // X | |
| 513 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | |
| 514 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 | |
| 515 PAVGB(%%mm7, %%mm6) // 11 /2 | |
| 516 PAVGB(%%mm4, %%mm6) // 11 11 /4 | |
| 517 PAVGB(%%mm3, %%mm6) // 11 2211 /8 | |
| 518 PAVGB(%%mm5, %%mm2) // 11 /2 | |
| 519 "movq (%0, %1, 4), %%mm4 \n\t" // 1 | |
| 520 PAVGB(%%mm4, %%mm2) // 112 /4 | |
| 521 PAVGB(%%mm2, %%mm6) // 112242211 /16 | |
| 522 "movq %%mm6, (%0, %1, 4) \n\t" // X | |
| 523 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | |
| 524 PAVGB(%%mm7, %%mm1) // 11 2 /4 | |
| 525 PAVGB(%%mm4, %%mm5) // 11 /2 | |
| 526 PAVGB(%%mm5, %%mm0) // 11 11 /4 | |
| 527 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | |
| 528 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | |
| 529 PAVGB(%%mm0, %%mm1) // 11224222 /16 | |
| 530 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | |
| 531 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | |
| 532 PAVGB((%%ebx), %%mm2) // 112 4 /8 | |
| 533 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | |
| 534 PAVGB(%%mm0, %%mm6) // 1 1 /2 | |
| 535 PAVGB(%%mm7, %%mm6) // 1 12 /4 | |
| 536 PAVGB(%%mm2, %%mm6) // 1122424 /4 | |
| 537 "movq %%mm6, (%%ebx) \n\t" // X | |
| 538 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | |
| 539 PAVGB(%%mm7, %%mm5) // 11 2 /4 | |
| 540 PAVGB(%%mm7, %%mm5) // 11 6 /8 | |
| 541 | |
| 542 PAVGB(%%mm3, %%mm0) // 112 /4 | |
| 543 PAVGB(%%mm0, %%mm5) // 112246 /16 | |
| 544 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | |
| 545 "subl %1, %0 \n\t" | |
| 546 | |
| 547 : | |
| 548 : "r" (src), "r" (stride) | |
| 549 : "%eax", "%ebx" | |
| 550 ); | |
| 551 #else | |
| 552 const int l1= stride; | |
| 553 const int l2= stride + l1; | |
| 554 const int l3= stride + l2; | |
| 555 const int l4= stride + l3; | |
| 556 const int l5= stride + l4; | |
| 557 const int l6= stride + l5; | |
| 558 const int l7= stride + l6; | |
| 559 const int l8= stride + l7; | |
| 560 const int l9= stride + l8; | |
| 561 int x; | |
| 562 src+= stride*3; | |
| 563 for(x=0; x<BLOCK_SIZE; x++) | |
| 564 { | |
| 565 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | |
| 566 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | |
| 567 | |
| 568 int sums[9]; | |
| 569 sums[0] = first + src[l1]; | |
| 570 sums[1] = src[l1] + src[l2]; | |
| 571 sums[2] = src[l2] + src[l3]; | |
| 572 sums[3] = src[l3] + src[l4]; | |
| 573 sums[4] = src[l4] + src[l5]; | |
| 574 sums[5] = src[l5] + src[l6]; | |
| 575 sums[6] = src[l6] + src[l7]; | |
| 576 sums[7] = src[l7] + src[l8]; | |
| 577 sums[8] = src[l8] + last; | |
| 578 | |
| 579 src[l1]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | |
| 580 src[l2]= ((src[l2]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; | |
| 581 src[l3]= ((src[l3]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; | |
| 582 src[l4]= ((src[l4]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; | |
| 583 src[l5]= ((src[l5]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; | |
| 584 src[l6]= ((src[l6]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; | |
| 585 src[l7]= (((last + src[l7])<<2) + ((src[l8] + sums[5])<<1) + sums[3] + 8)>>4; | |
| 586 src[l8]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; | |
| 587 | |
| 588 src++; | |
| 589 } | |
| 590 | |
| 591 #endif | |
| 592 } | |
| 593 | |
| 594 /** | |
| 595 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 596 * values are correctly clipped (MMX2) | |
| 597 * values are wraparound (C) | |
| 598 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
| 599 0 8 16 24 | |
| 600 x = 8 | |
| 601 x/2 = 4 | |
| 602 x/8 = 1 | |
| 603 1 12 12 23 | |
| 604 */ | |
| 605 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) | |
| 606 { | |
| 607 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 608 src+= stride*3; | |
| 609 // FIXME rounding | |
| 610 asm volatile( | |
| 611 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 612 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 613 "leal (%0, %1), %%eax \n\t" | |
| 614 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 615 // 0 1 2 3 4 5 6 7 8 9 | |
| 616 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 617 "movq pQPb, %%mm0 \n\t" // QP,..., QP | |
| 618 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | |
| 619 "paddusb b02, %%mm0 \n\t" | |
| 620 "psrlw $2, %%mm0 \n\t" | |
| 621 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4 | |
| 622 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | |
| 623 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 624 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
| 625 "movq %%mm2, %%mm4 \n\t" // line 4 | |
| 626 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 627 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
| 628 PAVGB(%%mm3, %%mm5) | |
| 629 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | |
| 630 "psubusb %%mm3, %%mm4 \n\t" | |
| 631 "psubusb %%mm2, %%mm3 \n\t" | |
| 632 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 633 "psubusb %%mm0, %%mm4 \n\t" | |
| 634 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 635 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 636 | |
| 637 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 638 "paddb %%mm5, %%mm2 \n\t" | |
| 639 // "psubb %%mm6, %%mm2 \n\t" | |
| 640 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 641 | |
| 642 "movq (%%ebx), %%mm2 \n\t" | |
| 643 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | |
| 644 "psubb %%mm5, %%mm2 \n\t" | |
| 645 // "psubb %%mm6, %%mm2 \n\t" | |
| 646 "movq %%mm2, (%%ebx) \n\t" | |
| 647 | |
| 648 "paddb %%mm6, %%mm5 \n\t" | |
| 649 "psrlw $2, %%mm5 \n\t" | |
| 650 "pand b3F, %%mm5 \n\t" | |
| 651 "psubb b20, %%mm5 \n\t" // (l5-l4)/8 | |
| 652 | |
| 653 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 654 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 655 "paddsb %%mm5, %%mm2 \n\t" | |
| 656 "psubb %%mm6, %%mm2 \n\t" | |
| 657 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 658 | |
| 659 "movq (%%ebx, %1), %%mm2 \n\t" | |
| 660 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | |
| 661 "psubsb %%mm5, %%mm2 \n\t" | |
| 662 "psubb %%mm6, %%mm2 \n\t" | |
| 663 "movq %%mm2, (%%ebx, %1) \n\t" | |
| 664 | |
| 665 : | |
| 666 : "r" (src), "r" (stride) | |
| 667 : "%eax", "%ebx" | |
| 668 ); | |
| 669 #else | |
| 670 const int l1= stride; | |
| 671 const int l2= stride + l1; | |
| 672 const int l3= stride + l2; | |
| 673 const int l4= stride + l3; | |
| 674 const int l5= stride + l4; | |
| 675 const int l6= stride + l5; | |
| 676 // const int l7= stride + l6; | |
| 677 // const int l8= stride + l7; | |
| 678 // const int l9= stride + l8; | |
| 679 int x; | |
| 680 const int QP15= QP + (QP>>2); | |
| 681 src+= stride*3; | |
| 682 for(x=0; x<BLOCK_SIZE; x++) | |
| 683 { | |
| 684 const int v = (src[x+l5] - src[x+l4]); | |
| 685 if(ABS(v) < QP15) | |
| 686 { | |
| 687 src[x+l3] +=v>>3; | |
| 688 src[x+l4] +=v>>1; | |
| 689 src[x+l5] -=v>>1; | |
| 690 src[x+l6] -=v>>3; | |
| 691 | |
| 692 } | |
| 693 } | |
| 694 | |
| 695 #endif | |
| 696 } | |
| 697 | |
| 698 /** | |
| 699 * Experimental Filter 1 | |
| 700 * will not damage linear gradients | |
| 701 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
| 702 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
| 703 * MMX2 version does correct clipping C version doesnt | |
| 704 */ | |
| 705 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | |
| 706 { | |
| 707 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 708 src+= stride*3; | |
| 709 | |
| 710 asm volatile( | |
| 711 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 712 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 713 "leal (%0, %1), %%eax \n\t" | |
| 714 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 715 // 0 1 2 3 4 5 6 7 8 9 | |
| 716 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 717 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | |
| 718 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 | |
| 719 "movq %%mm1, %%mm2 \n\t" // line 4 | |
| 720 "psubusb %%mm0, %%mm1 \n\t" | |
| 721 "psubusb %%mm2, %%mm0 \n\t" | |
| 722 "por %%mm1, %%mm0 \n\t" // |l2 - l3| | |
| 723 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
| 724 "movq (%%ebx, %1), %%mm4 \n\t" // line 6 | |
| 725 "movq %%mm3, %%mm5 \n\t" // line 5 | |
| 726 "psubusb %%mm4, %%mm3 \n\t" | |
| 727 "psubusb %%mm5, %%mm4 \n\t" | |
| 728 "por %%mm4, %%mm3 \n\t" // |l5 - l6| | |
| 729 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 | |
| 730 "movq %%mm2, %%mm1 \n\t" // line 4 | |
| 731 "psubusb %%mm5, %%mm2 \n\t" | |
| 732 "movq %%mm2, %%mm4 \n\t" | |
| 733 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | |
| 734 "psubusb %%mm1, %%mm5 \n\t" | |
| 735 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | |
| 736 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | |
| 737 "movq %%mm4, %%mm3 \n\t" // d | |
| 738 "psubusb pQPb, %%mm4 \n\t" | |
| 739 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | |
| 740 "psubusb b01, %%mm3 \n\t" | |
| 741 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | |
| 742 | |
| 743 PAVGB(%%mm7, %%mm3) // d/2 | |
| 744 "movq %%mm3, %%mm1 \n\t" // d/2 | |
| 745 PAVGB(%%mm7, %%mm3) // d/4 | |
| 746 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
| 747 | |
| 748 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | |
| 749 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | |
| 750 "psubusb %%mm3, %%mm0 \n\t" | |
| 751 "pxor %%mm2, %%mm0 \n\t" | |
| 752 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 | |
| 753 | |
| 754 "movq (%%ebx), %%mm0 \n\t" // line 5 | |
| 755 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | |
| 756 "paddusb %%mm3, %%mm0 \n\t" | |
| 757 "pxor %%mm2, %%mm0 \n\t" | |
| 758 "movq %%mm0, (%%ebx) \n\t" // line 5 | |
| 759 | |
| 760 PAVGB(%%mm7, %%mm1) // d/4 | |
| 761 | |
| 762 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | |
| 763 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | |
| 764 "psubusb %%mm1, %%mm0 \n\t" | |
| 765 "pxor %%mm2, %%mm0 \n\t" | |
| 766 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 | |
| 767 | |
| 768 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 | |
| 769 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | |
| 770 "paddusb %%mm1, %%mm0 \n\t" | |
| 771 "pxor %%mm2, %%mm0 \n\t" | |
| 772 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 | |
| 773 | |
| 774 PAVGB(%%mm7, %%mm1) // d/8 | |
| 775 | |
| 776 "movq (%%eax, %1), %%mm0 \n\t" // line 2 | |
| 777 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | |
| 778 "psubusb %%mm1, %%mm0 \n\t" | |
| 779 "pxor %%mm2, %%mm0 \n\t" | |
| 780 "movq %%mm0, (%%eax, %1) \n\t" // line 2 | |
| 781 | |
| 782 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 | |
| 783 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | |
| 784 "paddusb %%mm1, %%mm0 \n\t" | |
| 785 "pxor %%mm2, %%mm0 \n\t" | |
| 786 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 | |
| 787 | |
| 788 : | |
| 789 : "r" (src), "r" (stride) | |
| 790 : "%eax", "%ebx" | |
| 791 ); | |
| 792 #else | |
| 793 | |
| 794 const int l1= stride; | |
| 795 const int l2= stride + l1; | |
| 796 const int l3= stride + l2; | |
| 797 const int l4= stride + l3; | |
| 798 const int l5= stride + l4; | |
| 799 const int l6= stride + l5; | |
| 800 const int l7= stride + l6; | |
| 801 // const int l8= stride + l7; | |
| 802 // const int l9= stride + l8; | |
| 803 int x; | |
| 804 | |
| 805 src+= stride*3; | |
| 806 for(x=0; x<BLOCK_SIZE; x++) | |
| 807 { | |
| 808 int a= src[l3] - src[l4]; | |
| 809 int b= src[l4] - src[l5]; | |
| 810 int c= src[l5] - src[l6]; | |
| 811 | |
| 812 int d= ABS(b) - ((ABS(a) + ABS(c))>>1); | |
| 813 d= MAX(d, 0); | |
| 814 | |
| 815 if(d < QP) | |
| 816 { | |
| 817 int v = d * SIGN(-b); | |
| 818 | |
| 819 src[l2] +=v>>3; | |
| 820 src[l3] +=v>>2; | |
| 821 src[l4] +=(3*v)>>3; | |
| 822 src[l5] -=(3*v)>>3; | |
| 823 src[l6] -=v>>2; | |
| 824 src[l7] -=v>>3; | |
| 825 | |
| 826 } | |
| 827 src++; | |
| 828 } | |
| 829 /* | |
| 830 const int l1= stride; | |
| 831 const int l2= stride + l1; | |
| 832 const int l3= stride + l2; | |
| 833 const int l4= stride + l3; | |
| 834 const int l5= stride + l4; | |
| 835 const int l6= stride + l5; | |
| 836 const int l7= stride + l6; | |
| 837 const int l8= stride + l7; | |
| 838 const int l9= stride + l8; | |
| 839 for(int x=0; x<BLOCK_SIZE; x++) | |
| 840 { | |
| 841 int v2= src[l2]; | |
| 842 int v3= src[l3]; | |
| 843 int v4= src[l4]; | |
| 844 int v5= src[l5]; | |
| 845 int v6= src[l6]; | |
| 846 int v7= src[l7]; | |
| 847 | |
| 848 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) | |
| 849 { | |
| 850 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; | |
| 851 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; | |
| 852 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | |
| 853 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | |
| 854 } | |
| 855 src++; | |
| 856 } | |
| 857 */ | |
| 858 #endif | |
| 859 } | |
| 860 | |
| 861 /** | |
| 862 * Experimental Filter 1 (Horizontal) | |
| 863 * will not damage linear gradients | |
| 864 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
| 865 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
| 866 * MMX2 version does correct clipping C version doesnt | |
| 867 * not identical with the vertical one | |
| 868 */ | |
| 869 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
| 870 { | |
| 871 int y; | |
| 872 //FIXME (has little in common with the mmx2 version) | |
| 873 for(y=0; y<BLOCK_SIZE; y++) | |
| 874 { | |
| 875 int a= src[1] - src[2]; | |
| 876 int b= src[3] - src[4]; | |
| 877 int c= src[5] - src[6]; | |
| 878 | |
| 879 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
| 880 | |
| 881 if(d < QP) | |
| 882 { | |
| 883 int v = d * SIGN(-b); | |
| 884 | |
| 885 src[1] +=v/8; | |
| 886 src[2] +=v/4; | |
| 887 src[3] +=3*v/8; | |
| 888 src[4] -=3*v/8; | |
| 889 src[5] -=v/4; | |
| 890 src[6] -=v/8; | |
| 891 | |
| 892 } | |
| 893 src+=stride; | |
| 894 } | |
| 895 } | |
| 896 | |
| 897 | |
/**
 * Default vertical deblocking filter for a horizontal 8-pixel block border.
 *
 * src:    points 3 (C path) resp. 4 (asm paths) lines above the lines that
 *         get modified; the function advances it itself before filtering.
 * stride: byte distance between two lines.
 * QP:     quantizer; the C path filters a column only while the "middle
 *         energy" across the border satisfies ABS(middleEnergy) < 8*QP.
 *
 * The C fallback computes per-column left/middle/right energies from the
 * 8 lines around the border and moves lines l4/l5 towards each other by a
 * clipped delta d.  The asm paths approximate the same scheme on packed
 * bytes (MMX2/3DNOW) resp. unpacked words (plain MMX).
 *
 * NOTE(review): the asm references global memory operands (pQPb, b80, b00,
 * b01, w05, w20, temp0..temp3) and the PAVGB/PMINUB/PMAXUB macros defined
 * elsewhere in this file, and clobbers %eax/%ebx — confirm these stay in
 * sync with the rest of the file.
 */
static inline void doVertDefFilter(uint8_t src[], int stride, int QP)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/*
	uint8_t tmp[16];
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= (int)tmp - (int)src - stride*3;
	const int l5= (int)tmp - (int)src - stride*3 + 8;
	const int l6= stride*3 + l3;
	const int l7= stride + l6;
	const int l8= stride + l7;

	memcpy(tmp, src+stride*7, 8);
	memcpy(tmp+8, src+stride*8, 8);
*/
	src+= stride*4;
	asm volatile(

#if 0 //sligtly more accurate and slightly slower
		"pxor %%mm7, %%mm7				\n\t" // 0
		"leal (%0, %1), %%eax			\n\t"
		"leal (%%eax, %1, 4), %%ebx		\n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1


		"movq (%0, %1, 2), %%mm0		\n\t" // l2
		"movq (%0), %%mm1				\n\t" // l0
		"movq %%mm0, %%mm2				\n\t" // l2
		PAVGB(%%mm7, %%mm0)				      // ~l2/2
		PAVGB(%%mm1, %%mm0)				      // ~(l2 + 2l0)/4
		PAVGB(%%mm2, %%mm0)				      // ~(5l2 + 2l0)/8

		"movq (%%eax), %%mm1			\n\t" // l1
		"movq (%%eax, %1, 2), %%mm3		\n\t" // l3
		"movq %%mm1, %%mm4				\n\t" // l1
		PAVGB(%%mm7, %%mm1)				      // ~l1/2
		PAVGB(%%mm3, %%mm1)				      // ~(l1 + 2l3)/4
		PAVGB(%%mm4, %%mm1)				      // ~(5l1 + 2l3)/8

		"movq %%mm0, %%mm4				\n\t" // ~(5l2 + 2l0)/8
		"psubusb %%mm1, %%mm0			\n\t"
		"psubusb %%mm4, %%mm1			\n\t"
		"por %%mm0, %%mm1				\n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
// mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0

		"movq (%0, %1, 4), %%mm0		\n\t" // l4
		"movq %%mm0, %%mm4				\n\t" // l4
		PAVGB(%%mm7, %%mm0)				      // ~l4/2
		PAVGB(%%mm2, %%mm0)				      // ~(l4 + 2l2)/4
		PAVGB(%%mm4, %%mm0)				      // ~(5l4 + 2l2)/8

		"movq (%%ebx), %%mm2			\n\t" // l5
		"movq %%mm3, %%mm5				\n\t" // l3
		PAVGB(%%mm7, %%mm3)				      // ~l3/2
		PAVGB(%%mm2, %%mm3)				      // ~(l3 + 2l5)/4
		PAVGB(%%mm5, %%mm3)				      // ~(5l3 + 2l5)/8

		"movq %%mm0, %%mm6				\n\t" // ~(5l4 + 2l2)/8
		"psubusb %%mm3, %%mm0			\n\t"
		"psubusb %%mm6, %%mm3			\n\t"
		"por %%mm0, %%mm3				\n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
		"pcmpeqb %%mm7, %%mm0			\n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
// mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0

		"movq (%%ebx, %1), %%mm6		\n\t" // l6
		"movq %%mm6, %%mm5				\n\t" // l6
		PAVGB(%%mm7, %%mm6)				      // ~l6/2
		PAVGB(%%mm4, %%mm6)				      // ~(l6 + 2l4)/4
		PAVGB(%%mm5, %%mm6)				      // ~(5l6 + 2l4)/8

		"movq (%%ebx, %1, 2), %%mm5		\n\t" // l7
		"movq %%mm2, %%mm4				\n\t" // l5
		PAVGB(%%mm7, %%mm2)				      // ~l5/2
		PAVGB(%%mm5, %%mm2)				      // ~(l5 + 2l7)/4
		PAVGB(%%mm4, %%mm2)				      // ~(5l5 + 2l7)/8

		"movq %%mm6, %%mm4				\n\t" // ~(5l6 + 2l4)/8
		"psubusb %%mm2, %%mm6			\n\t"
		"psubusb %%mm4, %%mm2			\n\t"
		"por %%mm6, %%mm2				\n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
// mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0


		PMINUB(%%mm2, %%mm1, %%mm4)		      // MIN(|lenergy|,|renergy|)/8
		"movq pQPb, %%mm4				\n\t" // QP //FIXME QP+1 ?
		"paddusb b01, %%mm4				\n\t"
		"pcmpgtb %%mm3, %%mm4			\n\t" // |menergy|/8 < QP
		"psubusb %%mm1, %%mm3			\n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
		"pand %%mm4, %%mm3				\n\t"

		"movq %%mm3, %%mm1				\n\t"
//		"psubusb b01, %%mm3				\n\t"
		PAVGB(%%mm7, %%mm3)
		PAVGB(%%mm7, %%mm3)
		"paddusb %%mm1, %%mm3			\n\t"
//		"paddusb b01, %%mm3				\n\t"

		"movq (%%eax, %1, 2), %%mm6		\n\t" //l3
		"movq (%0, %1, 4), %%mm5		\n\t" //l4
		"movq (%0, %1, 4), %%mm4		\n\t" //l4
		"psubusb %%mm6, %%mm5			\n\t"
		"psubusb %%mm4, %%mm6			\n\t"
		"por %%mm6, %%mm5				\n\t" // |l3-l4|
		"pcmpeqb %%mm7, %%mm6			\n\t" // SIGN(l3-l4)
		"pxor %%mm6, %%mm0				\n\t"
		"pand %%mm0, %%mm3				\n\t"
		PMINUB(%%mm5, %%mm3, %%mm0)

		"psubusb b01, %%mm3				\n\t"
		PAVGB(%%mm7, %%mm3)

		"movq (%%eax, %1, 2), %%mm0		\n\t"
		"movq (%0, %1, 4), %%mm2		\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"psubb %%mm3, %%mm0				\n\t"
		"paddb %%mm3, %%mm2				\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"movq %%mm0, (%%eax, %1, 2)		\n\t"
		"movq %%mm2, (%0, %1, 4)		\n\t"
#endif

		// Active byte-domain path: builds lenergy/menergy/renergy
		// approximations (each /16, biased by 128) via PAVGB chains.
		"leal (%0, %1), %%eax			\n\t"
		"pcmpeqb %%mm6, %%mm6			\n\t" // -1
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1


		"movq (%%eax, %1, 2), %%mm1		\n\t" // l3
		"movq (%0, %1, 4), %%mm0		\n\t" // l4
		"pxor %%mm6, %%mm1				\n\t" // -l3-1
		PAVGB(%%mm1, %%mm0)				      // -q+128 = (l4-l3+256)/2
// mm1=-l3-1, mm0=128-q

		"movq (%%eax, %1, 4), %%mm2		\n\t" // l5
		"movq (%%eax, %1), %%mm3		\n\t" // l2
		"pxor %%mm6, %%mm2				\n\t" // -l5-1
		"movq %%mm2, %%mm5				\n\t" // -l5-1
		"movq b80, %%mm4				\n\t" // 128
		"leal (%%eax, %1, 4), %%ebx		\n\t"
		PAVGB(%%mm3, %%mm2)				      // (l2-l5+256)/2
		PAVGB(%%mm0, %%mm4)				      // ~(l4-l3)/4 + 128
		PAVGB(%%mm2, %%mm4)				      // ~(l2-l5)/4 +(l4-l3)/8 + 128
		PAVGB(%%mm0, %%mm4)				      // ~(l2-l5)/8 +5(l4-l3)/16 + 128
// mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1

		"movq (%%eax), %%mm2			\n\t" // l1
		"pxor %%mm6, %%mm2				\n\t" // -l1-1
		PAVGB(%%mm3, %%mm2)				      // (l2-l1+256)/2
		PAVGB((%0), %%mm1)				      // (l0-l3+256)/2
		"movq b80, %%mm3				\n\t" // 128
		PAVGB(%%mm2, %%mm3)				      // ~(l2-l1)/4 + 128
		PAVGB(%%mm1, %%mm3)				      // ~(l0-l3)/4 +(l2-l1)/8 + 128
		PAVGB(%%mm2, %%mm3)				      // ~(l0-l3)/8 +5(l2-l1)/16 + 128
// mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1

		PAVGB((%%ebx, %1), %%mm5)			  // (l6-l5+256)/2
		"movq (%%ebx, %1, 2), %%mm1		\n\t" // l7
		"pxor %%mm6, %%mm1				\n\t" // -l7-1
		PAVGB((%0, %1, 4), %%mm1)			  // (l4-l7+256)/2
		"movq b80, %%mm2				\n\t" // 128
		PAVGB(%%mm5, %%mm2)				      // ~(l6-l5)/4 + 128
		PAVGB(%%mm1, %%mm2)				      // ~(l4-l7)/4 +(l6-l5)/8 + 128
		PAVGB(%%mm5, %%mm2)				      // ~(l4-l7)/8 +5(l6-l5)/16 + 128
// mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128

		"movq b00, %%mm1				\n\t" // 0
		"movq b00, %%mm5				\n\t" // 0
		"psubb %%mm2, %%mm1				\n\t" // 128 - renergy/16
		"psubb %%mm3, %%mm5				\n\t" // 128 - lenergy/16
		PMAXUB(%%mm1, %%mm2)				  // 128 + |renergy/16|
		PMAXUB(%%mm5, %%mm3)				  // 128 + |lenergy/16|
		PMINUB(%%mm2, %%mm3, %%mm1)			  // 128 + MIN(|lenergy|,|renergy|)/16

// mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128

		"movq b00, %%mm7				\n\t" // 0
		"movq pQPb, %%mm2				\n\t" // QP
		PAVGB(%%mm6, %%mm2)				      // 128 + QP/2
		"psubb %%mm6, %%mm2				\n\t"

		"movq %%mm4, %%mm1				\n\t"
		"pcmpgtb %%mm7, %%mm1			\n\t" // SIGN(menergy)
		"pxor %%mm1, %%mm4				\n\t"
		"psubb %%mm1, %%mm4				\n\t" // 128 + |menergy|/16
		"pcmpgtb %%mm4, %%mm2			\n\t" // |menergy|/16 < QP/2
		"psubusb %%mm3, %%mm4			\n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
// mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16

		"movq %%mm4, %%mm3				\n\t" // d
		"psubusb b01, %%mm4				\n\t"
		PAVGB(%%mm7, %%mm4)				      // d/32
		PAVGB(%%mm7, %%mm4)				      // (d + 32)/64
		"paddb %%mm3, %%mm4				\n\t" // 5d/64
		"pand %%mm2, %%mm4				\n\t"

		"movq b80, %%mm5				\n\t" // 128
		"psubb %%mm0, %%mm5				\n\t" // q
		"paddsb %%mm6, %%mm5			\n\t" // fix bad rounding
		"pcmpgtb %%mm5, %%mm7			\n\t" // SIGN(q)
		"pxor %%mm7, %%mm5				\n\t"

		PMINUB(%%mm5, %%mm4, %%mm3)			  // MIN(|q|, 5d/64)
		"pxor %%mm1, %%mm7				\n\t" // SIGN(d*q)

		// apply +/- delta to l3 and l4 in the sign domain of menergy
		"pand %%mm7, %%mm4				\n\t"
		"movq (%%eax, %1, 2), %%mm0		\n\t"
		"movq (%0, %1, 4), %%mm2		\n\t"
		"pxor %%mm1, %%mm0				\n\t"
		"pxor %%mm1, %%mm2				\n\t"
		"paddb %%mm4, %%mm0				\n\t"
		"psubb %%mm4, %%mm2				\n\t"
		"pxor %%mm1, %%mm0				\n\t"
		"pxor %%mm1, %%mm2				\n\t"
		"movq %%mm0, (%%eax, %1, 2)		\n\t"
		"movq %%mm2, (%0, %1, 4)		\n\t"

		:
		: "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

/*
	{
	int x;
	src-= stride;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy)< 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
	src-=8;
	for(x=0; x<8; x++)
	{
		int y;
		for(y=4; y<6; y++)
		{
			int d= src[x+y*stride] - tmp[x+(y-4)*8];
			int ad= ABS(d);
			static int max=0;
			static int sum=0;
			static int num=0;
			static int bias=0;

			if(max<ad) max=ad;
			sum+= ad>3 ? 1 : 0;
			if(ad>3)
			{
				src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
			}
			if(y==4) bias+=d;
			num++;
			if(num%1000000 == 0)
			{
				printf(" %d %d %d %d\n", num, sum, max, bias);
			}
		}
	}
	}
*/
#elif defined (HAVE_MMX)
	// Word-domain MMX path: exact 16-bit arithmetic on unpacked low/high
	// halves (L*/H*), spilled to the temp0..temp3 globals.
	src+= stride*4;

	asm volatile(
		"pxor %%mm7, %%mm7				\n\t"
		"leal (%0, %1), %%eax			\n\t"
		"leal (%%eax, %1, 4), %%ebx		\n\t"
//	0	1	2	3	4	5	6	7
//	%0	%0+%1	%0+2%1	eax+2%1	%0+4%1	eax+4%1	ebx+%1	ebx+2%1
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1

		"movq (%0), %%mm0				\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"punpcklbw %%mm7, %%mm0			\n\t" // low part of line 0
		"punpckhbw %%mm7, %%mm1			\n\t" // high part of line 0

		"movq (%%eax), %%mm2			\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2			\n\t" // low part of line 1
		"punpckhbw %%mm7, %%mm3			\n\t" // high part of line 1

		"movq (%%eax, %1), %%mm4		\n\t"
		"movq %%mm4, %%mm5				\n\t"
		"punpcklbw %%mm7, %%mm4			\n\t" // low part of line 2
		"punpckhbw %%mm7, %%mm5			\n\t" // high part of line 2

		"paddw %%mm0, %%mm0				\n\t" // 2L0
		"paddw %%mm1, %%mm1				\n\t" // 2H0
		"psubw %%mm4, %%mm2				\n\t" // L1 - L2
		"psubw %%mm5, %%mm3				\n\t" // H1 - H2
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - L1 + L2
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - H1 + H2

		"psllw $2, %%mm2				\n\t" // 4L1 - 4L2
		"psllw $2, %%mm3				\n\t" // 4H1 - 4H2
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2

		"movq (%%eax, %1, 2), %%mm2		\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2			\n\t" // L3
		"punpckhbw %%mm7, %%mm3			\n\t" // H3

		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - L3
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - H3
		"psubw %%mm2, %%mm0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"psubw %%mm3, %%mm1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3
		"movq %%mm0, temp0				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq %%mm1, temp1				\n\t" // 2H0 - 5H1 + 5H2 - 2H3

		"movq (%0, %1, 4), %%mm0		\n\t"
		"movq %%mm0, %%mm1				\n\t"
		"punpcklbw %%mm7, %%mm0			\n\t" // L4
		"punpckhbw %%mm7, %%mm1			\n\t" // H4

		"psubw %%mm0, %%mm2				\n\t" // L3 - L4
		"psubw %%mm1, %%mm3				\n\t" // H3 - H4
		"movq %%mm2, temp2				\n\t" // L3 - L4
		"movq %%mm3, temp3				\n\t" // H3 - H4
		"paddw %%mm4, %%mm4				\n\t" // 2L2
		"paddw %%mm5, %%mm5				\n\t" // 2H2
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - L3 + L4
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - H3 + H4

		"psllw $2, %%mm2				\n\t" // 4L3 - 4L4
		"psllw $2, %%mm3				\n\t" // 4H3 - 4H4
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4
//50 opcodes so far
		"movq (%%ebx), %%mm2			\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2			\n\t" // L5
		"punpckhbw %%mm7, %%mm3			\n\t" // H5
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - L5
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - H5
		"psubw %%mm2, %%mm4				\n\t" // 2L2 - 5L3 + 5L4 - 2L5
		"psubw %%mm3, %%mm5				\n\t" // 2H2 - 5H3 + 5H4 - 2H5

		"movq (%%ebx, %1), %%mm6		\n\t"
		"punpcklbw %%mm7, %%mm6			\n\t" // L6
		"psubw %%mm6, %%mm2				\n\t" // L5 - L6
		"movq (%%ebx, %1), %%mm6		\n\t"
		"punpckhbw %%mm7, %%mm6			\n\t" // H6
		"psubw %%mm6, %%mm3				\n\t" // H5 - H6

		"paddw %%mm0, %%mm0				\n\t" // 2L4
		"paddw %%mm1, %%mm1				\n\t" // 2H4
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - L5 + L6
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - H5 + H6

		"psllw $2, %%mm2				\n\t" // 4L5 - 4L6
		"psllw $2, %%mm3				\n\t" // 4H5 - 4H6
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6

		"movq (%%ebx, %1, 2), %%mm2		\n\t"
		"movq %%mm2, %%mm3				\n\t"
		"punpcklbw %%mm7, %%mm2			\n\t" // L7
		"punpckhbw %%mm7, %%mm3			\n\t" // H7

		"paddw %%mm2, %%mm2				\n\t" // 2L7
		"paddw %%mm3, %%mm3				\n\t" // 2H7
		"psubw %%mm2, %%mm0				\n\t" // 2L4 - 5L5 + 5L6 - 2L7
		"psubw %%mm3, %%mm1				\n\t" // 2H4 - 5H5 + 5H6 - 2H7

		"movq temp0, %%mm2				\n\t" // 2L0 - 5L1 + 5L2 - 2L3
		"movq temp1, %%mm3				\n\t" // 2H0 - 5H1 + 5H2 - 2H3

#ifdef HAVE_MMX2
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm0, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm0			\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm1, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm1			\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm2, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm2			\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6				\n\t" // 0
		"psubw %%mm3, %%mm6				\n\t"
		"pmaxsw %%mm6, %%mm3			\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#else
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm0, %%mm6			\n\t"
		"pxor %%mm6, %%mm0				\n\t"
		"psubw %%mm6, %%mm0				\n\t" // |2L4 - 5L5 + 5L6 - 2L7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm1, %%mm6			\n\t"
		"pxor %%mm6, %%mm1				\n\t"
		"psubw %%mm6, %%mm1				\n\t" // |2H4 - 5H5 + 5H6 - 2H7|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm2, %%mm6			\n\t"
		"pxor %%mm6, %%mm2				\n\t"
		"psubw %%mm6, %%mm2				\n\t" // |2L0 - 5L1 + 5L2 - 2L3|
		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm3, %%mm6			\n\t"
		"pxor %%mm6, %%mm3				\n\t"
		"psubw %%mm6, %%mm3				\n\t" // |2H0 - 5H1 + 5H2 - 2H3|
#endif

#ifdef HAVE_MMX2
		"pminsw %%mm2, %%mm0			\n\t"
		"pminsw %%mm3, %%mm1			\n\t"
#else
		"movq %%mm0, %%mm6				\n\t"
		"psubusw %%mm2, %%mm6			\n\t"
		"psubw %%mm6, %%mm0				\n\t"
		"movq %%mm1, %%mm6				\n\t"
		"psubusw %%mm3, %%mm6			\n\t"
		"psubw %%mm6, %%mm1				\n\t"
#endif

		"movq %%mm7, %%mm6				\n\t" // 0
		"pcmpgtw %%mm4, %%mm6			\n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
		"pxor %%mm6, %%mm4				\n\t"
		"psubw %%mm6, %%mm4				\n\t" // |2L2 - 5L3 + 5L4 - 2L5|
		"pcmpgtw %%mm5, %%mm7			\n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
		"pxor %%mm7, %%mm5				\n\t"
		"psubw %%mm7, %%mm5				\n\t" // |2H2 - 5H3 + 5H4 - 2H5|
// 100 opcodes
		"movd %2, %%mm2					\n\t" // QP
		"punpcklwd %%mm2, %%mm2			\n\t"
		"punpcklwd %%mm2, %%mm2			\n\t"
		"psllw $3, %%mm2				\n\t" // 8QP
		"movq %%mm2, %%mm3				\n\t" // 8QP
		"pcmpgtw %%mm4, %%mm2			\n\t"
		"pcmpgtw %%mm5, %%mm3			\n\t"
		"pand %%mm2, %%mm4				\n\t"
		"pand %%mm3, %%mm5				\n\t"


		"psubusw %%mm0, %%mm4			\n\t" // hd
		"psubusw %%mm1, %%mm5			\n\t" // ld


		"movq w05, %%mm2				\n\t" // 5
		"pmullw %%mm2, %%mm4			\n\t"
		"pmullw %%mm2, %%mm5			\n\t"
		"movq w20, %%mm2				\n\t" // 32
		"paddw %%mm2, %%mm4				\n\t"
		"paddw %%mm2, %%mm5				\n\t"
		"psrlw $6, %%mm4				\n\t"
		"psrlw $6, %%mm5				\n\t"

/*
		"movq w06, %%mm2				\n\t" // 6
		"paddw %%mm2, %%mm4				\n\t"
		"paddw %%mm2, %%mm5				\n\t"
		"movq w1400, %%mm2				\n\t" // 1400h = 5120 = 5/64*2^16
//FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120
		"pmulhw %%mm2, %%mm4			\n\t" // hd/13
		"pmulhw %%mm2, %%mm5			\n\t" // ld/13
*/

		"movq temp2, %%mm0				\n\t" // L3 - L4
		"movq temp3, %%mm1				\n\t" // H3 - H4

		"pxor %%mm2, %%mm2				\n\t"
		"pxor %%mm3, %%mm3				\n\t"

		"pcmpgtw %%mm0, %%mm2			\n\t" // sign (L3-L4)
		"pcmpgtw %%mm1, %%mm3			\n\t" // sign (H3-H4)
		"pxor %%mm2, %%mm0				\n\t"
		"pxor %%mm3, %%mm1				\n\t"
		"psubw %%mm2, %%mm0				\n\t" // |L3-L4|
		"psubw %%mm3, %%mm1				\n\t" // |H3-H4|
		"psrlw $1, %%mm0				\n\t" // |L3 - L4|/2
		"psrlw $1, %%mm1				\n\t" // |H3 - H4|/2

		"pxor %%mm6, %%mm2				\n\t"
		"pxor %%mm7, %%mm3				\n\t"
		"pand %%mm2, %%mm4				\n\t"
		"pand %%mm3, %%mm5				\n\t"

#ifdef HAVE_MMX2
		"pminsw %%mm0, %%mm4			\n\t"
		"pminsw %%mm1, %%mm5			\n\t"
#else
		"movq %%mm4, %%mm2				\n\t"
		"psubusw %%mm0, %%mm2			\n\t"
		"psubw %%mm2, %%mm4				\n\t"
		"movq %%mm5, %%mm2				\n\t"
		"psubusw %%mm1, %%mm2			\n\t"
		"psubw %%mm2, %%mm5				\n\t"
#endif
		"pxor %%mm6, %%mm4				\n\t"
		"pxor %%mm7, %%mm5				\n\t"
		"psubw %%mm6, %%mm4				\n\t"
		"psubw %%mm7, %%mm5				\n\t"
		"packsswb %%mm5, %%mm4			\n\t"
		"movq (%%eax, %1, 2), %%mm0		\n\t"
		"paddb %%mm4, %%mm0				\n\t"
		"movq %%mm0, (%%eax, %1, 2)		\n\t"
		"movq (%0, %1, 4), %%mm0		\n\t"
		"psubb %%mm4, %%mm0				\n\t"
		"movq %%mm0, (%0, %1, 4)		\n\t"

		:
		: "r" (src), "r" (stride), "r" (QP)
		: "%eax", "%ebx"
	);
#else
	// Plain C reference implementation.
	const int l1= stride;
	const int l2= stride + l1;
	const int l3= stride + l2;
	const int l4= stride + l3;
	const int l5= stride + l4;
	const int l6= stride + l5;
	const int l7= stride + l6;
	const int l8= stride + l7;
//	const int l9= stride + l8;
	int x;
	src+= stride*3;
	for(x=0; x<BLOCK_SIZE; x++)
	{
		const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
		if(ABS(middleEnergy) < 8*QP)
		{
			const int q=(src[l4] - src[l5])/2;
			const int leftEnergy=  5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
			const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);

			int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
			d= MAX(d, 0);

			// scale by ~5/64 and clip the correction to [0, q] resp. [q, 0]
			d= (5*d + 32) >> 6;
			d*= SIGN(-middleEnergy);

			if(q>0)
			{
				d= d<0 ? 0 : d;
				d= d>q ? q : d;
			}
			else
			{
				d= d>0 ? 0 : d;
				d= d<q ? q : d;
			}

			src[l4]-= d;
			src[l5]+= d;
		}
		src++;
	}
#endif
}
| 1476 | |
| 1477 /** | |
| 1478 * Check if the given 8x8 Block is mostly "flat" | |
| 1479 */ | |
| 1480 static inline int isHorizDC(uint8_t src[], int stride) | |
| 1481 { | |
| 1482 int numEq= 0; | |
| 1483 int y; | |
| 1484 for(y=0; y<BLOCK_SIZE; y++) | |
| 1485 { | |
| 1486 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | |
| 1487 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; | |
| 1488 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | |
| 1489 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | |
| 1490 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | |
| 1491 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | |
| 1492 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | |
| 1493 src+= stride; | |
| 1494 } | |
| 1495 return numEq > hFlatnessThreshold; | |
| 1496 } | |
| 1497 | |
/* The row is considered filterable only when its leftmost and rightmost
   pixels differ by at most 2*QP; stride is unused here. */
static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
{
	return abs(src[0] - src[7]) <= 2*QP;
}
| 1504 | |
| 1505 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |
| 1506 { | |
| 1507 int y; | |
| 1508 for(y=0; y<BLOCK_SIZE; y++) | |
| 1509 { | |
| 1510 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |
| 1511 | |
| 1512 if(ABS(middleEnergy) < 8*QP) | |
| 1513 { | |
| 1514 const int q=(dst[3] - dst[4])/2; | |
| 1515 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |
| 1516 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |
| 1517 | |
| 1518 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 1519 d= MAX(d, 0); | |
| 1520 | |
| 1521 d= (5*d + 32) >> 6; | |
| 1522 d*= SIGN(-middleEnergy); | |
| 1523 | |
| 1524 if(q>0) | |
| 1525 { | |
| 1526 d= d<0 ? 0 : d; | |
| 1527 d= d>q ? q : d; | |
| 1528 } | |
| 1529 else | |
| 1530 { | |
| 1531 d= d>0 ? 0 : d; | |
| 1532 d= d<q ? q : d; | |
| 1533 } | |
| 1534 | |
| 1535 dst[3]-= d; | 296 dst[3]-= d; |
| 1536 dst[4]+= d; | 297 dst[4]+= d; |
| 1537 } | 298 } |
| 1538 dst+= stride; | 299 dst+= stride; |
| 1539 } | 300 } |
| 1574 | 335 |
| 1575 dst+= stride; | 336 dst+= stride; |
| 1576 } | 337 } |
| 1577 } | 338 } |
| 1578 | 339 |
| 1579 | 340 /** |
| 1580 static inline void dering(uint8_t src[], int stride, int QP) | 341 * Experimental Filter 1 (Horizontal) |
| 1581 { | 342 * will not damage linear gradients |
| 1582 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 343 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
| 1583 asm volatile( | 344 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
| 1584 "movq pQPb, %%mm0 \n\t" | 345 * MMX2 version does correct clipping C version doesnt |
| 1585 "paddusb %%mm0, %%mm0 \n\t" | 346 * not identical with the vertical one |
| 1586 "movq %%mm0, pQPb2 \n\t" | 347 */ |
| 1587 | 348 static inline void horizX1Filter(uint8_t *src, int stride, int QP) |
| 1588 "leal (%0, %1), %%eax \n\t" | 349 { |
| 1589 "leal (%%eax, %1, 4), %%ebx \n\t" | 350 int y; |
| 1590 // 0 1 2 3 4 5 6 7 8 9 | 351 static uint64_t *lut= NULL; |
| 1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 352 if(lut==NULL) |
| 1592 | 353 { |
| 1593 "pcmpeqb %%mm7, %%mm7 \n\t" | 354 int i; |
| 1594 "pxor %%mm6, %%mm6 \n\t" | 355 lut= (uint64_t*)memalign(8, 256*8); |
| 1595 #ifdef HAVE_MMX2 | 356 for(i=0; i<256; i++) |
| 1596 #define FIND_MIN_MAX(addr)\ | 357 { |
| 1597 "movq " #addr ", %%mm0 \n\t"\ | 358 int v= i < 128 ? 2*i : 2*(i-256); |
| 1598 "pminub %%mm0, %%mm7 \n\t"\ | 359 /* |
| 1599 "pmaxub %%mm0, %%mm6 \n\t" | 360 //Simulate 112242211 9-Tap filter |
| 361 uint64_t a= (v/16) & 0xFF; | |
| 362 uint64_t b= (v/8) & 0xFF; | |
| 363 uint64_t c= (v/4) & 0xFF; | |
| 364 uint64_t d= (3*v/8) & 0xFF; | |
| 365 */ | |
| 366 //Simulate piecewise linear interpolation | |
| 367 uint64_t a= (v/16) & 0xFF; | |
| 368 uint64_t b= (v*3/16) & 0xFF; | |
| 369 uint64_t c= (v*5/16) & 0xFF; | |
| 370 uint64_t d= (7*v/16) & 0xFF; | |
| 371 uint64_t A= (0x100 - a)&0xFF; | |
| 372 uint64_t B= (0x100 - b)&0xFF; | |
| 373 uint64_t C= (0x100 - c)&0xFF; | |
| 374 uint64_t D= (0x100 - c)&0xFF; | |
| 375 | |
| 376 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | | |
| 377 (D<<24) | (C<<16) | (B<<8) | (A); | |
| 378 //lut[i] = (v<<32) | (v<<24); | |
| 379 } | |
| 380 } | |
| 381 | |
| 382 for(y=0; y<BLOCK_SIZE; y++) | |
| 383 { | |
| 384 int a= src[1] - src[2]; | |
| 385 int b= src[3] - src[4]; | |
| 386 int c= src[5] - src[6]; | |
| 387 | |
| 388 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
| 389 | |
| 390 if(d < QP) | |
| 391 { | |
| 392 int v = d * SIGN(-b); | |
| 393 | |
| 394 src[1] +=v/8; | |
| 395 src[2] +=v/4; | |
| 396 src[3] +=3*v/8; | |
| 397 src[4] -=3*v/8; | |
| 398 src[5] -=v/4; | |
| 399 src[6] -=v/8; | |
| 400 | |
| 401 } | |
| 402 src+=stride; | |
| 403 } | |
| 404 } | |
| 405 | |
| 406 | |
| 407 //Note: we have C, MMX, MMX2, 3DNOW version therse no 3DNOW+MMX2 one | |
| 408 //Plain C versions | |
| 409 #undef HAVE_MMX | |
| 410 #undef HAVE_MMX2 | |
| 411 #undef HAVE_3DNOW | |
| 412 #undef ARCH_X86 | |
| 413 #define RENAME(a) a ## _C | |
| 414 #include "postprocess_template.c" | |
| 415 | |
| 416 #ifdef CAN_COMPILE_X86_ASM | |
| 417 | |
| 418 //MMX versions | |
| 419 #undef RENAME | |
| 420 #define HAVE_MMX | |
| 421 #undef HAVE_MMX2 | |
| 422 #undef HAVE_3DNOW | |
| 423 #define ARCH_X86 | |
| 424 #define RENAME(a) a ## _MMX | |
| 425 #include "postprocess_template.c" | |
| 426 | |
| 427 //MMX2 versions | |
| 428 #undef RENAME | |
| 429 #define HAVE_MMX | |
| 430 #define HAVE_MMX2 | |
| 431 #undef HAVE_3DNOW | |
| 432 #define ARCH_X86 | |
| 433 #define RENAME(a) a ## _MMX2 | |
| 434 #include "postprocess_template.c" | |
| 435 | |
| 436 //3DNOW versions | |
| 437 #undef RENAME | |
| 438 #define HAVE_MMX | |
| 439 #undef HAVE_MMX2 | |
| 440 #define HAVE_3DNOW | |
| 441 #define ARCH_X86 | |
| 442 #define RENAME(a) a ## _3DNow | |
| 443 #include "postprocess_template.c" | |
| 444 | |
| 445 #endif //CAN_COMPILE_X86_ASM | |
| 446 | |
| 447 // minor note: the HAVE_xyz is messed up after that line so dont use it | |
| 448 | |
| 449 static inline void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |
| 450 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) | |
| 451 { | |
| 452 // useing ifs here as they are faster than function pointers allthough the | |
| 453 // difference wouldnt be messureable here but its much better because | |
| 454 // someone might exchange the cpu whithout restarting mplayer ;) | |
| 455 | |
| 456 #ifdef CAN_COMPILE_X86_ASM | |
| 457 // ordered per speed fasterst first | |
| 458 if(gCpuCaps.hasMMX2) | |
| 459 postProcess_MMX2(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ppMode); | |
| 460 else if(gCpuCaps.has3DNow) | |
| 461 postProcess_3DNow(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ppMode); | |
| 462 else if(gCpuCaps.hasMMX) | |
| 463 postProcess_MMX(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ppMode); | |
| 464 else | |
| 465 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ppMode); | |
| 1600 #else | 466 #else |
| 1601 #define FIND_MIN_MAX(addr)\ | 467 postProcess_C(src, srcStride, dst, dstStride, width, height, QPs, QPStride, isColor, ppMode); |
| 1602 "movq " #addr ", %%mm0 \n\t"\ | |
| 1603 "movq %%mm7, %%mm1 \n\t"\ | |
| 1604 "psubusb %%mm0, %%mm6 \n\t"\ | |
| 1605 "paddb %%mm0, %%mm6 \n\t"\ | |
| 1606 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 1607 "psubb %%mm1, %%mm7 \n\t" | |
| 1608 #endif | |
| 1609 | |
| 1610 FIND_MIN_MAX((%%eax)) | |
| 1611 FIND_MIN_MAX((%%eax, %1)) | |
| 1612 FIND_MIN_MAX((%%eax, %1, 2)) | |
| 1613 FIND_MIN_MAX((%0, %1, 4)) | |
| 1614 FIND_MIN_MAX((%%ebx)) | |
| 1615 FIND_MIN_MAX((%%ebx, %1)) | |
| 1616 FIND_MIN_MAX((%%ebx, %1, 2)) | |
| 1617 FIND_MIN_MAX((%0, %1, 8)) | |
| 1618 | |
| 1619 "movq %%mm7, %%mm4 \n\t" | |
| 1620 "psrlq $8, %%mm7 \n\t" | |
| 1621 #ifdef HAVE_MMX2 | |
| 1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1626 "pminub %%mm4, %%mm7 \n\t" | |
| 1627 #else | |
| 1628 "movq %%mm7, %%mm1 \n\t" | |
| 1629 "psubusb %%mm4, %%mm1 \n\t" | |
| 1630 "psubb %%mm1, %%mm7 \n\t" | |
| 1631 "movq %%mm7, %%mm4 \n\t" | |
| 1632 "psrlq $16, %%mm7 \n\t" | |
| 1633 "movq %%mm7, %%mm1 \n\t" | |
| 1634 "psubusb %%mm4, %%mm1 \n\t" | |
| 1635 "psubb %%mm1, %%mm7 \n\t" | |
| 1636 "movq %%mm7, %%mm4 \n\t" | |
| 1637 "psrlq $32, %%mm7 \n\t" | |
| 1638 "movq %%mm7, %%mm1 \n\t" | |
| 1639 "psubusb %%mm4, %%mm1 \n\t" | |
| 1640 "psubb %%mm1, %%mm7 \n\t" | |
| 1641 #endif | |
| 1642 | |
| 1643 | |
| 1644 "movq %%mm6, %%mm4 \n\t" | |
| 1645 "psrlq $8, %%mm6 \n\t" | |
| 1646 #ifdef HAVE_MMX2 | |
| 1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels | |
| 1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t" | |
| 1649 "pmaxub %%mm4, %%mm6 \n\t" | |
| 1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t" | |
| 1651 "pmaxub %%mm4, %%mm6 \n\t" | |
| 1652 #else | |
| 1653 "psubusb %%mm4, %%mm6 \n\t" | |
| 1654 "paddb %%mm4, %%mm6 \n\t" | |
| 1655 "movq %%mm6, %%mm4 \n\t" | |
| 1656 "psrlq $16, %%mm6 \n\t" | |
| 1657 "psubusb %%mm4, %%mm6 \n\t" | |
| 1658 "paddb %%mm4, %%mm6 \n\t" | |
| 1659 "movq %%mm6, %%mm4 \n\t" | |
| 1660 "psrlq $32, %%mm6 \n\t" | |
| 1661 "psubusb %%mm4, %%mm6 \n\t" | |
| 1662 "paddb %%mm4, %%mm6 \n\t" | |
| 1663 #endif | |
| 1664 "movq %%mm6, %%mm0 \n\t" // max | |
| 1665 "psubb %%mm7, %%mm6 \n\t" // max - min | |
| 1666 "movd %%mm6, %%ecx \n\t" | |
| 1667 "cmpb deringThreshold, %%cl \n\t" | |
| 1668 " jb 1f \n\t" | |
| 1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | |
| 1670 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 1671 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 1672 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 1673 "movq %%mm7, temp0 \n\t" | |
| 1674 | |
| 1675 "movq (%0), %%mm0 \n\t" // L10 | |
| 1676 "movq %%mm0, %%mm1 \n\t" // L10 | |
| 1677 "movq %%mm0, %%mm2 \n\t" // L10 | |
| 1678 "psllq $8, %%mm1 \n\t" | |
| 1679 "psrlq $8, %%mm2 \n\t" | |
| 1680 "movd -4(%0), %%mm3 \n\t" | |
| 1681 "movd 8(%0), %%mm4 \n\t" | |
| 1682 "psrlq $24, %%mm3 \n\t" | |
| 1683 "psllq $56, %%mm4 \n\t" | |
| 1684 "por %%mm3, %%mm1 \n\t" // L00 | |
| 1685 "por %%mm4, %%mm2 \n\t" // L20 | |
| 1686 "movq %%mm1, %%mm3 \n\t" // L00 | |
| 1687 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | |
| 1688 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | |
| 1689 "psubusb %%mm7, %%mm0 \n\t" | |
| 1690 "psubusb %%mm7, %%mm2 \n\t" | |
| 1691 "psubusb %%mm7, %%mm3 \n\t" | |
| 1692 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1 | |
| 1693 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1 | |
| 1694 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1 | |
| 1695 "paddb %%mm2, %%mm0 \n\t" | |
| 1696 "paddb %%mm3, %%mm0 \n\t" | |
| 1697 | |
| 1698 "movq (%%eax), %%mm2 \n\t" // L11 | |
| 1699 "movq %%mm2, %%mm3 \n\t" // L11 | |
| 1700 "movq %%mm2, %%mm4 \n\t" // L11 | |
| 1701 "psllq $8, %%mm3 \n\t" | |
| 1702 "psrlq $8, %%mm4 \n\t" | |
| 1703 "movd -4(%%eax), %%mm5 \n\t" | |
| 1704 "movd 8(%%eax), %%mm6 \n\t" | |
| 1705 "psrlq $24, %%mm5 \n\t" | |
| 1706 "psllq $56, %%mm6 \n\t" | |
| 1707 "por %%mm5, %%mm3 \n\t" // L01 | |
| 1708 "por %%mm6, %%mm4 \n\t" // L21 | |
| 1709 "movq %%mm3, %%mm5 \n\t" // L01 | |
| 1710 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | |
| 1711 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | |
| 1712 "psubusb %%mm7, %%mm2 \n\t" | |
| 1713 "psubusb %%mm7, %%mm4 \n\t" | |
| 1714 "psubusb %%mm7, %%mm5 \n\t" | |
| 1715 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1 | |
| 1716 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1 | |
| 1717 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1 | |
| 1718 "paddb %%mm4, %%mm2 \n\t" | |
| 1719 "paddb %%mm5, %%mm2 \n\t" | |
| 1720 // 0, 2, 3, 1 | |
| 1721 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | |
| 1722 "movq " #src ", " #sx " \n\t" /* src[0] */\ | |
| 1723 "movq " #sx ", " #lx " \n\t" /* src[0] */\ | |
| 1724 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ | |
| 1725 "psllq $8, " #lx " \n\t"\ | |
| 1726 "psrlq $8, " #t0 " \n\t"\ | |
| 1727 "movd -4" #src ", " #t1 " \n\t"\ | |
| 1728 "psrlq $24, " #t1 " \n\t"\ | |
| 1729 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ | |
| 1730 "movd 8" #src ", " #t1 " \n\t"\ | |
| 1731 "psllq $56, " #t1 " \n\t"\ | |
| 1732 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | |
| 1733 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | |
| 1734 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | |
| 1735 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | |
| 1736 PAVGB(lx, pplx) \ | |
| 1737 "movq " #lx ", temp1 \n\t"\ | |
| 1738 "movq temp0, " #lx " \n\t"\ | |
| 1739 "psubusb " #lx ", " #t1 " \n\t"\ | |
| 1740 "psubusb " #lx ", " #t0 " \n\t"\ | |
| 1741 "psubusb " #lx ", " #sx " \n\t"\ | |
| 1742 "movq b00, " #lx " \n\t"\ | |
| 1743 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ | |
| 1744 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | |
| 1745 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | |
| 1746 "paddb " #t1 ", " #t0 " \n\t"\ | |
| 1747 "paddb " #t0 ", " #sx " \n\t"\ | |
| 1748 \ | |
| 1749 PAVGB(plx, pplx) /* filtered */\ | |
| 1750 "movq " #dst ", " #t0 " \n\t" /* dst */\ | |
| 1751 "movq " #t0 ", " #t1 " \n\t" /* dst */\ | |
| 1752 "psubusb pQPb2, " #t0 " \n\t"\ | |
| 1753 "paddusb pQPb2, " #t1 " \n\t"\ | |
| 1754 PMAXUB(t0, pplx)\ | |
| 1755 PMINUB(t1, pplx, t0)\ | |
| 1756 "paddb " #sx ", " #ppsx " \n\t"\ | |
| 1757 "paddb " #psx ", " #ppsx " \n\t"\ | |
| 1758 "#paddb b02, " #ppsx " \n\t"\ | |
| 1759 "pand b08, " #ppsx " \n\t"\ | |
| 1760 "pcmpeqb " #lx ", " #ppsx " \n\t"\ | |
| 1761 "pand " #ppsx ", " #pplx " \n\t"\ | |
| 1762 "pandn " #dst ", " #ppsx " \n\t"\ | |
| 1763 "por " #pplx ", " #ppsx " \n\t"\ | |
| 1764 "movq " #ppsx ", " #dst " \n\t"\ | |
| 1765 "movq temp1, " #lx " \n\t" | |
| 1766 | |
| 1767 /* | |
| 1768 0000000 | |
| 1769 1111111 | |
| 1770 | |
| 1771 1111110 | |
| 1772 1111101 | |
| 1773 1111100 | |
| 1774 1111011 | |
| 1775 1111010 | |
| 1776 1111001 | |
| 1777 | |
| 1778 1111000 | |
| 1779 1110111 | |
| 1780 | |
| 1781 */ | |
| 1782 //DERING_CORE(dst,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) | |
| 1783 DERING_CORE((%%eax),(%%eax, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1784 DERING_CORE((%%eax, %1),(%%eax, %1, 2) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1785 DERING_CORE((%%eax, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1786 DERING_CORE((%0, %1, 4),(%%ebx) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1787 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | |
| 1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | |
| 1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | |
| 1791 | |
| 1792 "1: \n\t" | |
| 1793 : : "r" (src), "r" (stride), "r" (QP) | |
| 1794 : "%eax", "%ebx", "%ecx" | |
| 1795 ); | |
| 1796 #else | |
| 1797 int y; | |
| 1798 int min=255; | |
| 1799 int max=0; | |
| 1800 int avg; | |
| 1801 uint8_t *p; | |
| 1802 int s[10]; | |
| 1803 | |
| 1804 for(y=1; y<9; y++) | |
| 1805 { | |
| 1806 int x; | |
| 1807 p= src + stride*y; | |
| 1808 for(x=1; x<9; x++) | |
| 1809 { | |
| 1810 p++; | |
| 1811 if(*p > max) max= *p; | |
| 1812 if(*p < min) min= *p; | |
| 1813 } | |
| 1814 } | |
| 1815 avg= (min + max + 1)/2; | |
| 1816 | |
| 1817 if(max - min <deringThreshold) return; | |
| 1818 | |
| 1819 for(y=0; y<10; y++) | |
| 1820 { | |
| 1821 int x; | |
| 1822 int t = 0; | |
| 1823 p= src + stride*y; | |
| 1824 for(x=0; x<10; x++) | |
| 1825 { | |
| 1826 if(*p > avg) t |= (1<<x); | |
| 1827 p++; | |
| 1828 } | |
| 1829 t |= (~t)<<16; | |
| 1830 t &= (t<<1) & (t>>1); | |
| 1831 s[y] = t; | |
| 1832 } | |
| 1833 | |
| 1834 for(y=1; y<9; y++) | |
| 1835 { | |
| 1836 int x; | |
| 1837 int t = s[y-1] & s[y] & s[y+1]; | |
| 1838 t|= t>>16; | |
| 1839 | |
| 1840 p= src + stride*y; | |
| 1841 for(x=1; x<9; x++) | |
| 1842 { | |
| 1843 p++; | |
| 1844 if(t & (1<<x)) | |
| 1845 { | |
| 1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | |
| 1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | |
| 1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | |
| 1849 f= (f + 8)>>4; | |
| 1850 | |
| 1851 #ifdef DEBUG_DERING_THRESHOLD | |
| 1852 asm volatile("emms\n\t":); | |
| 1853 { | |
| 1854 static long long numPixels=0; | |
| 1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1856 // if((max-min)<20 || (max-min)*QP<200) | |
| 1857 // if((max-min)*QP < 500) | |
| 1858 // if(max-min<QP/2) | |
| 1859 if(max-min < 20) | |
| 1860 { | |
| 1861 static int numSkiped=0; | |
| 1862 static int errorSum=0; | |
| 1863 static int worstQP=0; | |
| 1864 static int worstRange=0; | |
| 1865 static int worstDiff=0; | |
| 1866 int diff= (f - *p); | |
| 1867 int absDiff= ABS(diff); | |
| 1868 int error= diff*diff; | |
| 1869 | |
| 1870 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1871 | |
| 1872 numSkiped++; | |
| 1873 if(absDiff > worstDiff) | |
| 1874 { | |
| 1875 worstDiff= absDiff; | |
| 1876 worstQP= QP; | |
| 1877 worstRange= max-min; | |
| 1878 } | |
| 1879 errorSum+= error; | |
| 1880 | |
| 1881 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1882 { | |
| 1883 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1886 worstDiff, (float)numSkiped/numPixels); | |
| 1887 } | |
| 1888 } | |
| 1889 } | |
| 1890 #endif | |
| 1891 if (*p + 2*QP < f) *p= *p + 2*QP; | |
| 1892 else if(*p - 2*QP > f) *p= *p - 2*QP; | |
| 1893 else *p=f; | |
| 1894 } | |
| 1895 } | |
| 1896 } | |
| 1897 #ifdef DEBUG_DERING_THRESHOLD | |
| 1898 if(max-min < 20) | |
| 1899 { | |
| 1900 for(y=1; y<9; y++) | |
| 1901 { | |
| 1902 int x; | |
| 1903 int t = 0; | |
| 1904 p= src + stride*y; | |
| 1905 for(x=1; x<9; x++) | |
| 1906 { | |
| 1907 p++; | |
| 1908 *p = MIN(*p + 20, 255); | |
| 1909 } | |
| 1910 } | |
| 1911 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1912 } | |
| 1913 #endif | |
| 1914 #endif | |
| 1915 } | |
| 1916 | |
/**
 * Deinterlaces the given block by linear interpolation:
 * each odd line (relative to the +4 offset) is replaced by the average of
 * the two even lines directly above and below it.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * NOTE(review): the MMX2/3DNow path uses PAVGB (rounding average) while the
 * C fallback truncates with >>1, so results may differ by 1 — presumably
 * intentional for speed; confirm if bit-exactness matters.
 */
static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" // line 0
		"movq (%%eax, %1), %%mm1			\n\t" // line 2
		PAVGB(%%mm1, %%mm0)				// avg(line0, line2)
		"movq %%mm0, (%%eax)				\n\t" // -> line 1
		"movq (%0, %1, 4), %%mm0			\n\t" // line 4
		PAVGB(%%mm0, %%mm1)				// avg(line2, line4)
		"movq %%mm1, (%%eax, %1, 2)			\n\t" // -> line 3
		"movq (%%ebx, %1), %%mm1			\n\t" // line 6
		PAVGB(%%mm1, %%mm0)				// avg(line4, line6)
		"movq %%mm0, (%%ebx)				\n\t" // -> line 5
		"movq (%0, %1, 8), %%mm0			\n\t" // line 8
		PAVGB(%%mm0, %%mm1)				// avg(line6, line8)
		"movq %%mm1, (%%ebx, %1, 2)			\n\t" // -> line 7

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		src[stride  ] = (src[0       ] + src[stride*2])>>1;
		src[stride*3] = (src[stride*2] + src[stride*4])>>1;
		src[stride*5] = (src[stride*4] + src[stride*6])>>1;
		src[stride*7] = (src[stride*6] + src[stride*8])>>1;
		src++;
	}
#endif
}
| 1963 | |
/**
 * Deinterlaces the given block by cubic interpolation of the odd lines:
 *   dst = clip( (9*(b+c) - (a+d)) / 16 )   for neighbour lines a,b,c,d
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 3-15 and write 7-13
 * Both the MMX path (packuswb saturation) and the C path clip to 0..255.
 */
static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= stride*3;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
		"leal (%%ebx, %1, 4), %%ecx			\n\t"
		"addl %1, %%ecx					\n\t"
		"pxor %%mm7, %%mm7				\n\t"
//	0	1	2	3	4	5	6	7	8	9	10
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1	ecx

#define DEINT_CUBIC(a,b,c,d,e)\
		"movq " #a ", %%mm0				\n\t"\
		"movq " #b ", %%mm1				\n\t"\
		"movq " #d ", %%mm2				\n\t"\
		"movq " #e ", %%mm3				\n\t"\
		PAVGB(%%mm2, %%mm1)			/* (b+d) /2 */\
		PAVGB(%%mm3, %%mm0)			/* (a+e) /2 */\
		"movq %%mm0, %%mm2				\n\t"\
		"punpcklbw %%mm7, %%mm0				\n\t"\
		"punpckhbw %%mm7, %%mm2				\n\t"\
		"movq %%mm1, %%mm3				\n\t"\
		"punpcklbw %%mm7, %%mm1				\n\t"\
		"punpckhbw %%mm7, %%mm3				\n\t"\
		"psubw %%mm1, %%mm0				\n\t" /* L(a+e - (b+d))/2 */\
		"psubw %%mm3, %%mm2				\n\t" /* H(a+e - (b+d))/2 */\
		"psraw $3, %%mm0				\n\t" /* L(a+e - (b+d))/16 */\
		"psraw $3, %%mm2				\n\t" /* H(a+e - (b+d))/16 */\
		"psubw %%mm0, %%mm1				\n\t" /* L(9b + 9d - a - e)/16 */\
		"psubw %%mm2, %%mm3				\n\t" /* H(9b + 9d - a - e)/16 */\
		"packuswb %%mm3, %%mm1				\n\t" /* saturates = clip to 0..255 */\
		"movq %%mm1, " #c "				\n\t"

DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1))
DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8))
DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx))
DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx", "%ecx" // was "ecx"; normalized to match the other clobbers
	);
#else
	int x;
	src+= stride*3;
	for(x=0; x<8; x++)
	{
		int t;
		/* clip like the packuswb in the MMX version does; without it a
		   negative result wraps through the uint8_t store (-1 -> 255) */
		t= (-src[0        ] + 9*src[stride*2] + 9*src[stride*4 ] - src[stride*6 ])>>4;
		src[stride*3]= t<0 ? 0 : (t>255 ? 255 : t);
		t= (-src[stride*2 ] + 9*src[stride*4] + 9*src[stride*6 ] - src[stride*8 ])>>4;
		src[stride*5]= t<0 ? 0 : (t>255 ? 255 : t);
		t= (-src[stride*4 ] + 9*src[stride*6] + 9*src[stride*8 ] - src[stride*10])>>4;
		src[stride*7]= t<0 ? 0 : (t>255 ? 255 : t);
		t= (-src[stride*6 ] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4;
		src[stride*9]= t<0 ? 0 : (t>255 ? 255 : t);
		src++;
	}
#endif
}
| 2028 | |
/**
 * Deinterlaces the given block by blending: each line becomes
 * (1*above + 2*self + 1*below)/4, built from cascaded averages.
 * will be called for every 8x8 block and can read & write from line 4-15
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * will shift the image up by 1 line (FIXME if this is a problem)
 * this filter will read lines 4-13 and write 4-11
 */
static inline void deInterlaceBlendLinear(uint8_t src[], int stride)
{
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
	src+= 4*stride;
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" // L0
		"movq (%%eax, %1), %%mm1			\n\t" // L2
		PAVGB(%%mm1, %%mm0)				// L0+L2
		"movq (%%eax), %%mm2				\n\t" // L1
		PAVGB(%%mm2, %%mm0)				// 2L1 + L0 + L2
		"movq %%mm0, (%0)				\n\t"
		"movq (%%eax, %1, 2), %%mm0			\n\t" // L3
		PAVGB(%%mm0, %%mm2)				// L1+L3
		PAVGB(%%mm1, %%mm2)				// 2L2 + L1 + L3
		"movq %%mm2, (%%eax)				\n\t"
		"movq (%0, %1, 4), %%mm2			\n\t" // L4
		PAVGB(%%mm2, %%mm1)				// L2+L4
		PAVGB(%%mm0, %%mm1)				// 2L3 + L2 + L4
		"movq %%mm1, (%%eax, %1)			\n\t"
		"movq (%%ebx), %%mm1				\n\t" // L5
		PAVGB(%%mm1, %%mm0)				// L3+L5
		PAVGB(%%mm2, %%mm0)				// 2L4 + L3 + L5
		"movq %%mm0, (%%eax, %1, 2)			\n\t"
		"movq (%%ebx, %1), %%mm0			\n\t" // L6
		PAVGB(%%mm0, %%mm2)				// L4+L6
		PAVGB(%%mm1, %%mm2)				// 2L5 + L4 + L6
		"movq %%mm2, (%0, %1, 4)			\n\t"
		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
		PAVGB(%%mm2, %%mm1)				// L5+L7
		PAVGB(%%mm0, %%mm1)				// 2L6 + L5 + L7
		"movq %%mm1, (%%ebx)				\n\t"
		"movq (%0, %1, 8), %%mm1			\n\t" // L8
		PAVGB(%%mm1, %%mm0)				// L6+L8
		PAVGB(%%mm2, %%mm0)				// 2L7 + L6 + L8
		"movq %%mm0, (%%ebx, %1)			\n\t"
		"movq (%%ebx, %1, 4), %%mm0			\n\t" // L9
		PAVGB(%%mm0, %%mm2)				// L7+L9
		PAVGB(%%mm1, %%mm2)				// 2L8 + L7 + L9
		"movq %%mm2, (%%ebx, %1, 2)			\n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#else
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		src[0       ] = (src[0       ] + 2*src[stride  ] + src[stride*2])>>2;
		src[stride  ] = (src[stride  ] + 2*src[stride*2] + src[stride*3])>>2;
		src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2;
		src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2;
		src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2;
		src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2;
		src[stride*6] = (src[stride*6] + 2*src[stride*7] + src[stride*8])>>2;
		src[stride*7] = (src[stride*7] + 2*src[stride*8] + src[stride*9])>>2;
		src++;
	}
#endif
}
| 2103 | |
/**
 * Deinterlaces the given block with a 3-tap vertical median filter:
 * lines 5,7,9,11 (absolute) are each replaced by the median of themselves
 * and the lines directly above and below.
 * will be called for every 8x8 block and can read & write from line 4-15,
 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too
 * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 */
#ifndef HAVE_MMX
/* median of 3 bytes, used by the C fallback below */
static inline uint8_t deInterlaceMedian3(uint8_t a, uint8_t b, uint8_t c)
{
	if(a > b){ uint8_t t= a; a= b; b= t; }	/* now a <= b */
	if(b > c) b= c;				/* b= min(max(a0,b0), c) */
	return a > b ? a : b;
}
#endif
static inline void deInterlaceMedian(uint8_t src[], int stride)
{
#ifdef HAVE_MMX
	src+= 4*stride;
#ifdef HAVE_MMX2
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1

		"movq (%0), %%mm0				\n\t" // L0
		"movq (%%eax, %1), %%mm2			\n\t" // L2
		"movq (%%eax), %%mm1				\n\t" // L1
		"movq %%mm0, %%mm3				\n\t"
		"pmaxub %%mm1, %%mm0				\n\t" // max(L0,L1)
		"pminub %%mm3, %%mm1				\n\t" // min(L0,L1)
		"pmaxub %%mm2, %%mm1				\n\t" // max(L2, min(L0,L1))
		"pminub %%mm1, %%mm0				\n\t" // median -> L1
		"movq %%mm0, (%%eax)				\n\t"

		"movq (%0, %1, 4), %%mm0			\n\t" // L4
		"movq (%%eax, %1, 2), %%mm1			\n\t" // L3
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm1, %%mm2				\n\t"
		"pminub %%mm3, %%mm1				\n\t"
		"pmaxub %%mm0, %%mm1				\n\t"
		"pminub %%mm1, %%mm2				\n\t" // median -> L3
		"movq %%mm2, (%%eax, %1, 2)			\n\t"

		"movq (%%ebx), %%mm2				\n\t" // L5
		"movq (%%ebx, %1), %%mm1			\n\t" // L6
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm0, %%mm2				\n\t"
		"pminub %%mm3, %%mm0				\n\t"
		"pmaxub %%mm1, %%mm0				\n\t"
		"pminub %%mm0, %%mm2				\n\t" // median -> L5
		"movq %%mm2, (%%ebx)				\n\t"

		"movq (%%ebx, %1, 2), %%mm2			\n\t" // L7
		"movq (%0, %1, 8), %%mm0			\n\t" // L8
		"movq %%mm2, %%mm3				\n\t"
		"pmaxub %%mm0, %%mm2				\n\t"
		"pminub %%mm3, %%mm0				\n\t"
		"pmaxub %%mm1, %%mm0				\n\t"
		"pminub %%mm0, %%mm2				\n\t" // median -> L7
		"movq %%mm2, (%%ebx, %1, 2)			\n\t"


		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);

#else // MMX without MMX2
	asm volatile(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"pxor %%mm7, %%mm7				\n\t"

#define MEDIAN(a,b,c)\
		"movq " #a ", %%mm0				\n\t"\
		"movq " #b ", %%mm2				\n\t"\
		"movq " #c ", %%mm1				\n\t"\
		"movq %%mm0, %%mm3				\n\t"\
		"movq %%mm1, %%mm4				\n\t"\
		"movq %%mm2, %%mm5				\n\t"\
		"psubusb %%mm1, %%mm3				\n\t"\
		"psubusb %%mm2, %%mm4				\n\t"\
		"psubusb %%mm0, %%mm5				\n\t"\
		"pcmpeqb %%mm7, %%mm3				\n\t" /* a <= c ? */\
		"pcmpeqb %%mm7, %%mm4				\n\t" /* c <= b ? */\
		"pcmpeqb %%mm7, %%mm5				\n\t" /* b <= a ? */\
		"movq %%mm3, %%mm6				\n\t"\
		"pxor %%mm4, %%mm3				\n\t"\
		"pxor %%mm5, %%mm4				\n\t"\
		"pxor %%mm6, %%mm5				\n\t"\
		"por %%mm3, %%mm1				\n\t"\
		"por %%mm4, %%mm2				\n\t"\
		"por %%mm5, %%mm0				\n\t"\
		"pand %%mm2, %%mm0				\n\t"\
		"pand %%mm1, %%mm0				\n\t" /* median(a,b,c) */\
		"movq %%mm0, " #b "				\n\t"

MEDIAN((%0), (%%eax), (%%eax, %1))
MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4))
MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1))
MEDIAN((%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8))

		: : "r" (src), "r" (stride)
		: "%eax", "%ebx"
	);
#endif // MMX
#else
	/* C fallback: true 3-tap vertical median, matching the lines the MMX
	   paths write (1,3,5,7 relative to src+4*stride). The previous code
	   here was a blend (marked FIXME), not a median. */
	int x;
	src+= 4*stride;
	for(x=0; x<8; x++)
	{
		src[stride  ]= deInterlaceMedian3(src[0       ], src[stride  ], src[stride*2]);
		src[stride*3]= deInterlaceMedian3(src[stride*2], src[stride*3], src[stride*4]);
		src[stride*5]= deInterlaceMedian3(src[stride*4], src[stride*5], src[stride*6]);
		src[stride*7]= deInterlaceMedian3(src[stride*6], src[stride*7], src[stride*8]);
		src++;
	}
#endif
}
| 2222 | |
| 2223 #ifdef HAVE_MMX | |
/**
 * Transposes the given 8x8 block and stores the result into two
 * destination buffers that use a 16-byte line stride:
 *  - transposed rows 0-4 go to dst1+128, +144, +160, +176, +192
 *  - transposed rows 3-7 go to dst2+48, +64, +80, +96, +112
 * (rows 3-4 land in both buffers; the second 4 source lines provide
 * bytes 4-7 of each transposed row via the +4 movd stores.)
 * NOTE(review): no "memory" clobber / output operand is declared even
 * though dst1/dst2 are written through — relies on the caller not caching
 * those buffers across the call; confirm.
 */
static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%0), %%mm0				\n\t" // 12345678
		"movq (%%eax), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq (%%eax, %1), %%mm1			\n\t"
		"movq (%%eax, %1, 2), %%mm3			\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// scatter the 4-byte column fragments of source lines 0-3
		"movd %%mm0, 128(%2)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 144(%2)				\n\t"
		"movd %%mm3, 160(%2)				\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 176(%2)				\n\t"
		"movd %%mm3, 48(%3)				\n\t"
		"movd %%mm2, 192(%2)				\n\t"
		"movd %%mm2, 64(%3)				\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 80(%3)				\n\t"
		"movd %%mm1, 96(%3)				\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 112(%3)				\n\t"

		"movq (%0, %1, 4), %%mm0			\n\t" // 12345678
		"movq (%%ebx), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq (%%ebx, %1), %%mm1			\n\t"
		"movq (%%ebx, %1, 2), %%mm3			\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// same scatter for source lines 4-7, offset by +4 bytes
		"movd %%mm0, 132(%2)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 148(%2)				\n\t"
		"movd %%mm3, 164(%2)				\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 180(%2)				\n\t"
		"movd %%mm3, 52(%3)				\n\t"
		"movd %%mm2, 196(%2)				\n\t"
		"movd %%mm2, 68(%3)				\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 84(%3)				\n\t"
		"movd %%mm1, 100(%3)				\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 116(%3)				\n\t"


		:: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
		: "%eax", "%ebx"
	);
}
| 2307 | |
/**
 * Transposes the given 8x8 block from a 16-byte-stride source buffer
 * (src+0, +16, ..., +112) into dst with the given dstStride.
 * Inverse layout of transpose1's dst1 scatter.
 * NOTE(review): no "memory" clobber / output operand is declared even
 * though dst is written through — confirm callers do not cache dst.
 */
static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
{
	asm(
		"leal (%0, %1), %%eax				\n\t"
		"leal (%%eax, %1, 4), %%ebx			\n\t"
//	0	1	2	3	4	5	6	7	8	9
//	%0	eax	eax+%1	eax+2%1	%0+4%1	ebx	ebx+%1	ebx+2%1	%0+8%1	ebx+4%1
		"movq (%2), %%mm0				\n\t" // 12345678
		"movq 16(%2), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq 32(%2), %%mm1				\n\t"
		"movq 48(%2), %%mm3				\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// first 4 bytes of each destination line
		"movd %%mm0, (%0)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, (%%eax)				\n\t"
		"movd %%mm3, (%%eax, %1)			\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, (%%eax, %1, 2)			\n\t"
		"movd %%mm2, (%0, %1, 4)			\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, (%%ebx)				\n\t"
		"movd %%mm1, (%%ebx, %1)			\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, (%%ebx, %1, 2)			\n\t"


		"movq 64(%2), %%mm0				\n\t" // 12345678
		"movq 80(%2), %%mm1				\n\t" // abcdefgh
		"movq %%mm0, %%mm2				\n\t" // 12345678
		"punpcklbw %%mm1, %%mm0				\n\t" // 1a2b3c4d
		"punpckhbw %%mm1, %%mm2				\n\t" // 5e6f7g8h

		"movq 96(%2), %%mm1				\n\t"
		"movq 112(%2), %%mm3				\n\t"
		"movq %%mm1, %%mm4				\n\t"
		"punpcklbw %%mm3, %%mm1				\n\t"
		"punpckhbw %%mm3, %%mm4				\n\t"

		"movq %%mm0, %%mm3				\n\t"
		"punpcklwd %%mm1, %%mm0				\n\t"
		"punpckhwd %%mm1, %%mm3				\n\t"
		"movq %%mm2, %%mm1				\n\t"
		"punpcklwd %%mm4, %%mm2				\n\t"
		"punpckhwd %%mm4, %%mm1				\n\t"

		// last 4 bytes of each destination line
		"movd %%mm0, 4(%0)				\n\t"
		"psrlq $32, %%mm0				\n\t"
		"movd %%mm0, 4(%%eax)				\n\t"
		"movd %%mm3, 4(%%eax, %1)			\n\t"
		"psrlq $32, %%mm3				\n\t"
		"movd %%mm3, 4(%%eax, %1, 2)			\n\t"
		"movd %%mm2, 4(%0, %1, 4)			\n\t"
		"psrlq $32, %%mm2				\n\t"
		"movd %%mm2, 4(%%ebx)				\n\t"
		"movd %%mm1, 4(%%ebx, %1)			\n\t"
		"psrlq $32, %%mm1				\n\t"
		"movd %%mm1, 4(%%ebx, %1, 2)			\n\t"

		:: "r" (dst), "r" (dstStride), "r" (src)
		: "%eax", "%ebx"
	);
}
| 2387 #endif | |
| 2388 //static int test=0; | |
| 2389 | |
| 2390 static void inline tempNoiseReducer(uint8_t *src, int stride, | |
| 2391 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | |
| 2392 { | |
| 2393 #define FAST_L2_DIFF | |
| 2394 //#define L1_DIFF //u should change the thresholds too if u try that one | |
| 2395 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 2396 asm volatile( | |
| 2397 "leal (%2, %2, 2), %%eax \n\t" // 3*stride | |
| 2398 "leal (%2, %2, 4), %%ebx \n\t" // 5*stride | |
| 2399 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | |
| 2400 // 0 1 2 3 4 5 6 7 8 9 | |
| 2401 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+ebx %x+2eax %x+ecx %x+8%2 | |
| 2402 //FIXME reorder? | |
| 2403 #ifdef L1_DIFF //needs mmx2 | |
| 2404 "movq (%0), %%mm0 \n\t" // L0 | |
| 2405 "psadbw (%1), %%mm0 \n\t" // |L0-R0| | |
| 2406 "movq (%0, %2), %%mm1 \n\t" // L1 | |
| 2407 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| | |
| 2408 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
| 2409 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| | |
| 2410 "movq (%0, %%eax), %%mm3 \n\t" // L3 | |
| 2411 "psadbw (%1, %%eax), %%mm3 \n\t" // |L3-R3| | |
| 2412 | |
| 2413 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
| 2414 "paddw %%mm1, %%mm0 \n\t" | |
| 2415 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| | |
| 2416 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | |
| 2417 "paddw %%mm2, %%mm0 \n\t" | |
| 2418 "psadbw (%1, %%ebx), %%mm5 \n\t" // |L5-R5| | |
| 2419 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | |
| 2420 "paddw %%mm3, %%mm0 \n\t" | |
| 2421 "psadbw (%1, %%eax, 2), %%mm6 \n\t" // |L6-R6| | |
| 2422 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | |
| 2423 "paddw %%mm4, %%mm0 \n\t" | |
| 2424 "psadbw (%1, %%ecx), %%mm7 \n\t" // |L7-R7| | |
| 2425 "paddw %%mm5, %%mm6 \n\t" | |
| 2426 "paddw %%mm7, %%mm6 \n\t" | |
| 2427 "paddw %%mm6, %%mm0 \n\t" | |
| 2428 #elif defined (FAST_L2_DIFF) | |
| 2429 "pcmpeqb %%mm7, %%mm7 \n\t" | |
| 2430 "movq b80, %%mm6 \n\t" | |
| 2431 "pxor %%mm0, %%mm0 \n\t" | |
| 2432 #define L2_DIFF_CORE(a, b)\ | |
| 2433 "movq " #a ", %%mm5 \n\t"\ | |
| 2434 "movq " #b ", %%mm2 \n\t"\ | |
| 2435 "pxor %%mm7, %%mm2 \n\t"\ | |
| 2436 PAVGB(%%mm2, %%mm5)\ | |
| 2437 "paddb %%mm6, %%mm5 \n\t"\ | |
| 2438 "movq %%mm5, %%mm2 \n\t"\ | |
| 2439 "psllw $8, %%mm5 \n\t"\ | |
| 2440 "pmaddwd %%mm5, %%mm5 \n\t"\ | |
| 2441 "pmaddwd %%mm2, %%mm2 \n\t"\ | |
| 2442 "paddd %%mm2, %%mm5 \n\t"\ | |
| 2443 "psrld $14, %%mm5 \n\t"\ | |
| 2444 "paddd %%mm5, %%mm0 \n\t" | |
| 2445 | |
| 2446 L2_DIFF_CORE((%0), (%1)) | |
| 2447 L2_DIFF_CORE((%0, %2), (%1, %2)) | |
| 2448 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) | |
| 2449 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) | |
| 2450 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) | |
| 2451 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) | |
| 2452 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) | |
| 2453 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) | |
| 2454 | |
| 2455 #else | |
| 2456 "pxor %%mm7, %%mm7 \n\t" | |
| 2457 "pxor %%mm0, %%mm0 \n\t" | |
| 2458 #define L2_DIFF_CORE(a, b)\ | |
| 2459 "movq " #a ", %%mm5 \n\t"\ | |
| 2460 "movq " #b ", %%mm2 \n\t"\ | |
| 2461 "movq %%mm5, %%mm1 \n\t"\ | |
| 2462 "movq %%mm2, %%mm3 \n\t"\ | |
| 2463 "punpcklbw %%mm7, %%mm5 \n\t"\ | |
| 2464 "punpckhbw %%mm7, %%mm1 \n\t"\ | |
| 2465 "punpcklbw %%mm7, %%mm2 \n\t"\ | |
| 2466 "punpckhbw %%mm7, %%mm3 \n\t"\ | |
| 2467 "psubw %%mm2, %%mm5 \n\t"\ | |
| 2468 "psubw %%mm3, %%mm1 \n\t"\ | |
| 2469 "pmaddwd %%mm5, %%mm5 \n\t"\ | |
| 2470 "pmaddwd %%mm1, %%mm1 \n\t"\ | |
| 2471 "paddd %%mm1, %%mm5 \n\t"\ | |
| 2472 "paddd %%mm5, %%mm0 \n\t" | |
| 2473 | |
| 2474 L2_DIFF_CORE((%0), (%1)) | |
| 2475 L2_DIFF_CORE((%0, %2), (%1, %2)) | |
| 2476 L2_DIFF_CORE((%0, %2, 2), (%1, %2, 2)) | |
| 2477 L2_DIFF_CORE((%0, %%eax), (%1, %%eax)) | |
| 2478 L2_DIFF_CORE((%0, %2, 4), (%1, %2, 4)) | |
| 2479 L2_DIFF_CORE((%0, %%ebx), (%1, %%ebx)) | |
| 2480 L2_DIFF_CORE((%0, %%eax,2), (%1, %%eax,2)) | |
| 2481 L2_DIFF_CORE((%0, %%ecx), (%1, %%ecx)) | |
| 2482 | |
| 2483 #endif | |
| 2484 | |
| 2485 "movq %%mm0, %%mm4 \n\t" | |
| 2486 "psrlq $32, %%mm0 \n\t" | |
| 2487 "paddd %%mm0, %%mm4 \n\t" | |
| 2488 "movd %%mm4, %%ecx \n\t" | |
| 2489 "shll $2, %%ecx \n\t" | |
| 2490 "movl %3, %%ebx \n\t" | |
| 2491 "addl -4(%%ebx), %%ecx \n\t" | |
| 2492 "addl 4(%%ebx), %%ecx \n\t" | |
| 2493 "addl -1024(%%ebx), %%ecx \n\t" | |
| 2494 "addl $4, %%ecx \n\t" | |
| 2495 "addl 1024(%%ebx), %%ecx \n\t" | |
| 2496 "shrl $3, %%ecx \n\t" | |
| 2497 "movl %%ecx, (%%ebx) \n\t" | |
| 2498 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride | |
| 2499 | |
| 2500 // "movl %3, %%ecx \n\t" | |
| 2501 // "movl %%ecx, test \n\t" | |
| 2502 // "jmp 4f \n\t" | |
| 2503 "cmpl 4+maxTmpNoise, %%ecx \n\t" | |
| 2504 " jb 2f \n\t" | |
| 2505 "cmpl 8+maxTmpNoise, %%ecx \n\t" | |
| 2506 " jb 1f \n\t" | |
| 2507 | |
| 2508 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | |
| 2509 "movq (%0), %%mm0 \n\t" // L0 | |
| 2510 "movq (%0, %2), %%mm1 \n\t" // L1 | |
| 2511 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
| 2512 "movq (%0, %%eax), %%mm3 \n\t" // L3 | |
| 2513 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
| 2514 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | |
| 2515 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | |
| 2516 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | |
| 2517 "movq %%mm0, (%1) \n\t" // L0 | |
| 2518 "movq %%mm1, (%1, %2) \n\t" // L1 | |
| 2519 "movq %%mm2, (%1, %2, 2) \n\t" // L2 | |
| 2520 "movq %%mm3, (%1, %%eax) \n\t" // L3 | |
| 2521 "movq %%mm4, (%1, %2, 4) \n\t" // L4 | |
| 2522 "movq %%mm5, (%1, %%ebx) \n\t" // L5 | |
| 2523 "movq %%mm6, (%1, %%eax, 2) \n\t" // L6 | |
| 2524 "movq %%mm7, (%1, %%ecx) \n\t" // L7 | |
| 2525 "jmp 4f \n\t" | |
| 2526 | |
| 2527 "1: \n\t" | |
| 2528 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | |
| 2529 "movq (%0), %%mm0 \n\t" // L0 | |
| 2530 "pavgb (%1), %%mm0 \n\t" // L0 | |
| 2531 "movq (%0, %2), %%mm1 \n\t" // L1 | |
| 2532 "pavgb (%1, %2), %%mm1 \n\t" // L1 | |
| 2533 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
| 2534 "pavgb (%1, %2, 2), %%mm2 \n\t" // L2 | |
| 2535 "movq (%0, %%eax), %%mm3 \n\t" // L3 | |
| 2536 "pavgb (%1, %%eax), %%mm3 \n\t" // L3 | |
| 2537 "movq (%0, %2, 4), %%mm4 \n\t" // L4 | |
| 2538 "pavgb (%1, %2, 4), %%mm4 \n\t" // L4 | |
| 2539 "movq (%0, %%ebx), %%mm5 \n\t" // L5 | |
| 2540 "pavgb (%1, %%ebx), %%mm5 \n\t" // L5 | |
| 2541 "movq (%0, %%eax, 2), %%mm6 \n\t" // L6 | |
| 2542 "pavgb (%1, %%eax, 2), %%mm6 \n\t" // L6 | |
| 2543 "movq (%0, %%ecx), %%mm7 \n\t" // L7 | |
| 2544 "pavgb (%1, %%ecx), %%mm7 \n\t" // L7 | |
| 2545 "movq %%mm0, (%1) \n\t" // R0 | |
| 2546 "movq %%mm1, (%1, %2) \n\t" // R1 | |
| 2547 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
| 2548 "movq %%mm3, (%1, %%eax) \n\t" // R3 | |
| 2549 "movq %%mm4, (%1, %2, 4) \n\t" // R4 | |
| 2550 "movq %%mm5, (%1, %%ebx) \n\t" // R5 | |
| 2551 "movq %%mm6, (%1, %%eax, 2) \n\t" // R6 | |
| 2552 "movq %%mm7, (%1, %%ecx) \n\t" // R7 | |
| 2553 "movq %%mm0, (%0) \n\t" // L0 | |
| 2554 "movq %%mm1, (%0, %2) \n\t" // L1 | |
| 2555 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
| 2556 "movq %%mm3, (%0, %%eax) \n\t" // L3 | |
| 2557 "movq %%mm4, (%0, %2, 4) \n\t" // L4 | |
| 2558 "movq %%mm5, (%0, %%ebx) \n\t" // L5 | |
| 2559 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 | |
| 2560 "movq %%mm7, (%0, %%ecx) \n\t" // L7 | |
| 2561 "jmp 4f \n\t" | |
| 2562 | |
| 2563 "2: \n\t" | |
| 2564 "cmpl maxTmpNoise, %%ecx \n\t" | |
| 2565 " jb 3f \n\t" | |
| 2566 | |
| 2567 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | |
| 2568 "movq (%0), %%mm0 \n\t" // L0 | |
| 2569 "movq (%0, %2), %%mm1 \n\t" // L1 | |
| 2570 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
| 2571 "movq (%0, %%eax), %%mm3 \n\t" // L3 | |
| 2572 "movq (%1), %%mm4 \n\t" // R0 | |
| 2573 "movq (%1, %2), %%mm5 \n\t" // R1 | |
| 2574 "movq (%1, %2, 2), %%mm6 \n\t" // R2 | |
| 2575 "movq (%1, %%eax), %%mm7 \n\t" // R3 | |
| 2576 PAVGB(%%mm4, %%mm0) | |
| 2577 PAVGB(%%mm5, %%mm1) | |
| 2578 PAVGB(%%mm6, %%mm2) | |
| 2579 PAVGB(%%mm7, %%mm3) | |
| 2580 PAVGB(%%mm4, %%mm0) | |
| 2581 PAVGB(%%mm5, %%mm1) | |
| 2582 PAVGB(%%mm6, %%mm2) | |
| 2583 PAVGB(%%mm7, %%mm3) | |
| 2584 "movq %%mm0, (%1) \n\t" // R0 | |
| 2585 "movq %%mm1, (%1, %2) \n\t" // R1 | |
| 2586 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
| 2587 "movq %%mm3, (%1, %%eax) \n\t" // R3 | |
| 2588 "movq %%mm0, (%0) \n\t" // L0 | |
| 2589 "movq %%mm1, (%0, %2) \n\t" // L1 | |
| 2590 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
| 2591 "movq %%mm3, (%0, %%eax) \n\t" // L3 | |
| 2592 | |
| 2593 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | |
| 2594 "movq (%0, %%ebx), %%mm1 \n\t" // L5 | |
| 2595 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 | |
| 2596 "movq (%0, %%ecx), %%mm3 \n\t" // L7 | |
| 2597 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | |
| 2598 "movq (%1, %%ebx), %%mm5 \n\t" // R5 | |
| 2599 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 | |
| 2600 "movq (%1, %%ecx), %%mm7 \n\t" // R7 | |
| 2601 PAVGB(%%mm4, %%mm0) | |
| 2602 PAVGB(%%mm5, %%mm1) | |
| 2603 PAVGB(%%mm6, %%mm2) | |
| 2604 PAVGB(%%mm7, %%mm3) | |
| 2605 PAVGB(%%mm4, %%mm0) | |
| 2606 PAVGB(%%mm5, %%mm1) | |
| 2607 PAVGB(%%mm6, %%mm2) | |
| 2608 PAVGB(%%mm7, %%mm3) | |
| 2609 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | |
| 2610 "movq %%mm1, (%1, %%ebx) \n\t" // R5 | |
| 2611 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 | |
| 2612 "movq %%mm3, (%1, %%ecx) \n\t" // R7 | |
| 2613 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | |
| 2614 "movq %%mm1, (%0, %%ebx) \n\t" // L5 | |
| 2615 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 | |
| 2616 "movq %%mm3, (%0, %%ecx) \n\t" // L7 | |
| 2617 "jmp 4f \n\t" | |
| 2618 | |
| 2619 "3: \n\t" | |
| 2620 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | |
| 2621 "movq (%0), %%mm0 \n\t" // L0 | |
| 2622 "movq (%0, %2), %%mm1 \n\t" // L1 | |
| 2623 "movq (%0, %2, 2), %%mm2 \n\t" // L2 | |
| 2624 "movq (%0, %%eax), %%mm3 \n\t" // L3 | |
| 2625 "movq (%1), %%mm4 \n\t" // R0 | |
| 2626 "movq (%1, %2), %%mm5 \n\t" // R1 | |
| 2627 "movq (%1, %2, 2), %%mm6 \n\t" // R2 | |
| 2628 "movq (%1, %%eax), %%mm7 \n\t" // R3 | |
| 2629 PAVGB(%%mm4, %%mm0) | |
| 2630 PAVGB(%%mm5, %%mm1) | |
| 2631 PAVGB(%%mm6, %%mm2) | |
| 2632 PAVGB(%%mm7, %%mm3) | |
| 2633 PAVGB(%%mm4, %%mm0) | |
| 2634 PAVGB(%%mm5, %%mm1) | |
| 2635 PAVGB(%%mm6, %%mm2) | |
| 2636 PAVGB(%%mm7, %%mm3) | |
| 2637 PAVGB(%%mm4, %%mm0) | |
| 2638 PAVGB(%%mm5, %%mm1) | |
| 2639 PAVGB(%%mm6, %%mm2) | |
| 2640 PAVGB(%%mm7, %%mm3) | |
| 2641 "movq %%mm0, (%1) \n\t" // R0 | |
| 2642 "movq %%mm1, (%1, %2) \n\t" // R1 | |
| 2643 "movq %%mm2, (%1, %2, 2) \n\t" // R2 | |
| 2644 "movq %%mm3, (%1, %%eax) \n\t" // R3 | |
| 2645 "movq %%mm0, (%0) \n\t" // L0 | |
| 2646 "movq %%mm1, (%0, %2) \n\t" // L1 | |
| 2647 "movq %%mm2, (%0, %2, 2) \n\t" // L2 | |
| 2648 "movq %%mm3, (%0, %%eax) \n\t" // L3 | |
| 2649 | |
| 2650 "movq (%0, %2, 4), %%mm0 \n\t" // L4 | |
| 2651 "movq (%0, %%ebx), %%mm1 \n\t" // L5 | |
| 2652 "movq (%0, %%eax, 2), %%mm2 \n\t" // L6 | |
| 2653 "movq (%0, %%ecx), %%mm3 \n\t" // L7 | |
| 2654 "movq (%1, %2, 4), %%mm4 \n\t" // R4 | |
| 2655 "movq (%1, %%ebx), %%mm5 \n\t" // R5 | |
| 2656 "movq (%1, %%eax, 2), %%mm6 \n\t" // R6 | |
| 2657 "movq (%1, %%ecx), %%mm7 \n\t" // R7 | |
| 2658 PAVGB(%%mm4, %%mm0) | |
| 2659 PAVGB(%%mm5, %%mm1) | |
| 2660 PAVGB(%%mm6, %%mm2) | |
| 2661 PAVGB(%%mm7, %%mm3) | |
| 2662 PAVGB(%%mm4, %%mm0) | |
| 2663 PAVGB(%%mm5, %%mm1) | |
| 2664 PAVGB(%%mm6, %%mm2) | |
| 2665 PAVGB(%%mm7, %%mm3) | |
| 2666 PAVGB(%%mm4, %%mm0) | |
| 2667 PAVGB(%%mm5, %%mm1) | |
| 2668 PAVGB(%%mm6, %%mm2) | |
| 2669 PAVGB(%%mm7, %%mm3) | |
| 2670 "movq %%mm0, (%1, %2, 4) \n\t" // R4 | |
| 2671 "movq %%mm1, (%1, %%ebx) \n\t" // R5 | |
| 2672 "movq %%mm2, (%1, %%eax, 2) \n\t" // R6 | |
| 2673 "movq %%mm3, (%1, %%ecx) \n\t" // R7 | |
| 2674 "movq %%mm0, (%0, %2, 4) \n\t" // L4 | |
| 2675 "movq %%mm1, (%0, %%ebx) \n\t" // L5 | |
| 2676 "movq %%mm2, (%0, %%eax, 2) \n\t" // L6 | |
| 2677 "movq %%mm3, (%0, %%ecx) \n\t" // L7 | |
| 2678 | |
| 2679 "4: \n\t" | |
| 2680 | |
| 2681 :: "r" (src), "r" (tempBlured), "r"(stride), "m" (tempBluredPast) | |
| 2682 : "%eax", "%ebx", "%ecx", "memory" | |
| 2683 ); | |
| 2684 //printf("%d\n", test); | |
| 2685 #else | |
| 2686 int y; | |
| 2687 int d=0; | |
| 2688 int sysd=0; | |
| 2689 int i; | |
| 2690 | |
| 2691 for(y=0; y<8; y++) | |
| 2692 { | |
| 2693 int x; | |
| 2694 for(x=0; x<8; x++) | |
| 2695 { | |
| 2696 int ref= tempBlured[ x + y*stride ]; | |
| 2697 int cur= src[ x + y*stride ]; | |
| 2698 int d1=ref - cur; | |
| 2699 // if(x==0 || x==7) d1+= d1>>1; | |
| 2700 // if(y==0 || y==7) d1+= d1>>1; | |
| 2701 // d+= ABS(d1); | |
| 2702 d+= d1*d1; | |
| 2703 sysd+= d1; | |
| 2704 } | |
| 2705 } | |
| 2706 i=d; | |
| 2707 d= ( | |
| 2708 4*d | |
| 2709 +(*(tempBluredPast-256)) | |
| 2710 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 2711 +(*(tempBluredPast+256)) | |
| 2712 +4)>>3; | |
| 2713 *tempBluredPast=i; | |
| 2714 // ((*tempBluredPast)*3 + d + 2)>>2; | |
| 2715 | |
| 2716 //printf("%d %d %d\n", maxNoise[0], maxNoise[1], maxNoise[2]); | |
| 2717 /* | |
| 2718 Switch between | |
| 2719 1 0 0 0 0 0 0 (0) | |
| 2720 64 32 16 8 4 2 1 (1) | |
| 2721 64 48 36 27 20 15 11 (33) (approx) | |
| 2722 64 56 49 43 37 33 29 (200) (approx) | |
| 2723 */ | |
| 2724 if(d > maxNoise[1]) | |
| 2725 { | |
| 2726 if(d < maxNoise[2]) | |
| 2727 { | |
| 2728 for(y=0; y<8; y++) | |
| 2729 { | |
| 2730 int x; | |
| 2731 for(x=0; x<8; x++) | |
| 2732 { | |
| 2733 int ref= tempBlured[ x + y*stride ]; | |
| 2734 int cur= src[ x + y*stride ]; | |
| 2735 tempBlured[ x + y*stride ]= | |
| 2736 src[ x + y*stride ]= | |
| 2737 (ref + cur + 1)>>1; | |
| 2738 } | |
| 2739 } | |
| 2740 } | |
| 2741 else | |
| 2742 { | |
| 2743 for(y=0; y<8; y++) | |
| 2744 { | |
| 2745 int x; | |
| 2746 for(x=0; x<8; x++) | |
| 2747 { | |
| 2748 tempBlured[ x + y*stride ]= src[ x + y*stride ]; | |
| 2749 } | |
| 2750 } | |
| 2751 } | |
| 2752 } | |
| 2753 else | |
| 2754 { | |
| 2755 if(d < maxNoise[0]) | |
| 2756 { | |
| 2757 for(y=0; y<8; y++) | |
| 2758 { | |
| 2759 int x; | |
| 2760 for(x=0; x<8; x++) | |
| 2761 { | |
| 2762 int ref= tempBlured[ x + y*stride ]; | |
| 2763 int cur= src[ x + y*stride ]; | |
| 2764 tempBlured[ x + y*stride ]= | |
| 2765 src[ x + y*stride ]= | |
| 2766 (ref*7 + cur + 4)>>3; | |
| 2767 } | |
| 2768 } | |
| 2769 } | |
| 2770 else | |
| 2771 { | |
| 2772 for(y=0; y<8; y++) | |
| 2773 { | |
| 2774 int x; | |
| 2775 for(x=0; x<8; x++) | |
| 2776 { | |
| 2777 int ref= tempBlured[ x + y*stride ]; | |
| 2778 int cur= src[ x + y*stride ]; | |
| 2779 tempBlured[ x + y*stride ]= | |
| 2780 src[ x + y*stride ]= | |
| 2781 (ref*3 + cur + 2)>>2; | |
| 2782 } | |
| 2783 } | |
| 2784 } | |
| 2785 } | |
| 2786 #endif | 468 #endif |
| 2787 } | 469 } |
| 2788 | 470 |
| 2789 #ifdef HAVE_ODIVX_POSTPROCESS | 471 #ifdef HAVE_ODIVX_POSTPROCESS |
| 2790 #include "../opendivx/postprocess.h" | 472 #include "../opendivx/postprocess.h" |
| 2791 int use_old_pp=0; | 473 int use_old_pp=0; |
| 2792 #endif | 474 #endif |
| 2793 | 475 |
| 2794 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 476 //static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 2795 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); | 477 // QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode); |
| 2796 | 478 |
| 2797 /* -pp Command line Help | 479 /* -pp Command line Help |
| 2798 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)? | 480 NOTE/FIXME: put this at an appropriate place (--help, html docs, man mplayer)? |
| 2799 | 481 |
| 2800 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]... | 482 -pp <filterName>[:<option>[:<option>...]][,[-]<filterName>[:<option>...]]... |
| 3160 if(use_old_pp) return odivx_modes[quality]; | 842 if(use_old_pp) return odivx_modes[quality]; |
| 3161 #endif | 843 #endif |
| 3162 return modes[quality]; | 844 return modes[quality]; |
| 3163 } | 845 } |
| 3164 | 846 |
| 3165 /** | 847 |
| 3166 * Copies a block from src to dst and fixes the blacklevel | |
| 3167 * numLines must be a multiple of 4 | |
| 3168 * levelFix == 0 -> dont touch the brighness & contrast | |
| 3169 */ | |
| 3170 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, | |
| 3171 int levelFix) | |
| 3172 { | |
| 3173 #ifndef HAVE_MMX | |
| 3174 int i; | |
| 3175 #endif | |
| 3176 if(levelFix) | |
| 3177 { | |
| 3178 #ifdef HAVE_MMX | |
| 3179 asm volatile( | |
| 3180 "leal (%0,%2), %%eax \n\t" | |
| 3181 "leal (%1,%3), %%ebx \n\t" | |
| 3182 "movq packedYOffset, %%mm2 \n\t" | |
| 3183 "movq packedYScale, %%mm3 \n\t" | |
| 3184 "pxor %%mm4, %%mm4 \n\t" | |
| 3185 | |
| 3186 #define SCALED_CPY(src1, src2, dst1, dst2) \ | |
| 3187 "movq " #src1 ", %%mm0 \n\t"\ | |
| 3188 "movq " #src1 ", %%mm5 \n\t"\ | |
| 3189 "punpcklbw %%mm4, %%mm0 \n\t"\ | |
| 3190 "punpckhbw %%mm4, %%mm5 \n\t"\ | |
| 3191 "psubw %%mm2, %%mm0 \n\t"\ | |
| 3192 "psubw %%mm2, %%mm5 \n\t"\ | |
| 3193 "movq " #src2 ", %%mm1 \n\t"\ | |
| 3194 "psllw $6, %%mm0 \n\t"\ | |
| 3195 "psllw $6, %%mm5 \n\t"\ | |
| 3196 "pmulhw %%mm3, %%mm0 \n\t"\ | |
| 3197 "movq " #src2 ", %%mm6 \n\t"\ | |
| 3198 "pmulhw %%mm3, %%mm5 \n\t"\ | |
| 3199 "punpcklbw %%mm4, %%mm1 \n\t"\ | |
| 3200 "punpckhbw %%mm4, %%mm6 \n\t"\ | |
| 3201 "psubw %%mm2, %%mm1 \n\t"\ | |
| 3202 "psubw %%mm2, %%mm6 \n\t"\ | |
| 3203 "psllw $6, %%mm1 \n\t"\ | |
| 3204 "psllw $6, %%mm6 \n\t"\ | |
| 3205 "pmulhw %%mm3, %%mm1 \n\t"\ | |
| 3206 "pmulhw %%mm3, %%mm6 \n\t"\ | |
| 3207 "packuswb %%mm5, %%mm0 \n\t"\ | |
| 3208 "packuswb %%mm6, %%mm1 \n\t"\ | |
| 3209 "movq %%mm0, " #dst1 " \n\t"\ | |
| 3210 "movq %%mm1, " #dst2 " \n\t"\ | |
| 3211 | |
| 3212 SCALED_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 3213 SCALED_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | |
| 3214 SCALED_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | |
| 3215 "leal (%%eax,%2,4), %%eax \n\t" | |
| 3216 "leal (%%ebx,%3,4), %%ebx \n\t" | |
| 3217 SCALED_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | |
| 3218 | |
| 3219 | |
| 3220 : : "r"(src), | |
| 3221 "r"(dst), | |
| 3222 "r" (srcStride), | |
| 3223 "r" (dstStride) | |
| 3224 : "%eax", "%ebx" | |
| 3225 ); | |
| 3226 #else | |
| 3227 for(i=0; i<8; i++) | |
| 3228 memcpy( &(dst[dstStride*i]), | |
| 3229 &(src[srcStride*i]), BLOCK_SIZE); | |
| 3230 #endif | |
| 3231 } | |
| 3232 else | |
| 3233 { | |
| 3234 #ifdef HAVE_MMX | |
| 3235 asm volatile( | |
| 3236 "leal (%0,%2), %%eax \n\t" | |
| 3237 "leal (%1,%3), %%ebx \n\t" | |
| 3238 | |
| 3239 #define SIMPLE_CPY(src1, src2, dst1, dst2) \ | |
| 3240 "movq " #src1 ", %%mm0 \n\t"\ | |
| 3241 "movq " #src2 ", %%mm1 \n\t"\ | |
| 3242 "movq %%mm0, " #dst1 " \n\t"\ | |
| 3243 "movq %%mm1, " #dst2 " \n\t"\ | |
| 3244 | |
| 3245 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) | |
| 3246 SIMPLE_CPY((%0, %2, 2), (%%eax, %2, 2), (%1, %3, 2), (%%ebx, %3, 2)) | |
| 3247 SIMPLE_CPY((%0, %2, 4), (%%eax, %2, 4), (%1, %3, 4), (%%ebx, %3, 4)) | |
| 3248 "leal (%%eax,%2,4), %%eax \n\t" | |
| 3249 "leal (%%ebx,%3,4), %%ebx \n\t" | |
| 3250 SIMPLE_CPY((%%eax, %2), (%%eax, %2, 2), (%%ebx, %3), (%%ebx, %3, 2)) | |
| 3251 | |
| 3252 : : "r" (src), | |
| 3253 "r" (dst), | |
| 3254 "r" (srcStride), | |
| 3255 "r" (dstStride) | |
| 3256 : "%eax", "%ebx" | |
| 3257 ); | |
| 3258 #else | |
| 3259 for(i=0; i<8; i++) | |
| 3260 memcpy( &(dst[dstStride*i]), | |
| 3261 &(src[srcStride*i]), BLOCK_SIZE); | |
| 3262 #endif | |
| 3263 } | |
| 3264 } | |
| 3265 | |
| 3266 | |
| 3267 /** | |
| 3268 * Filters array of bytes (Y or U or V values) | |
| 3269 */ | |
| 3270 static void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | |
| 3271 QP_STORE_T QPs[], int QPStride, int isColor, struct PPMode *ppMode) | |
| 3272 { | |
| 3273 int x,y; | |
| 3274 const int mode= isColor ? ppMode->chromMode : ppMode->lumMode; | |
| 3275 | |
| 3276 /* we need 64bit here otherwise we're going to have a problem | |
| 3277 after watching a black picture for 5 hours*/ | |
| 3278 static uint64_t *yHistogram= NULL; | |
| 3279 int black=0, white=255; // blackest black and whitest white in the picture | |
| 3280 int QPCorrecture= 256; | |
| 3281 | |
| 3282 /* Temporary buffers for handling the last row(s) */ | |
| 3283 static uint8_t *tempDst= NULL; | |
| 3284 static uint8_t *tempSrc= NULL; | |
| 3285 | |
| 3286 /* Temporary buffers for handling the last block */ | |
| 3287 static uint8_t *tempDstBlock= NULL; | |
| 3288 static uint8_t *tempSrcBlock= NULL; | |
| 3289 | |
| 3290 /* Temporal noise reducing buffers */ | |
| 3291 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; | |
| 3292 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; | |
| 3293 | |
| 3294 int copyAhead; | |
| 3295 | |
| 3296 #ifdef PP_FUNNY_STRIDE | |
| 3297 uint8_t *dstBlockPtrBackup; | |
| 3298 uint8_t *srcBlockPtrBackup; | |
| 3299 #endif | |
| 3300 | |
| 3301 #ifdef MORE_TIMING | |
| 3302 long long T0, T1, diffTime=0; | |
| 3303 #endif | |
| 3304 #ifdef TIMING | |
| 3305 long long memcpyTime=0, vertTime=0, horizTime=0, sumTime; | |
| 3306 sumTime= rdtsc(); | |
| 3307 #endif | |
| 3308 //mode= 0x7F; | |
| 3309 #ifdef HAVE_MMX | |
| 3310 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; | |
| 3311 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; | |
| 3312 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; | |
| 3313 #endif | |
| 3314 | |
| 3315 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; | |
| 3316 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; | |
| 3317 else if( (mode & V_DEBLOCK) | |
| 3318 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
| 3319 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
| 3320 else if(mode & V_X1_FILTER) copyAhead=11; | |
| 3321 else if(mode & V_RK1_FILTER) copyAhead=10; | |
| 3322 else if(mode & DERING) copyAhead=9; | |
| 3323 else copyAhead=8; | |
| 3324 | |
| 3325 copyAhead-= 8; | |
| 3326 | |
| 3327 if(tempDst==NULL) | |
| 3328 { | |
| 3329 tempDst= (uint8_t*)memalign(8, 1024*24); | |
| 3330 tempSrc= (uint8_t*)memalign(8, 1024*24); | |
| 3331 tempDstBlock= (uint8_t*)memalign(8, 1024*24); | |
| 3332 tempSrcBlock= (uint8_t*)memalign(8, 1024*24); | |
| 3333 } | |
| 3334 | |
| 3335 if(tempBlured[isColor]==NULL && (mode & TEMP_NOISE_FILTER)) | |
| 3336 { | |
| 3337 // printf("%d %d %d\n", isColor, dstStride, height); | |
| 3338 //FIXME works only as long as the size doesnt increase | |
| 3339 //Note:the +17*1024 is just there so I don't have to worry about r/w over the end | |
| 3340 tempBlured[isColor]= (uint8_t*)memalign(8, dstStride*((height+7)&(~7)) + 17*1024); | |
| 3341 tempBluredPast[isColor]= (uint32_t*)memalign(8, 256*((height+7)&(~7))/2 + 17*1024); | |
| 3342 | |
| 3343 memset(tempBlured[isColor], 0, dstStride*((height+7)&(~7)) + 17*1024); | |
| 3344 memset(tempBluredPast[isColor], 0, 256*((height+7)&(~7))/2 + 17*1024); | |
| 3345 } | |
| 3346 | |
| 3347 if(!yHistogram) | |
| 3348 { | |
| 3349 int i; | |
| 3350 yHistogram= (uint64_t*)malloc(8*256); | |
| 3351 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; | |
| 3352 | |
| 3353 if(mode & FULL_Y_RANGE) | |
| 3354 { | |
| 3355 maxAllowedY=255; | |
| 3356 minAllowedY=0; | |
| 3357 } | |
| 3358 } | |
| 3359 | |
| 3360 if(!isColor) | |
| 3361 { | |
| 3362 uint64_t sum= 0; | |
| 3363 int i; | |
| 3364 static int framenum= -1; | |
| 3365 uint64_t maxClipped; | |
| 3366 uint64_t clipped; | |
| 3367 double scale; | |
| 3368 | |
| 3369 framenum++; | |
| 3370 if(framenum == 1) yHistogram[0]= width*height/64*15/256; | |
| 3371 | |
| 3372 for(i=0; i<256; i++) | |
| 3373 { | |
| 3374 sum+= yHistogram[i]; | |
| 3375 // printf("%d ", yHistogram[i]); | |
| 3376 } | |
| 3377 // printf("\n\n"); | |
| 3378 | |
| 3379 /* we always get a completely black picture first */ | |
| 3380 maxClipped= (uint64_t)(sum * maxClippedThreshold); | |
| 3381 | |
| 3382 clipped= sum; | |
| 3383 for(black=255; black>0; black--) | |
| 3384 { | |
| 3385 if(clipped < maxClipped) break; | |
| 3386 clipped-= yHistogram[black]; | |
| 3387 } | |
| 3388 | |
| 3389 clipped= sum; | |
| 3390 for(white=0; white<256; white++) | |
| 3391 { | |
| 3392 if(clipped < maxClipped) break; | |
| 3393 clipped-= yHistogram[white]; | |
| 3394 } | |
| 3395 | |
| 3396 packedYOffset= (black - minAllowedY) & 0xFFFF; | |
| 3397 packedYOffset|= packedYOffset<<32; | |
| 3398 packedYOffset|= packedYOffset<<16; | |
| 3399 | |
| 3400 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); | |
| 3401 | |
| 3402 packedYScale= (uint16_t)(scale*1024.0 + 0.5); | |
| 3403 packedYScale|= packedYScale<<32; | |
| 3404 packedYScale|= packedYScale<<16; | |
| 3405 } | |
| 3406 else | |
| 3407 { | |
| 3408 packedYScale= 0x0100010001000100LL; | |
| 3409 packedYOffset= 0; | |
| 3410 } | |
| 3411 | |
| 3412 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; | |
| 3413 else QPCorrecture= 256; | |
| 3414 | |
| 3415 /* copy & deinterlace first row of blocks */ | |
| 3416 y=-BLOCK_SIZE; | |
| 3417 { | |
| 3418 //1% speedup if these are here instead of the inner loop | |
| 3419 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 3420 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 3421 | |
| 3422 dstBlock= tempDst + dstStride; | |
| 3423 | |
| 3424 // From this point on it is guaranteed that we can read and write 16 lines downward | |
| 3425 // finish 1 block before the next otherwise we might have a problem | |
| 3426 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 3427 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 3428 { | |
| 3429 | |
| 3430 #ifdef HAVE_MMX2 | |
| 3431 /* | |
| 3432 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3433 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3434 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3435 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3436 */ | |
| 3437 | |
| 3438 asm( | |
| 3439 "movl %4, %%eax \n\t" | |
| 3440 "shrl $2, %%eax \n\t" | |
| 3441 "andl $6, %%eax \n\t" | |
| 3442 "addl %5, %%eax \n\t" | |
| 3443 "movl %%eax, %%ebx \n\t" | |
| 3444 "imul %1, %%eax \n\t" | |
| 3445 "imul %3, %%ebx \n\t" | |
| 3446 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3447 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3448 "addl %1, %%eax \n\t" | |
| 3449 "addl %3, %%ebx \n\t" | |
| 3450 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3451 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3452 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
| 3453 "m" (x), "m" (copyAhead) | |
| 3454 : "%eax", "%ebx" | |
| 3455 ); | |
| 3456 | |
| 3457 #elif defined(HAVE_3DNOW) | |
| 3458 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 3459 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3460 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3461 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3462 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3463 */ | |
| 3464 #endif | |
| 3465 | |
| 3466 blockCopy(dstBlock + dstStride*copyAhead, dstStride, | |
| 3467 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); | |
| 3468 | |
| 3469 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 3470 deInterlaceInterpolateLinear(dstBlock, dstStride); | |
| 3471 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
| 3472 deInterlaceBlendLinear(dstBlock, dstStride); | |
| 3473 else if(mode & MEDIAN_DEINT_FILTER) | |
| 3474 deInterlaceMedian(dstBlock, dstStride); | |
| 3475 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
| 3476 deInterlaceInterpolateCubic(dstBlock, dstStride); | |
| 3477 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
| 3478 deInterlaceBlendCubic(dstBlock, dstStride); | |
| 3479 */ | |
| 3480 dstBlock+=8; | |
| 3481 srcBlock+=8; | |
| 3482 } | |
| 3483 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); | |
| 3484 } | |
| 3485 | |
| 3486 for(y=0; y<height; y+=BLOCK_SIZE) | |
| 3487 { | |
| 3488 //1% speedup if these are here instead of the inner loop | |
| 3489 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 3490 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 3491 #ifdef ARCH_X86 | |
| 3492 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | |
| 3493 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | |
| 3494 int QPFrac= QPDelta; | |
| 3495 uint8_t *tempBlock1= tempBlocks; | |
| 3496 uint8_t *tempBlock2= tempBlocks + 8; | |
| 3497 #endif | |
| 3498 int QP=0; | |
| 3499 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | |
| 3500 if not than use a temporary buffer */ | |
| 3501 if(y+15 >= height) | |
| 3502 { | |
| 3503 int i; | |
| 3504 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with | |
| 3505 blockcopy to dst later */ | |
| 3506 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, | |
| 3507 srcStride*MAX(height-y-copyAhead, 0) ); | |
| 3508 | |
| 3509 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ | |
| 3510 for(i=MAX(height-y, 8); i<copyAhead+8; i++) | |
| 3511 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); | |
| 3512 | |
| 3513 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ | |
| 3514 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); | |
| 3515 | |
| 3516 /* duplicate last line of dst to fill the void upto line (copyAhead) */ | |
| 3517 for(i=height-y+1; i<=copyAhead; i++) | |
| 3518 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); | |
| 3519 | |
| 3520 dstBlock= tempDst + dstStride; | |
| 3521 srcBlock= tempSrc; | |
| 3522 } | |
| 3523 | |
| 3524 // From this point on it is guaranteed that we can read and write 16 lines downward | |
| 3525 // finish 1 block before the next otherwise we might have a problem | |
| 3526 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 3527 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 3528 { | |
| 3529 const int stride= dstStride; | |
| 3530 uint8_t *tmpXchg; | |
| 3531 #ifdef ARCH_X86 | |
| 3532 QP= *QPptr; | |
| 3533 asm volatile( | |
| 3534 "addl %2, %1 \n\t" | |
| 3535 "sbbl %%eax, %%eax \n\t" | |
| 3536 "shll $2, %%eax \n\t" | |
| 3537 "subl %%eax, %0 \n\t" | |
| 3538 : "+r" (QPptr), "+m" (QPFrac) | |
| 3539 : "r" (QPDelta) | |
| 3540 : "%eax" | |
| 3541 ); | |
| 3542 #else | |
| 3543 QP= isColor ? | |
| 3544 QPs[(y>>3)*QPStride + (x>>3)]: | |
| 3545 QPs[(y>>4)*QPStride + (x>>4)]; | |
| 3546 #endif | |
| 3547 if(!isColor) | |
| 3548 { | |
| 3549 QP= (QP* QPCorrecture)>>8; | |
| 3550 yHistogram[ srcBlock[srcStride*12 + 4] ]++; | |
| 3551 } | |
| 3552 #ifdef HAVE_MMX | |
| 3553 asm volatile( | |
| 3554 "movd %0, %%mm7 \n\t" | |
| 3555 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | |
| 3556 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | |
| 3557 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | |
| 3558 "movq %%mm7, pQPb \n\t" | |
| 3559 : : "r" (QP) | |
| 3560 ); | |
| 3561 #endif | |
| 3562 | |
| 3563 #ifdef MORE_TIMING | |
| 3564 T0= rdtsc(); | |
| 3565 #endif | |
| 3566 | |
| 3567 #ifdef HAVE_MMX2 | |
| 3568 /* | |
| 3569 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3570 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3571 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3572 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3573 */ | |
| 3574 | |
| 3575 asm( | |
| 3576 "movl %4, %%eax \n\t" | |
| 3577 "shrl $2, %%eax \n\t" | |
| 3578 "andl $6, %%eax \n\t" | |
| 3579 "addl %5, %%eax \n\t" | |
| 3580 "movl %%eax, %%ebx \n\t" | |
| 3581 "imul %1, %%eax \n\t" | |
| 3582 "imul %3, %%ebx \n\t" | |
| 3583 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3584 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3585 "addl %1, %%eax \n\t" | |
| 3586 "addl %3, %%ebx \n\t" | |
| 3587 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3588 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3589 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
| 3590 "m" (x), "m" (copyAhead) | |
| 3591 : "%eax", "%ebx" | |
| 3592 ); | |
| 3593 | |
| 3594 #elif defined(HAVE_3DNOW) | |
| 3595 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 3596 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3597 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3598 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3599 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3600 */ | |
| 3601 #endif | |
| 3602 | |
| 3603 #ifdef PP_FUNNY_STRIDE | |
| 3604 //can we mess with a 8x16 block, if not use a temp buffer, yes again | |
| 3605 if(x+7 >= width) | |
| 3606 { | |
| 3607 int i; | |
| 3608 dstBlockPtrBackup= dstBlock; | |
| 3609 srcBlockPtrBackup= srcBlock; | |
| 3610 | |
| 3611 for(i=0;i<BLOCK_SIZE*2; i++) | |
| 3612 { | |
| 3613 memcpy(tempSrcBlock+i*srcStride, srcBlock+i*srcStride, width-x); | |
| 3614 memcpy(tempDstBlock+i*dstStride, dstBlock+i*dstStride, width-x); | |
| 3615 } | |
| 3616 | |
| 3617 dstBlock= tempDstBlock; | |
| 3618 srcBlock= tempSrcBlock; | |
| 3619 } | |
| 3620 #endif | |
| 3621 | |
| 3622 blockCopy(dstBlock + dstStride*copyAhead, dstStride, | |
| 3623 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); | |
| 3624 | |
| 3625 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 3626 deInterlaceInterpolateLinear(dstBlock, dstStride); | |
| 3627 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
| 3628 deInterlaceBlendLinear(dstBlock, dstStride); | |
| 3629 else if(mode & MEDIAN_DEINT_FILTER) | |
| 3630 deInterlaceMedian(dstBlock, dstStride); | |
| 3631 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
| 3632 deInterlaceInterpolateCubic(dstBlock, dstStride); | |
| 3633 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
| 3634 deInterlaceBlendCubic(dstBlock, dstStride); | |
| 3635 */ | |
| 3636 | |
| 3637 /* only deblock if we have 2 blocks */ | |
| 3638 if(y + 8 < height) | |
| 3639 { | |
| 3640 #ifdef MORE_TIMING | |
| 3641 T1= rdtsc(); | |
| 3642 memcpyTime+= T1-T0; | |
| 3643 T0=T1; | |
| 3644 #endif | |
| 3645 if(mode & V_RK1_FILTER) | |
| 3646 vertRK1Filter(dstBlock, stride, QP); | |
| 3647 else if(mode & V_X1_FILTER) | |
| 3648 vertX1Filter(dstBlock, stride, QP); | |
| 3649 else if(mode & V_DEBLOCK) | |
| 3650 { | |
| 3651 if( isVertDC(dstBlock, stride)) | |
| 3652 { | |
| 3653 if(isVertMinMaxOk(dstBlock, stride, QP)) | |
| 3654 doVertLowPass(dstBlock, stride, QP); | |
| 3655 } | |
| 3656 else | |
| 3657 doVertDefFilter(dstBlock, stride, QP); | |
| 3658 } | |
| 3659 #ifdef MORE_TIMING | |
| 3660 T1= rdtsc(); | |
| 3661 vertTime+= T1-T0; | |
| 3662 T0=T1; | |
| 3663 #endif | |
| 3664 } | |
| 3665 | |
| 3666 #ifdef HAVE_MMX | |
| 3667 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); | |
| 3668 #endif | |
| 3669 /* check if we have a previous block to deblock it with dstBlock */ | |
| 3670 if(x - 8 >= 0) | |
| 3671 { | |
| 3672 #ifdef MORE_TIMING | |
| 3673 T0= rdtsc(); | |
| 3674 #endif | |
| 3675 #ifdef HAVE_MMX | |
| 3676 if(mode & H_RK1_FILTER) | |
| 3677 vertRK1Filter(tempBlock1, 16, QP); | |
| 3678 else if(mode & H_X1_FILTER) | |
| 3679 vertX1Filter(tempBlock1, 16, QP); | |
| 3680 else if(mode & H_DEBLOCK) | |
| 3681 { | |
| 3682 if( isVertDC(tempBlock1, 16) ) | |
| 3683 { | |
| 3684 if(isVertMinMaxOk(tempBlock1, 16, QP)) | |
| 3685 doVertLowPass(tempBlock1, 16, QP); | |
| 3686 } | |
| 3687 else | |
| 3688 doVertDefFilter(tempBlock1, 16, QP); | |
| 3689 } | |
| 3690 | |
| 3691 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); | |
| 3692 | |
| 3693 #else | |
| 3694 if(mode & H_X1_FILTER) | |
| 3695 horizX1Filter(dstBlock-4, stride, QP); | |
| 3696 else if(mode & H_DEBLOCK) | |
| 3697 { | |
| 3698 if( isHorizDC(dstBlock-4, stride)) | |
| 3699 { | |
| 3700 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) | |
| 3701 doHorizLowPass(dstBlock-4, stride, QP); | |
| 3702 } | |
| 3703 else | |
| 3704 doHorizDefFilter(dstBlock-4, stride, QP); | |
| 3705 } | |
| 3706 #endif | |
| 3707 #ifdef MORE_TIMING | |
| 3708 T1= rdtsc(); | |
| 3709 horizTime+= T1-T0; | |
| 3710 T0=T1; | |
| 3711 #endif | |
| 3712 if(mode & DERING) | |
| 3713 { | |
| 3714 //FIXME filter first line | |
| 3715 if(y>0) dering(dstBlock - stride - 8, stride, QP); | |
| 3716 } | |
| 3717 | |
| 3718 if(mode & TEMP_NOISE_FILTER) | |
| 3719 { | |
| 3720 tempNoiseReducer(dstBlock-8, stride, | |
| 3721 tempBlured[isColor] + y*dstStride + x, | |
| 3722 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3723 ppMode->maxTmpNoise); | |
| 3724 } | |
| 3725 } | |
| 3726 | |
| 3727 #ifdef PP_FUNNY_STRIDE | |
| 3728 /* did we use a tmp-block buffer */ | |
| 3729 if(x+7 >= width) | |
| 3730 { | |
| 3731 int i; | |
| 3732 dstBlock= dstBlockPtrBackup; | |
| 3733 srcBlock= srcBlockPtrBackup; | |
| 3734 | |
| 3735 for(i=0;i<BLOCK_SIZE*2; i++) | |
| 3736 { | |
| 3737 memcpy(dstBlock+i*dstStride, tempDstBlock+i*dstStride, width-x); | |
| 3738 } | |
| 3739 } | |
| 3740 #endif | |
| 3741 | |
| 3742 dstBlock+=8; | |
| 3743 srcBlock+=8; | |
| 3744 | |
| 3745 #ifdef HAVE_MMX | |
| 3746 tmpXchg= tempBlock1; | |
| 3747 tempBlock1= tempBlock2; | |
| 3748 tempBlock2 = tmpXchg; | |
| 3749 #endif | |
| 3750 } | |
| 3751 | |
| 3752 if(mode & DERING) | |
| 3753 { | |
| 3754 if(y > 0) dering(dstBlock - dstStride - 8, dstStride, QP); | |
| 3755 } | |
| 3756 | |
| 3757 if((mode & TEMP_NOISE_FILTER)) | |
| 3758 { | |
| 3759 tempNoiseReducer(dstBlock-8, dstStride, | |
| 3760 tempBlured[isColor] + y*dstStride + x, | |
| 3761 tempBluredPast[isColor] + (y>>3)*256 + (x>>3), | |
| 3762 ppMode->maxTmpNoise); | |
| 3763 } | |
| 3764 | |
| 3765 /* did we use a tmp buffer for the last lines*/ | |
| 3766 if(y+15 >= height) | |
| 3767 { | |
| 3768 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 3769 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | |
| 3770 } | |
| 3771 /* | |
| 3772 for(x=0; x<width; x+=32) | |
| 3773 { | |
| 3774 volatile int i; | |
| 3775 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | |
| 3776 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | |
| 3777 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; | |
| 3778 // + dstBlock[x +13*dstStride] | |
| 3779 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | |
| 3780 }*/ | |
| 3781 } | |
| 3782 #ifdef HAVE_3DNOW | |
| 3783 asm volatile("femms"); | |
| 3784 #elif defined (HAVE_MMX) | |
| 3785 asm volatile("emms"); | |
| 3786 #endif | |
| 3787 | |
| 3788 #ifdef TIMING | |
| 3789 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | |
| 3790 sumTime= rdtsc() - sumTime; | |
| 3791 if(!isColor) | |
| 3792 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", | |
| 3793 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), | |
| 3794 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | |
| 3795 , black, white); | |
| 3796 #endif | |
| 3797 #ifdef DEBUG_BRIGHTNESS | |
| 3798 if(!isColor) | |
| 3799 { | |
| 3800 int max=1; | |
| 3801 int i; | |
| 3802 for(i=0; i<256; i++) | |
| 3803 if(yHistogram[i] > max) max=yHistogram[i]; | |
| 3804 | |
| 3805 for(i=1; i<256; i++) | |
| 3806 { | |
| 3807 int x; | |
| 3808 int start=yHistogram[i-1]/(max/256+1); | |
| 3809 int end=yHistogram[i]/(max/256+1); | |
| 3810 int inc= end > start ? 1 : -1; | |
| 3811 for(x=start; x!=end+inc; x+=inc) | |
| 3812 dst[ i*dstStride + x]+=128; | |
| 3813 } | |
| 3814 | |
| 3815 for(i=0; i<100; i+=2) | |
| 3816 { | |
| 3817 dst[ (white)*dstStride + i]+=128; | |
| 3818 dst[ (black)*dstStride + i]+=128; | |
| 3819 } | |
| 3820 | |
| 3821 } | |
| 3822 #endif | |
| 3823 | |
| 3824 } |
