comparison: libpostproc/postprocess.c @ 96:29ac11dc53d3 (libavcodec.hg)
fixed a bug in the horizontal default filter
3dnow version of the Horizontal & Vertical Lowpass filters
mmx version of the Horizontal Default filter
mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar
added mode flags & quality2mode function
| author | arpi |
|---|---|
| date | Wed, 10 Oct 2001 22:21:19 +0000 |
| parents | 8bce253b537c |
| children | e57b1d38d71f |
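The "3dnow version of the Horizontal & Vertical Lowpass filters" mentioned in the commit message is realized in the diff below by replacing literal `pavgb` instructions with a `PAVGB()` macro. The macro definition itself is not part of the hunks shown here; a plausible sketch, assuming the `HAVE_MMX2`/`HAVE_3DNOW` configuration flags already used elsewhere in this file, would be:

```c
/* Hypothetical sketch -- the real definition lives outside the hunks shown.
 * MMX2 provides the packed unsigned byte average as pavgb; 3DNow! provides
 * the equivalent pavgusb, which is why the diff routes every average through
 * this string-pasting macro instead of a hard-coded instruction. */
#ifdef HAVE_3DNOW
#define PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#elif defined (HAVE_MMX2)
#define PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#endif
```

With this, `PAVGB(%%mm6, %%mm0)` expands to the same `"pavgb %%mm6, %%mm0 \n\t"` string the old code contained, so one inline-assembly body can serve both instruction sets.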
| 95:8bce253b537c (old) | 96:29ac11dc53d3 (new) |
|---|---|
| 15 along with this program; if not, write to the Free Software | 15 along with this program; if not, write to the Free Software |
| 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
| 17 */ | 17 */ |
| 18 | 18 |
| 19 /* | 19 /* |
| 20 C MMX MMX2 | 20 C MMX MMX2 3DNow* |
| 21 isVertDC Ec Ec | 21 isVertDC Ec Ec |
| 22 isVertMinMaxOk Ec Ec | 22 isVertMinMaxOk Ec Ec |
| 23 doVertLowPass E e | 23 doVertLowPass E e e* |
| 24 doVertDefFilter Ec Ec Ec | 24 doVertDefFilter Ec Ec Ec |
| 25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
| 26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a |
| 27 doHorizLowPass E a | 27 doHorizLowPass E a a* |
| 28 doHorizDefFilter E a | 28 doHorizDefFilter E ac ac |
| 29 deRing | 29 deRing |
| 30 | 30 |
| 31 * i dont have a 3dnow CPU -> its untested | |
| 31 E = Exact implementation | 32 E = Exact implementation |
| 32 e = allmost exact implementation | 33 e = allmost exact implementation |
| 33 a = alternative / approximate impl | 34 a = alternative / approximate impl |
| 34 c = checked against the other implementations (-vo md5) | 35 c = checked against the other implementations (-vo md5) |
| 35 */ | 36 */ |
| 37 /* | 38 /* |
| 38 TODO: | 39 TODO: |
| 39 verify that everything workes as it should | 40 verify that everything workes as it should |
| 40 reduce the time wasted on the mem transfer | 41 reduce the time wasted on the mem transfer |
| 41 implement dering | 42 implement dering |
| 42 implement everything in C at least | 43 implement everything in C at least (done at the moment but ...) |
| 43 figure range of QP out (assuming <256 for now) | 44 figure range of QP out (assuming <256 for now) |
| 44 unroll stuff if instructions depend too much on the prior one | 45 unroll stuff if instructions depend too much on the prior one |
| 45 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 46 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
| 46 move YScale thing to the end instead of fixing QP | 47 move YScale thing to the end instead of fixing QP |
| 48 write a faster and higher quality deblocking filter :) | |
| 47 ... | 49 ... |
| 48 | 50 |
| 49 Notes: | 51 Notes: |
| 50 | 52 |
| 53 */ | |
| 54 | |
| 55 /* | |
| 56 Changelog: | |
| 57 0.1.2 | |
| 58 fixed a bug in the horizontal default filter | |
| 59 3dnow version of the Horizontal & Vertical Lowpass filters | |
| 60 mmx version of the Horizontal Default filter | |
| 61 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar | |
| 62 added mode flags & quality2mode function | |
| 63 0.1.1 | |
| 51 */ | 64 */ |
| 52 | 65 |
| 53 | 66 |
| 54 #include <inttypes.h> | 67 #include <inttypes.h> |
| 55 #include <stdio.h> | 68 #include <stdio.h> |
| 56 #include "../config.h" | 69 #include "../config.h" |
| 70 //#undef HAVE_MMX2 | |
| 71 //#define HAVE_3DNOW | |
| 72 //#undef HAVE_MMX | |
| 57 #include "postprocess.h" | 73 #include "postprocess.h" |
| 58 //#undef HAVE_MMX2 | |
| 59 //#undef HAVE_MMX | |
| 60 | |
| 61 | 74 |
| 62 | 75 |
| 63 static uint64_t packedYOffset= 0x0000000000000000LL; | 76 static uint64_t packedYOffset= 0x0000000000000000LL; |
| 64 static uint64_t packedYScale= 0x0100010001000100LL; | 77 static uint64_t packedYScale= 0x0100010001000100LL; |
| 65 static uint64_t w05= 0x0005000500050005LL; | 78 static uint64_t w05= 0x0005000500050005LL; |
| 69 static uint64_t bm00010000= 0x000000FF00000000LL; | 82 static uint64_t bm00010000= 0x000000FF00000000LL; |
| 70 static uint64_t bm00001000= 0x00000000FF000000LL; | 83 static uint64_t bm00001000= 0x00000000FF000000LL; |
| 71 static uint64_t bm10000000= 0xFF00000000000000LL; | 84 static uint64_t bm10000000= 0xFF00000000000000LL; |
| 72 static uint64_t bm10000001= 0xFF000000000000FFLL; | 85 static uint64_t bm10000001= 0xFF000000000000FFLL; |
| 73 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; | 86 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
| 87 static uint64_t bm00000011= 0x000000000000FFFFLL; | |
| 88 static uint64_t bm11000000= 0xFFFF000000000000LL; | |
| 74 static uint64_t bm00011000= 0x000000FFFF000000LL; | 89 static uint64_t bm00011000= 0x000000FFFF000000LL; |
| 75 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; | 90 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
| 76 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; | 91 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
| 77 static uint64_t b00= 0x0000000000000000LL; | 92 static uint64_t b00= 0x0000000000000000LL; |
| 78 static uint64_t b02= 0x0202020202020202LL; | 93 static uint64_t b02= 0x0202020202020202LL; |
| 79 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; | 94 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
| 80 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; | 95 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
| 96 static uint64_t b20= 0x2020202020202020LL; | |
| 97 static uint64_t b80= 0x8080808080808080LL; | |
| 81 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; | 98 static uint64_t b7E= 0x7E7E7E7E7E7E7E7ELL; |
| 82 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; | 99 static uint64_t b7C= 0x7C7C7C7C7C7C7C7CLL; |
| 83 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; | 100 static uint64_t b3F= 0x3F3F3F3F3F3F3F3FLL; |
| 84 static uint64_t temp0=0; | 101 static uint64_t temp0=0; |
| 85 static uint64_t temp1=0; | 102 static uint64_t temp1=0; |
| 310 */ | 327 */ |
| 311 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | 328 static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
| 312 { | 329 { |
| 313 // QP= 64; | 330 // QP= 64; |
| 314 | 331 |
| 315 #ifdef HAVE_MMX2 | 332 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 333 //#ifdef HAVE_MMX2 | |
| 316 asm volatile( //"movv %0 %1 %2\n\t" | 334 asm volatile( //"movv %0 %1 %2\n\t" |
| 317 "pushl %0 \n\t" | 335 "pushl %0 \n\t" |
| 318 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 336 "movq pQPb, %%mm0 \n\t" // QP,..., QP |
| 319 // "movq bFF , %%mm0 \n\t" // QP,..., QP | 337 // "movq bFF , %%mm0 \n\t" // QP,..., QP |
| 320 | 338 |
| 370 "pand b3F, %%mm0 \n\t" | 388 "pand b3F, %%mm0 \n\t" |
| 371 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 | 389 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 |
| 372 */ | 390 */ |
| 373 "movq (%0, %1), %%mm0 \n\t" // 1 | 391 "movq (%0, %1), %%mm0 \n\t" // 1 |
| 374 "movq %%mm0, %%mm1 \n\t" // 1 | 392 "movq %%mm0, %%mm1 \n\t" // 1 |
| 375 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | 393 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 376 "pavgb %%mm6, %%mm0 \n\t" //3 1 /4 | 394 PAVGB(%%mm6, %%mm0) //3 1 /4 |
| 377 | 395 |
| 378 "movq (%0, %1, 4), %%mm2 \n\t" // 1 | 396 "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
| 379 "movq %%mm2, %%mm5 \n\t" // 1 | 397 "movq %%mm2, %%mm5 \n\t" // 1 |
| 380 "pavgb (%%eax), %%mm2 \n\t" // 11 /2 | 398 PAVGB((%%eax), %%mm2) // 11 /2 |
| 381 "pavgb (%0, %1, 2), %%mm2 \n\t" // 211 /4 | 399 PAVGB((%0, %1, 2), %%mm2) // 211 /4 |
| 382 "movq %%mm2, %%mm3 \n\t" // 211 /4 | 400 "movq %%mm2, %%mm3 \n\t" // 211 /4 |
| 383 "movq (%0), %%mm4 \n\t" // 1 | 401 "movq (%0), %%mm4 \n\t" // 1 |
| 384 "pavgb %%mm4, %%mm3 \n\t" // 4 211 /8 | 402 PAVGB(%%mm4, %%mm3) // 4 211 /8 |
| 385 "pavgb %%mm0, %%mm3 \n\t" //642211 /16 | 403 PAVGB(%%mm0, %%mm3) //642211 /16 |
| 386 "movq %%mm3, (%0) \n\t" // X | 404 "movq %%mm3, (%0) \n\t" // X |
| 387 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 | 405 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
| 388 "movq %%mm1, %%mm0 \n\t" // 1 | 406 "movq %%mm1, %%mm0 \n\t" // 1 |
| 389 "pavgb %%mm6, %%mm0 \n\t" //1 1 /2 | 407 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 390 "movq %%mm4, %%mm3 \n\t" // 1 | 408 "movq %%mm4, %%mm3 \n\t" // 1 |
| 391 "pavgb (%0,%1,2), %%mm3 \n\t" // 1 1 /2 | 409 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
| 392 "pavgb (%%eax,%1,2), %%mm5 \n\t" // 11 /2 | 410 PAVGB((%%eax,%1,2), %%mm5) // 11 /2 |
| 393 "pavgb (%%eax), %%mm5 \n\t" // 211 /4 | 411 PAVGB((%%eax), %%mm5) // 211 /4 |
| 394 "pavgb %%mm5, %%mm3 \n\t" // 2 2211 /8 | 412 PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
| 395 "pavgb %%mm0, %%mm3 \n\t" //4242211 /16 | 413 PAVGB(%%mm0, %%mm3) //4242211 /16 |
| 396 "movq %%mm3, (%0,%1) \n\t" // X | 414 "movq %%mm3, (%0,%1) \n\t" // X |
| 397 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 | 415 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
| 398 "pavgb %%mm4, %%mm6 \n\t" //11 /2 | 416 PAVGB(%%mm4, %%mm6) //11 /2 |
| 399 "movq (%%ebx), %%mm0 \n\t" // 1 | 417 "movq (%%ebx), %%mm0 \n\t" // 1 |
| 400 "pavgb (%%eax, %1, 2), %%mm0 \n\t" // 11/2 | 418 PAVGB((%%eax, %1, 2), %%mm0) // 11/2 |
| 401 "movq %%mm0, %%mm3 \n\t" // 11/2 | 419 "movq %%mm0, %%mm3 \n\t" // 11/2 |
| 402 "pavgb %%mm1, %%mm0 \n\t" // 2 11/4 | 420 PAVGB(%%mm1, %%mm0) // 2 11/4 |
| 403 "pavgb %%mm6, %%mm0 \n\t" //222 11/8 | 421 PAVGB(%%mm6, %%mm0) //222 11/8 |
| 404 "pavgb %%mm2, %%mm0 \n\t" //22242211/16 | 422 PAVGB(%%mm2, %%mm0) //22242211/16 |
| 405 "movq (%0, %1, 2), %%mm2 \n\t" // 1 | 423 "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
| 406 "movq %%mm0, (%0, %1, 2) \n\t" // X | 424 "movq %%mm0, (%0, %1, 2) \n\t" // X |
| 407 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 | 425 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
| 408 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 426 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 409 "pavgb (%%ebx), %%mm0 \n\t" // 11 /2 | 427 PAVGB((%%ebx), %%mm0) // 11 /2 |
| 410 "pavgb %%mm0, %%mm6 \n\t" //11 11 /4 | 428 PAVGB(%%mm0, %%mm6) //11 11 /4 |
| 411 "pavgb %%mm1, %%mm4 \n\t" // 11 /2 | 429 PAVGB(%%mm1, %%mm4) // 11 /2 |
| 412 "pavgb %%mm2, %%mm1 \n\t" // 11 /2 | 430 PAVGB(%%mm2, %%mm1) // 11 /2 |
| 413 "pavgb %%mm1, %%mm6 \n\t" //1122 11 /8 | 431 PAVGB(%%mm1, %%mm6) //1122 11 /8 |
| 414 "pavgb %%mm5, %%mm6 \n\t" //112242211 /16 | 432 PAVGB(%%mm5, %%mm6) //112242211 /16 |
| 415 "movq (%%eax), %%mm5 \n\t" // 1 | 433 "movq (%%eax), %%mm5 \n\t" // 1 |
| 416 "movq %%mm6, (%%eax) \n\t" // X | 434 "movq %%mm6, (%%eax) \n\t" // X |
| 417 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 | 435 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
| 418 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 | 436 "movq (%%eax, %1, 4), %%mm6 \n\t" // 1 |
| 419 "pavgb %%mm7, %%mm6 \n\t" // 11 /2 | 437 PAVGB(%%mm7, %%mm6) // 11 /2 |
| 420 "pavgb %%mm4, %%mm6 \n\t" // 11 11 /4 | 438 PAVGB(%%mm4, %%mm6) // 11 11 /4 |
| 421 "pavgb %%mm3, %%mm6 \n\t" // 11 2211 /8 | 439 PAVGB(%%mm3, %%mm6) // 11 2211 /8 |
| 422 "pavgb %%mm5, %%mm2 \n\t" // 11 /2 | 440 PAVGB(%%mm5, %%mm2) // 11 /2 |
| 423 "movq (%0, %1, 4), %%mm4 \n\t" // 1 | 441 "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
| 424 "pavgb %%mm4, %%mm2 \n\t" // 112 /4 | 442 PAVGB(%%mm4, %%mm2) // 112 /4 |
| 425 "pavgb %%mm2, %%mm6 \n\t" // 112242211 /16 | 443 PAVGB(%%mm2, %%mm6) // 112242211 /16 |
| 426 "movq %%mm6, (%0, %1, 4) \n\t" // X | 444 "movq %%mm6, (%0, %1, 4) \n\t" // X |
| 427 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 | 445 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
| 428 "pavgb %%mm7, %%mm1 \n\t" // 11 2 /4 | 446 PAVGB(%%mm7, %%mm1) // 11 2 /4 |
| 429 "pavgb %%mm4, %%mm5 \n\t" // 11 /2 | 447 PAVGB(%%mm4, %%mm5) // 11 /2 |
| 430 "pavgb %%mm5, %%mm0 \n\t" // 11 11 /4 | 448 PAVGB(%%mm5, %%mm0) // 11 11 /4 |
| 431 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | 449 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 432 "pavgb %%mm6, %%mm1 \n\t" // 11 4 2 /8 | 450 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 433 "pavgb %%mm0, %%mm1 \n\t" // 11224222 /16 | 451 PAVGB(%%mm0, %%mm1) // 11224222 /16 |
| 434 // "pxor %%mm1, %%mm1 \n\t" | 452 // "pxor %%mm1, %%mm1 \n\t" |
| 435 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | 453 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
| 436 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | 454 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 437 "pavgb (%%ebx), %%mm2 \n\t" // 112 4 /8 | 455 PAVGB((%%ebx), %%mm2) // 112 4 /8 |
| 438 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 456 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 439 "pavgb %%mm0, %%mm6 \n\t" // 1 1 /2 | 457 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 440 "pavgb %%mm7, %%mm6 \n\t" // 1 12 /4 | 458 PAVGB(%%mm7, %%mm6) // 1 12 /4 |
| 441 "pavgb %%mm2, %%mm6 \n\t" // 1122424 /4 | 459 PAVGB(%%mm2, %%mm6) // 1122424 /4 |
| 442 // "pxor %%mm6, %%mm6 \n\t" | 460 // "pxor %%mm6, %%mm6 \n\t" |
| 443 "movq %%mm6, (%%ebx) \n\t" // X | 461 "movq %%mm6, (%%ebx) \n\t" // X |
| 444 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | 462 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 445 "pavgb %%mm7, %%mm5 \n\t" // 11 2 /4 | 463 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 446 "pavgb %%mm7, %%mm5 \n\t" // 11 6 /8 | 464 PAVGB(%%mm7, %%mm5) // 11 6 /8 |
| 447 | 465 |
| 448 "pavgb %%mm3, %%mm0 \n\t" // 112 /4 | 466 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 449 "pavgb %%mm0, %%mm5 \n\t" // 112246 /16 | 467 PAVGB(%%mm0, %%mm5) // 112246 /16 |
| 450 // "pxor %%mm5, %%mm5 \n\t" | 468 // "pxor %%mm5, %%mm5 \n\t" |
| 451 // "movq pQPb, %%mm5 \n\t" | 469 // "movq pQPb, %%mm5 \n\t" |
| 452 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | 470 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 453 "popl %0\n\t" | 471 "popl %0\n\t" |
| 454 | 472 |
| 455 : | 473 : |
| 456 : "r" (src), "r" (stride) | 474 : "r" (src), "r" (stride) |
| 457 : "%eax", "%ebx" | 475 : "%eax", "%ebx" |
| 458 ); | 476 ); |
| 459 | |
| 460 #else | 477 #else |
| 461 const int l1= stride; | 478 const int l1= stride; |
| 462 const int l2= stride + l1; | 479 const int l2= stride + l1; |
| 463 const int l3= stride + l2; | 480 const int l3= stride + l2; |
| 464 const int l4= stride + l3; | 481 const int l4= stride + l3; |
| 496 src++; | 513 src++; |
| 497 } | 514 } |
| 498 | 515 |
| 499 #endif | 516 #endif |
| 500 } | 517 } |
| 518 | |
| 519 /** | |
| 520 * Experimental implementation of the filter (Algorithm 1) described in a paper from Ramkishor & Karandikar | |
| 521 * values are correctly clipped (MMX2) | |
| 522 * values are wraparound (C) | |
| 523 * conclusion: its fast, but introduces ugly horizontal patterns if there is a continious gradient | |
| 524 0 8 16 24 | |
| 525 x = 8 | |
| 526 x/2 = 4 | |
| 527 x/8 = 1 | |
| 528 1 12 12 23 | |
| 529 */ | |
| 530 static inline void vertRKFilter(uint8_t *src, int stride, int QP) | |
| 531 { | |
| 532 #ifdef HAVE_MMX2 | |
| 533 // FIXME rounding | |
| 534 asm volatile( | |
| 535 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 536 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 537 "leal (%0, %1), %%eax \n\t" | |
| 538 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 539 // 0 1 2 3 4 5 6 7 8 9 | |
| 540 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 541 "movq pQPb, %%mm0 \n\t" // QP,..., QP | |
| 542 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | |
| 543 "paddusb b02, %%mm0 \n\t" | |
| 544 "psrlw $2, %%mm0 \n\t" | |
| 545 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4 | |
| 546 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | |
| 547 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | |
| 548 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
| 549 "movq %%mm2, %%mm4 \n\t" // line 4 | |
| 550 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | |
| 551 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | |
| 552 "pavgb %%mm3, %%mm5 \n\t" | |
| 553 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | |
| 554 "psubusb %%mm3, %%mm4 \n\t" | |
| 555 "psubusb %%mm2, %%mm3 \n\t" | |
| 556 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | |
| 557 "psubusb %%mm0, %%mm4 \n\t" | |
| 558 "pcmpeqb %%mm7, %%mm4 \n\t" | |
| 559 "pand %%mm4, %%mm5 \n\t" // d/2 | |
| 560 | |
| 561 // "paddb %%mm6, %%mm2 \n\t" // line 4 + 0x80 | |
| 562 "paddb %%mm5, %%mm2 \n\t" | |
| 563 // "psubb %%mm6, %%mm2 \n\t" | |
| 564 "movq %%mm2, (%0,%1, 4) \n\t" | |
| 565 | |
| 566 "movq (%%ebx), %%mm2 \n\t" | |
| 567 // "paddb %%mm6, %%mm2 \n\t" // line 5 + 0x80 | |
| 568 "psubb %%mm5, %%mm2 \n\t" | |
| 569 // "psubb %%mm6, %%mm2 \n\t" | |
| 570 "movq %%mm2, (%%ebx) \n\t" | |
| 571 | |
| 572 "paddb %%mm6, %%mm5 \n\t" | |
| 573 "psrlw $2, %%mm5 \n\t" | |
| 574 "pand b3F, %%mm5 \n\t" | |
| 575 "psubb b20, %%mm5 \n\t" // (l5-l4)/8 | |
| 576 | |
| 577 "movq (%%eax, %1, 2), %%mm2 \n\t" | |
| 578 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | |
| 579 "paddsb %%mm5, %%mm2 \n\t" | |
| 580 "psubb %%mm6, %%mm2 \n\t" | |
| 581 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 582 | |
| 583 "movq (%%ebx, %1), %%mm2 \n\t" | |
| 584 "paddb %%mm6, %%mm2 \n\t" // line 6 + 0x80 | |
| 585 "psubsb %%mm5, %%mm2 \n\t" | |
| 586 "psubb %%mm6, %%mm2 \n\t" | |
| 587 "movq %%mm2, (%%ebx, %1) \n\t" | |
| 588 | |
| 589 : | |
| 590 : "r" (src), "r" (stride) | |
| 591 : "%eax", "%ebx" | |
| 592 ); | |
| 593 #else | |
| 594 const int l1= stride; | |
| 595 const int l2= stride + l1; | |
| 596 const int l3= stride + l2; | |
| 597 const int l4= stride + l3; | |
| 598 const int l5= stride + l4; | |
| 599 const int l6= stride + l5; | |
| 600 const int l7= stride + l6; | |
| 601 const int l8= stride + l7; | |
| 602 const int l9= stride + l8; | |
| 603 for(int x=0; x<BLOCK_SIZE; x++) | |
| 604 { | |
| 605 if(ABS(src[l4]-src[l5]) < QP + QP/4) | |
| 606 { | |
| 607 int x = src[l5] - src[l4]; | |
| 608 | |
| 609 src[l3] +=x/8; | |
| 610 src[l4] +=x/2; | |
| 611 src[l5] -=x/2; | |
| 612 src[l6] -=x/8; | |
| 613 } | |
| 614 src++; | |
| 615 } | |
| 616 | |
| 617 #endif | |
| 618 } | |
| 619 | |
| 620 /** | |
| 621 * Experimental Filter 1 | |
| 622 */ | |
| 623 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | |
| 624 { | |
| 625 #ifdef HAVE_MMX2X | |
| 626 // FIXME | |
| 627 asm volatile( | |
| 628 | |
| 629 : | |
| 630 : "r" (src), "r" (stride) | |
| 631 : "%eax", "%ebx" | |
| 632 ); | |
| 633 #else | |
| 634 const int l1= stride; | |
| 635 const int l2= stride + l1; | |
| 636 const int l3= stride + l2; | |
| 637 const int l4= stride + l3; | |
| 638 const int l5= stride + l4; | |
| 639 const int l6= stride + l5; | |
| 640 const int l7= stride + l6; | |
| 641 const int l8= stride + l7; | |
| 642 const int l9= stride + l8; | |
| 643 for(int x=0; x<BLOCK_SIZE; x++) | |
| 644 { | |
| 645 int v2= src[l2]; | |
| 646 int v3= src[l3]; | |
| 647 int v4= src[l4]; | |
| 648 int v5= src[l5]; | |
| 649 int v6= src[l6]; | |
| 650 int v7= src[l7]; | |
| 651 | |
| 652 if(ABS(v4-v5)<QP && ABS(v4-v5) - (ABS(v3-v4) + ABS(v5-v6))>0 ) | |
| 653 { | |
| 654 src[l3] = (6*v2 + 4*v3 + 3*v4 + 2*v5 + v6 )/16; | |
| 655 src[l4] = (3*v2 + 3*v3 + 4*v4 + 3*v5 + 2*v6 + v7 )/16; | |
| 656 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | |
| 657 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | |
| 658 } | |
| 659 src++; | |
| 660 } | |
| 661 | |
| 662 #endif | |
| 663 } | |
| 664 | |
| 501 | 665 |
| 502 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 666 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
| 503 { | 667 { |
| 504 #ifdef HAVE_MMX | 668 #ifdef HAVE_MMX |
| 505 src+= stride; | 669 src+= stride; |
| 913 #endif | 1077 #endif |
| 914 } | 1078 } |
| 915 | 1079 |
| 916 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | 1080 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) |
| 917 { | 1081 { |
| 918 #ifdef HAVE_MMX2 | 1082 #ifdef HAVE_MMX |
| 919 asm volatile( | 1083 asm volatile( |
| 920 "pushl %0 \n\t" | 1084 "pushl %0 \n\t" |
| 921 "pxor %%mm7, %%mm7 \n\t" | 1085 "pxor %%mm7, %%mm7 \n\t" |
| 922 "movq bm00001000, %%mm6 \n\t" | 1086 "movq bm00001000, %%mm6 \n\t" |
| 923 "movd %2, %%mm5 \n\t" // QP | 1087 "movd %2, %%mm5 \n\t" // QP |
| 928 "pxor %%mm5, %%mm5 \n\t" // 0 | 1092 "pxor %%mm5, %%mm5 \n\t" // 0 |
| 929 "psubb %%mm4, %%mm5 \n\t" // -QP | 1093 "psubb %%mm4, %%mm5 \n\t" // -QP |
| 930 "leal tempBlock, %%eax \n\t" | 1094 "leal tempBlock, %%eax \n\t" |
| 931 | 1095 |
| 932 //FIXME? "unroll by 2" and mix | 1096 //FIXME? "unroll by 2" and mix |
| 933 #define HDF(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | 1097 #ifdef HAVE_MMX2 |
| 1098 #define HDF(i) \ | |
| 1099 "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 934 "movq %%mm0, %%mm1 \n\t"\ | 1100 "movq %%mm0, %%mm1 \n\t"\ |
| 935 "movq %%mm0, %%mm2 \n\t"\ | 1101 "movq %%mm0, %%mm2 \n\t"\ |
| 936 "psrlq $8, %%mm1 \n\t"\ | 1102 "psrlq $8, %%mm1 \n\t"\ |
| 937 "psubusb %%mm1, %%mm2 \n\t"\ | 1103 "psubusb %%mm1, %%mm2 \n\t"\ |
| 938 "psubusb %%mm0, %%mm1 \n\t"\ | 1104 "psubusb %%mm0, %%mm1 \n\t"\ |
| 939 "por %%mm2, %%mm1 \n\t" /* |px - p(x+1)| */\ | 1105 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ |
| 940 "pcmpeqb %%mm7, %%mm2 \n\t" /* sgn[px - p(x+1)] */\ | 1106 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ |
| 941 "pshufw $0xAA, %%mm1, %%mm3 \n\t"\ | 1107 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ |
| 942 "pminub %%mm1, %%mm3 \n\t"\ | 1108 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ |
| 943 "psrlq $16, %%mm3 \n\t"\ | 1109 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ |
| 1110 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
| 1111 "paddb %%mm5, %%mm1 \n\t"\ | |
| 1112 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 1113 "psrlw $2, %%mm1 \n\t"\ | |
| 1114 "pxor %%mm2, %%mm1 \n\t"\ | |
| 1115 "psubb %%mm2, %%mm1 \n\t"\ | |
| 1116 "pand %%mm6, %%mm1 \n\t"\ | |
| 1117 "psubb %%mm1, %%mm0 \n\t"\ | |
| 1118 "psllq $8, %%mm1 \n\t"\ | |
| 1119 "paddb %%mm1, %%mm0 \n\t"\ | |
| 1120 "movd %%mm0, (%0) \n\t"\ | |
| 1121 "psrlq $32, %%mm0 \n\t"\ | |
| 1122 "movd %%mm0, 4(%0) \n\t" | |
| 1123 #else | |
| 1124 #define HDF(i)\ | |
| 1125 "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 1126 "movq %%mm0, %%mm1 \n\t"\ | |
| 1127 "movq %%mm0, %%mm2 \n\t"\ | |
| 1128 "psrlq $8, %%mm1 \n\t"\ | |
| 1129 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 1130 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 1131 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 1132 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 1133 "movq %%mm1, %%mm3 \n\t"\ | |
| 1134 "psllq $32, %%mm3 \n\t"\ | |
| 1135 "movq %%mm3, %%mm4 \n\t"\ | |
| 1136 "psubusb %%mm1, %%mm4 \n\t"\ | |
| 1137 "psubb %%mm4, %%mm3 \n\t"\ | |
| 1138 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
| 944 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ | 1139 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5,ü6|) */\ |
| 945 "paddb %%mm5, %%mm1 \n\t"\ | 1140 "paddb %%mm5, %%mm1 \n\t"\ |
| 946 "psubusb %%mm5, %%mm1 \n\t"\ | 1141 "psubusb %%mm5, %%mm1 \n\t"\ |
| 947 "psrlw $2, %%mm1 \n\t"\ | 1142 "psrlw $2, %%mm1 \n\t"\ |
| 948 "pxor %%mm2, %%mm1 \n\t"\ | 1143 "pxor %%mm2, %%mm1 \n\t"\ |
| 952 "psllq $8, %%mm1 \n\t"\ | 1147 "psllq $8, %%mm1 \n\t"\ |
| 953 "paddb %%mm1, %%mm0 \n\t"\ | 1148 "paddb %%mm1, %%mm0 \n\t"\ |
| 954 "movd %%mm0, (%0) \n\t"\ | 1149 "movd %%mm0, (%0) \n\t"\ |
| 955 "psrlq $32, %%mm0 \n\t"\ | 1150 "psrlq $32, %%mm0 \n\t"\ |
| 956 "movd %%mm0, 4(%0) \n\t" | 1151 "movd %%mm0, 4(%0) \n\t" |
| 957 | 1152 #endif |
| 958 HDF(0) | 1153 HDF(0) |
| 959 "addl %1, %0 \n\t" | 1154 "addl %1, %0 \n\t" |
| 960 HDF(8) | 1155 HDF(8) |
| 961 "addl %1, %0 \n\t" | 1156 "addl %1, %0 \n\t" |
| 962 HDF(16) | 1157 HDF(16) |
| 1023 } | 1218 } |
| 1024 | 1219 |
| 1025 /** | 1220 /** |
| 1026 * Do a horizontal low pass filter on the 8x8 block | 1221 * Do a horizontal low pass filter on the 8x8 block |
| 1027 * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | 1222 * useing the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
| 1028 * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2 version) | 1223 * useing approximately the 7-Tap Filter (1,2,3,4,3,2,1)/16 (MMX2/3DNOW version) |
| 1029 */ | 1224 */ |
| 1030 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | 1225 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
| 1031 { | 1226 { |
| 1032 //return; | 1227 //return; |
| 1033 #ifdef HAVE_MMX2 | 1228 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1034 asm volatile( //"movv %0 %1 %2\n\t" | 1229 asm volatile( //"movv %0 %1 %2\n\t" |
| 1035 "pushl %0\n\t" | 1230 "pushl %0\n\t" |
| 1036 "pxor %%mm7, %%mm7 \n\t" | 1231 "pxor %%mm7, %%mm7 \n\t" |
| 1037 "leal tempBlock, %%eax \n\t" | 1232 "leal tempBlock, %%eax \n\t" |
| 1038 | 1233 /* |
| 1039 #define HLP1 "movq (%0), %%mm0 \n\t"\ | 1234 #define HLP1 "movq (%0), %%mm0 \n\t"\ |
| 1040 "movq %%mm0, %%mm1 \n\t"\ | 1235 "movq %%mm0, %%mm1 \n\t"\ |
| 1041 "psllq $8, %%mm0 \n\t"\ | 1236 "psllq $8, %%mm0 \n\t"\ |
| 1042 "pavgb %%mm1, %%mm0 \n\t"\ | 1237 PAVGB(%%mm1, %%mm0)\ |
| 1043 "psrlw $8, %%mm0 \n\t"\ | 1238 "psrlw $8, %%mm0 \n\t"\ |
| 1044 "pxor %%mm1, %%mm1 \n\t"\ | 1239 "pxor %%mm1, %%mm1 \n\t"\ |
| 1045 "packuswb %%mm1, %%mm0 \n\t"\ | 1240 "packuswb %%mm1, %%mm0 \n\t"\ |
| 1046 "movq %%mm0, %%mm1 \n\t"\ | 1241 "movq %%mm0, %%mm1 \n\t"\ |
| 1047 "movq %%mm0, %%mm2 \n\t"\ | 1242 "movq %%mm0, %%mm2 \n\t"\ |
| 1048 "psllq $32, %%mm0 \n\t"\ | 1243 "psllq $32, %%mm0 \n\t"\ |
| 1049 "paddb %%mm0, %%mm1 \n\t"\ | 1244 "paddb %%mm0, %%mm1 \n\t"\ |
| 1050 "psllq $16, %%mm2 \n\t"\ | 1245 "psllq $16, %%mm2 \n\t"\ |
| 1051 "pavgb %%mm2, %%mm0 \n\t"\ | 1246 PAVGB(%%mm2, %%mm0)\ |
| 1052 "movq %%mm0, %%mm3 \n\t"\ | 1247 "movq %%mm0, %%mm3 \n\t"\ |
| 1053 "pand bm11001100, %%mm0 \n\t"\ | 1248 "pand bm11001100, %%mm0 \n\t"\ |
| 1054 "paddusb %%mm0, %%mm3 \n\t"\ | 1249 "paddusb %%mm0, %%mm3 \n\t"\ |
| 1055 "psrlq $8, %%mm3 \n\t"\ | 1250 "psrlq $8, %%mm3 \n\t"\ |
| 1056 "pavgb %%mm1, %%mm4 \n\t"\ | 1251 PAVGB(%%mm1, %%mm4)\ |
| 1057 "pavgb %%mm3, %%mm2 \n\t"\ | 1252 PAVGB(%%mm3, %%mm2)\ |
| 1058 "psrlq $16, %%mm2 \n\t"\ | 1253 "psrlq $16, %%mm2 \n\t"\ |
| 1059 "punpcklbw %%mm2, %%mm2 \n\t"\ | 1254 "punpcklbw %%mm2, %%mm2 \n\t"\ |
| 1060 "movq %%mm2, (%0) \n\t"\ | 1255 "movq %%mm2, (%0) \n\t"\ |
| 1061 | 1256 |
| 1062 #define HLP2 "movq (%0), %%mm0 \n\t"\ | 1257 #define HLP2 "movq (%0), %%mm0 \n\t"\ |
| 1063 "movq %%mm0, %%mm1 \n\t"\ | 1258 "movq %%mm0, %%mm1 \n\t"\ |
| 1064 "psllq $8, %%mm0 \n\t"\ | 1259 "psllq $8, %%mm0 \n\t"\ |
| 1065 "pavgb %%mm1, %%mm0 \n\t"\ | 1260 PAVGB(%%mm1, %%mm0)\ |
| 1066 "psrlw $8, %%mm0 \n\t"\ | 1261 "psrlw $8, %%mm0 \n\t"\ |
| 1067 "pxor %%mm1, %%mm1 \n\t"\ | 1262 "pxor %%mm1, %%mm1 \n\t"\ |
| 1068 "packuswb %%mm1, %%mm0 \n\t"\ | 1263 "packuswb %%mm1, %%mm0 \n\t"\ |
| 1069 "movq %%mm0, %%mm2 \n\t"\ | 1264 "movq %%mm0, %%mm2 \n\t"\ |
| 1070 "psllq $32, %%mm0 \n\t"\ | 1265 "psllq $32, %%mm0 \n\t"\ |
| 1071 "psllq $16, %%mm2 \n\t"\ | 1266 "psllq $16, %%mm2 \n\t"\ |
| 1072 "pavgb %%mm2, %%mm0 \n\t"\ | 1267 PAVGB(%%mm2, %%mm0)\ |
| 1073 "movq %%mm0, %%mm3 \n\t"\ | 1268 "movq %%mm0, %%mm3 \n\t"\ |
| 1074 "pand bm11001100, %%mm0 \n\t"\ | 1269 "pand bm11001100, %%mm0 \n\t"\ |
| 1075 "paddusb %%mm0, %%mm3 \n\t"\ | 1270 "paddusb %%mm0, %%mm3 \n\t"\ |
| 1076 "psrlq $8, %%mm3 \n\t"\ | 1271 "psrlq $8, %%mm3 \n\t"\ |
| 1077 "pavgb %%mm3, %%mm2 \n\t"\ | 1272 PAVGB(%%mm3, %%mm2)\ |
| 1078 "psrlq $16, %%mm2 \n\t"\ | 1273 "psrlq $16, %%mm2 \n\t"\ |
| 1079 "punpcklbw %%mm2, %%mm2 \n\t"\ | 1274 "punpcklbw %%mm2, %%mm2 \n\t"\ |
| 1080 "movq %%mm2, (%0) \n\t"\ | 1275 "movq %%mm2, (%0) \n\t"\ |
| 1081 | 1276 */ |
| 1082 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | 1277 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 |
| 1083 /* | 1278 /* |
| 1084 31 | 1279 31 |
| 1085 121 | 1280 121 |
| 1086 121 | 1281 121 |
| 1098 123433 = | 1293 123433 = |
| 1099 12463 12346 | 1294 12463 12346 |
| 1100 1249 123A | 1295 1249 123A |
| 1101 | 1296 |
| 1102 */ | 1297 */ |
| 1298 #ifdef HAVE_MMX2 | |
| 1103 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | 1299 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ |
| 1104 "movq %%mm0, %%mm1 \n\t"\ | 1300 "movq %%mm0, %%mm1 \n\t"\ |
| 1105 "movq %%mm0, %%mm2 \n\t"\ | 1301 "movq %%mm0, %%mm2 \n\t"\ |
| 1106 "movq %%mm0, %%mm3 \n\t"\ | 1302 "movq %%mm0, %%mm3 \n\t"\ |
| 1107 "movq %%mm0, %%mm4 \n\t"\ | 1303 "movq %%mm0, %%mm4 \n\t"\ |
| 1109 "psrlq $8, %%mm2 \n\t"\ | 1305 "psrlq $8, %%mm2 \n\t"\ |
| 1110 "pand bm00000001, %%mm3 \n\t"\ | 1306 "pand bm00000001, %%mm3 \n\t"\ |
| 1111 "pand bm10000000, %%mm4 \n\t"\ | 1307 "pand bm10000000, %%mm4 \n\t"\ |
| 1112 "por %%mm3, %%mm1 \n\t"\ | 1308 "por %%mm3, %%mm1 \n\t"\ |
| 1113 "por %%mm4, %%mm2 \n\t"\ | 1309 "por %%mm4, %%mm2 \n\t"\ |
| 1114 "pavgb %%mm2, %%mm1 \n\t"\ | 1310 PAVGB(%%mm2, %%mm1)\ |
| 1115 "pavgb %%mm1, %%mm0 \n\t"\ | 1311 PAVGB(%%mm1, %%mm0)\ |
| 1116 \ | 1312 \ |
| 1117 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ | 1313 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ |
| 1118 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ | 1314 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ |
| 1119 "pavgb %%mm3, %%mm4 \n\t"\ | 1315 PAVGB(%%mm3, %%mm4)\ |
| 1120 "pavgb %%mm4, %%mm0 \n\t"\ | 1316 PAVGB(%%mm4, %%mm0)\ |
| 1121 "movd %%mm0, (%0) \n\t"\ | 1317 "movd %%mm0, (%0) \n\t"\ |
| 1122 "psrlq $32, %%mm0 \n\t"\ | 1318 "psrlq $32, %%mm0 \n\t"\ |
| 1123 "movd %%mm0, 4(%0) \n\t"\ | 1319 "movd %%mm0, 4(%0) \n\t" |
| 1320 #else | |
| 1321 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 1322 "movq %%mm0, %%mm1 \n\t"\ | |
| 1323 "movq %%mm0, %%mm2 \n\t"\ | |
| 1324 "movq %%mm0, %%mm3 \n\t"\ | |
| 1325 "movq %%mm0, %%mm4 \n\t"\ | |
| 1326 "psllq $8, %%mm1 \n\t"\ | |
| 1327 "psrlq $8, %%mm2 \n\t"\ | |
| 1328 "pand bm00000001, %%mm3 \n\t"\ | |
| 1329 "pand bm10000000, %%mm4 \n\t"\ | |
| 1330 "por %%mm3, %%mm1 \n\t"\ | |
| 1331 "por %%mm4, %%mm2 \n\t"\ | |
| 1332 PAVGB(%%mm2, %%mm1)\ | |
| 1333 PAVGB(%%mm1, %%mm0)\ | |
| 1334 \ | |
| 1335 "movq %%mm0, %%mm3 \n\t"\ | |
| 1336 "movq %%mm0, %%mm4 \n\t"\ | |
| 1337 "movq %%mm0, %%mm5 \n\t"\ | |
| 1338 "psrlq $16, %%mm3 \n\t"\ | |
| 1339 "psllq $16, %%mm4 \n\t"\ | |
| 1340 "pand bm11000000, %%mm5 \n\t"\ | |
| 1341 "por %%mm5, %%mm3 \n\t"\ | |
| 1342 "movq %%mm0, %%mm5 \n\t"\ | |
| 1343 "pand bm00000011, %%mm5 \n\t"\ | |
| 1344 "por %%mm5, %%mm4 \n\t"\ | |
| 1345 PAVGB(%%mm3, %%mm4)\ | |
| 1346 PAVGB(%%mm4, %%mm0)\ | |
| 1347 "movd %%mm0, (%0) \n\t"\ | |
| 1348 "psrlq $32, %%mm0 \n\t"\ | |
| 1349 "movd %%mm0, 4(%0) \n\t" | |
| 1350 #endif | |
| 1124 | 1351 |
| 1125 #define HLP(i) HLP3(i) | 1352 #define HLP(i) HLP3(i) |
| 1126 | 1353 |
| 1127 HLP(0) | 1354 HLP(0) |
| 1128 "addl %1, %0 \n\t" | 1355 "addl %1, %0 \n\t" |
| 1227 "psrlq $16, %%mm7 \n\t" | 1454 "psrlq $16, %%mm7 \n\t" |
| 1228 "pmaxub %%mm4, %%mm7 \n\t" | 1455 "pmaxub %%mm4, %%mm7 \n\t" |
| 1229 "movq %%mm7, %%mm4 \n\t" | 1456 "movq %%mm7, %%mm4 \n\t" |
| 1230 "psrlq $8, %%mm7 \n\t" | 1457 "psrlq $8, %%mm7 \n\t" |
| 1231 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels | 1458 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels |
| 1232 "pavgb %%mm6, %%mm7 \n\t" // (max + min)/2 | 1459 PAVGB(%%mm6, %%mm7) // (max + min)/2 |
| 1233 | 1460 |
| 1234 | 1461 |
| 1235 : : "r" (src), "r" (stride), "r" (QP) | 1462 : : "r" (src), "r" (stride), "r" (QP) |
| 1236 : "%eax", "%ebx" | 1463 : "%eax", "%ebx" |
| 1237 ); | 1464 ); |
| 1239 | 1466 |
| 1240 //FIXME | 1467 //FIXME |
| 1241 #endif | 1468 #endif |
| 1242 } | 1469 } |
| 1243 | 1470 |
| 1471 | |
| 1472 | |
| 1473 | |
| 1244 /** | 1474 /** |
| 1245 * ... | 1475 * ... |
| 1476 * the mode value is interpreted as a quality value if its negative, its range is then (-1 ... -63) | |
| 1477 * -63 is best quality -1 is worst | |
| 1246 */ | 1478 */ |
| 1247 extern "C"{ | 1479 extern "C"{ |
| 1248 void postprocess(unsigned char * src[], int src_stride, | 1480 void postprocess(unsigned char * src[], int src_stride, |
| 1249 unsigned char * dst[], int dst_stride, | 1481 unsigned char * dst[], int dst_stride, |
| 1250 int horizontal_size, int vertical_size, | 1482 int horizontal_size, int vertical_size, |
| 1251 QP_STORE_T *QP_store, int QP_stride, | 1483 QP_STORE_T *QP_store, int QP_stride, |
| 1252 int mode) | 1484 int mode) |
| 1253 { | 1485 { |
| 1486 | |
| 1487 if(mode<0) mode= getModeForQuality(-mode); | |
| 1488 | |
| 1254 /* | 1489 /* |
| 1255 long long T= rdtsc(); | 1490 long long T= rdtsc(); |
| 1256 for(int y=vertical_size-1; y>=0 ; y--) | 1491 for(int y=vertical_size-1; y>=0 ; y--) |
| 1257 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); | 1492 memcpy(dst[0] + y*src_stride, src[0] + y*src_stride,src_stride); |
| 1258 // memcpy(dst[0], src[0],src_stride*vertical_size); | 1493 // memcpy(dst[0], src[0],src_stride*vertical_size); |
| 1264 long long T= rdtsc(); | 1499 long long T= rdtsc(); |
| 1265 while( (rdtsc() - T)/1000 < 4000); | 1500 while( (rdtsc() - T)/1000 < 4000); |
| 1266 | 1501 |
| 1267 return; | 1502 return; |
| 1268 */ | 1503 */ |
| 1269 postProcess(src[0], src_stride, | 1504 postProcess(src[0], src_stride, dst[0], dst_stride, |
| 1270 dst[0], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, false); | 1505 horizontal_size, vertical_size, QP_store, QP_stride, false, mode); |
| 1271 | 1506 |
| 1272 horizontal_size >>= 1; | 1507 horizontal_size >>= 1; |
| 1273 vertical_size >>= 1; | 1508 vertical_size >>= 1; |
| 1274 src_stride >>= 1; | 1509 src_stride >>= 1; |
| 1275 dst_stride >>= 1; | 1510 dst_stride >>= 1; |
| 1276 | 1511 |
| 1277 if(1) | 1512 if(1) |
| 1278 { | 1513 { |
| 1279 postProcess(src[1], src_stride, | 1514 postProcess(src[1], src_stride, dst[1], dst_stride, |
| 1280 dst[1], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | 1515 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); |
| 1281 postProcess(src[2], src_stride, | 1516 postProcess(src[2], src_stride, dst[2], dst_stride, |
| 1282 dst[2], dst_stride, horizontal_size, vertical_size, QP_store, QP_stride, true); | 1517 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); |
| 1283 } | 1518 } |
| 1284 else | 1519 else |
| 1285 { | 1520 { |
| 1286 memcpy(dst[1], src[1], src_stride*horizontal_size); | 1521 memcpy(dst[1], src[1], src_stride*horizontal_size); |
| 1287 memcpy(dst[2], src[2], src_stride*horizontal_size); | 1522 memcpy(dst[2], src[2], src_stride*horizontal_size); |
| 1288 } | 1523 } |
| 1289 } | 1524 } |
| 1290 } | 1525 /** |
| 1526 * gets the mode flags for a given quality (larger values mean slower but better postprocessing) | |
| 1527 * 0 <= quality < 64 | |
| 1528 */ | |
| 1529 int getModeForQuality(int quality){ | |
| 1530 int modes[6]= { | |
| 1531 LUM_V_DEBLOCK, | |
| 1532 LUM_V_DEBLOCK | LUM_H_DEBLOCK, | |
| 1533 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK, | |
| 1534 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK, | |
| 1535 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING, | |
| 1536 LUM_V_DEBLOCK | LUM_H_DEBLOCK | CHROM_V_DEBLOCK | CHROM_H_DEBLOCK | LUM_DERING | CHROM_DERING | |
| 1537 }; | |
| 1538 | |
| 1539 return modes[ (quality*6) >>6 ]; | |
| 1540 } | |
| 1541 | |
| 1542 } // extern "C" | |
| 1291 | 1543 |
| 1292 /** | 1544 /** |
| 1293 * Copies a block from src to dst and fixes the blacklevel | 1545 * Copies a block from src to dst and fixes the blacklevel |
| 1294 */ | 1546 */ |
| 1295 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) | 1547 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) |
| 1365 | 1617 |
| 1366 /** | 1618 /** |
| 1367 * Filters array of bytes (Y or U or V values) | 1619 * Filters array of bytes (Y or U or V values) |
| 1368 */ | 1620 */ |
| 1369 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 1621 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 1370 QP_STORE_T QPs[], int QPStride, bool isColor) | 1622 QP_STORE_T QPs[], int QPStride, bool isColor, int mode) |
| 1371 { | 1623 { |
| 1372 | 1624 |
| 1373 #ifdef TIMEING | 1625 #ifdef TIMEING |
| 1374 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | 1626 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; |
| 1375 sumTime= rdtsc(); | 1627 sumTime= rdtsc(); |
| 1411 packedYOffset= MAX(black - minAllowedY, 0); | 1663 packedYOffset= MAX(black - minAllowedY, 0); |
| 1412 packedYOffset|= packedYOffset<<32; | 1664 packedYOffset|= packedYOffset<<32; |
| 1413 packedYOffset|= packedYOffset<<16; | 1665 packedYOffset|= packedYOffset<<16; |
| 1414 packedYOffset|= packedYOffset<<8; | 1666 packedYOffset|= packedYOffset<<8; |
| 1415 | 1667 |
| 1416 // uint64_t scale= (int)(256.0*256.0/(white-black) + 0.5); | |
| 1417 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); | 1668 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); |
| 1418 | 1669 |
| 1419 packedYScale= uint16_t(scale*256.0 + 0.5); | 1670 packedYScale= uint16_t(scale*256.0 + 0.5); |
| 1420 packedYScale|= packedYScale<<32; | 1671 packedYScale|= packedYScale<<32; |
| 1421 packedYScale|= packedYScale<<16; | 1672 packedYScale|= packedYScale<<16; |
| 1460 if(y + 12 < height) | 1711 if(y + 12 < height) |
| 1461 { | 1712 { |
| 1462 #ifdef MORE_TIMEING | 1713 #ifdef MORE_TIMEING |
| 1463 T0= rdtsc(); | 1714 T0= rdtsc(); |
| 1464 #endif | 1715 #endif |
| 1716 | |
| 1465 #ifdef HAVE_MMX2 | 1717 #ifdef HAVE_MMX2 |
| 1466 | |
| 1467 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 1718 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); |
| 1468 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 1719 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); |
| 1469 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 1720 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); |
| 1470 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 1721 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); |
| 1722 #elif defined(HAVE_3DNOW) | |
| 1723 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | |
| 1724 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | |
| 1725 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | |
| 1726 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | |
| 1727 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | |
| 1728 */ | |
| 1471 #endif | 1729 #endif |
| 1472 if(!isColor) yHistogram[ srcBlock[0] ]++; | 1730 if(!isColor) yHistogram[ srcBlock[0] ]++; |
| 1473 | 1731 |
| 1474 blockCopy(vertBlock + dstStride*2, dstStride, | 1732 blockCopy(vertBlock + dstStride*2, dstStride, |
| 1475 vertSrcBlock + srcStride*2, srcStride); | 1733 vertSrcBlock + srcStride*2, srcStride); |
| 1478 #ifdef MORE_TIMEING | 1736 #ifdef MORE_TIMEING |
| 1479 T1= rdtsc(); | 1737 T1= rdtsc(); |
| 1480 memcpyTime+= T1-T0; | 1738 memcpyTime+= T1-T0; |
| 1481 T0=T1; | 1739 T0=T1; |
| 1482 #endif | 1740 #endif |
| 1483 | 1741 if(mode & V_DEBLOCK) |
| 1484 if( isVertDC(vertBlock, stride)) | |
| 1485 { | 1742 { |
| 1486 if(isVertMinMaxOk(vertBlock, stride, QP)) | 1743 if(mode & RK_FILTER) |
| 1487 doVertLowPass(vertBlock, stride, QP); | 1744 vertRKFilter(vertBlock, stride, QP); |
| 1745 else if(0) | |
| 1746 vertX1Filter(vertBlock, stride, QP); | |
| 1747 else | |
| 1748 { | |
| 1749 if( isVertDC(vertBlock, stride)) | |
| 1750 { | |
| 1751 if(isVertMinMaxOk(vertBlock, stride, QP)) | |
| 1752 doVertLowPass(vertBlock, stride, QP); | |
| 1753 } | |
| 1754 else | |
| 1755 doVertDefFilter(vertBlock, stride, QP); | |
| 1756 } | |
| 1488 } | 1757 } |
| 1489 else if(x<width) | |
| 1490 doVertDefFilter(vertBlock, stride, QP); | |
| 1491 | |
| 1492 #ifdef MORE_TIMEING | 1758 #ifdef MORE_TIMEING |
| 1493 T1= rdtsc(); | 1759 T1= rdtsc(); |
| 1494 vertTime+= T1-T0; | 1760 vertTime+= T1-T0; |
| 1495 T0=T1; | 1761 T0=T1; |
| 1496 #endif | 1762 #endif |
| 1506 if(x - 8 >= 0 && x<width) | 1772 if(x - 8 >= 0 && x<width) |
| 1507 { | 1773 { |
| 1508 #ifdef MORE_TIMEING | 1774 #ifdef MORE_TIMEING |
| 1509 T0= rdtsc(); | 1775 T0= rdtsc(); |
| 1510 #endif | 1776 #endif |
| 1511 | 1777 if(mode & H_DEBLOCK) |
| 1512 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | |
| 1513 { | 1778 { |
| 1514 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | 1779 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
| 1515 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | 1780 { |
| 1781 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |
| 1782 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |
| 1783 } | |
| 1784 else | |
| 1785 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
| 1516 } | 1786 } |
| 1517 else | |
| 1518 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
| 1519 | |
| 1520 #ifdef MORE_TIMEING | 1787 #ifdef MORE_TIMEING |
| 1521 T1= rdtsc(); | 1788 T1= rdtsc(); |
| 1522 horizTime+= T1-T0; | 1789 horizTime+= T1-T0; |
| 1523 T0=T1; | 1790 T0=T1; |
| 1524 #endif | 1791 #endif |
| 1533 srcBlock+=8; | 1800 srcBlock+=8; |
| 1534 vertBlock+=8; | 1801 vertBlock+=8; |
| 1535 vertSrcBlock+=8; | 1802 vertSrcBlock+=8; |
| 1536 } | 1803 } |
| 1537 } | 1804 } |
| 1538 #ifdef HAVE_MMX | 1805 #ifdef HAVE_3DNOW |
| 1806 asm volatile("femms"); | |
| 1807 #elif defined (HAVE_MMX) | |
| 1539 asm volatile("emms"); | 1808 asm volatile("emms"); |
| 1540 #endif | 1809 #endif |
| 1541 | 1810 |
| 1542 #ifdef TIMEING | 1811 #ifdef TIMEING |
| 1543 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | 1812 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) |
| 1547 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), | 1816 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), |
| 1548 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) | 1817 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) |
| 1549 , black, white); | 1818 , black, white); |
| 1550 #endif | 1819 #endif |
| 1551 } | 1820 } |
| 1821 | |
| 1822 | |
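The mode handling added in this changeset can be driven either with explicit flags or with a negative "quality" value, which postprocess() converts through getModeForQuality(): the expression (quality*6)>>6 maps quality 0..63 to indices 0..5 of the six-entry mode table (for example quality 40 gives (40*6)>>6 = 3, i.e. all four deblocking flags). A minimal caller sketch follows; the postprocess() signature, QP_STORE_T and the mode flags are the ones appearing in the diff above (assumed to be declared in postprocess.h), while the function name, frame size and strides are purely illustrative:

```c
#include "postprocess.h"

/* Hypothetical caller -- filter_frame, the CIF frame size and the strides
   are made up for illustration; only the postprocess() call itself follows
   the interface shown in the diff above. */
void filter_frame(unsigned char *src[3], unsigned char *dst[3],
                  QP_STORE_T *qp_store, int qp_stride)
{
    int width  = 352;                      /* assumed CIF-sized frame */
    int height = 288;
    int src_stride = 352, dst_stride = 352;

    /* Explicit flags: vertical + horizontal luma deblocking only. */
    postprocess(src, src_stride, dst, dst_stride, width, height,
                qp_store, qp_stride, LUM_V_DEBLOCK | LUM_H_DEBLOCK);

    /* Negative mode = quality: -1 is worst, -63 is best.  Internally
       getModeForQuality(63) selects the last, fullest entry of the
       mode table, since (63*6)>>6 == 5. */
    postprocess(src, src_stride, dst, dst_stride, width, height,
                qp_store, qp_stride, -63);
}
```

Calling postprocess() twice as above is only to show both forms of the mode argument; a real caller would pick one.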
