Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 97:e57b1d38d71f libavcodec
bugfixes: last 3 lines not brightness/contrast corrected
brightness statistics messed up with initial black pic
changed initial values of the brightness statistics
C++ -> C conversion
QP range question solved (very likely 1<=QP<=32 according to arpi)
new experimental vertical deblocking filter
RK filter has 3dNow support now (untested)
| author | michael |
|---|---|
| date | Thu, 11 Oct 2001 22:35:45 +0000 |
| parents | 29ac11dc53d3 |
| children | eaae16507d9b |
comparison
equal
deleted
inserted
replaced
| 96:29ac11dc53d3 | 97:e57b1d38d71f |
|---|---|
| 25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
| 26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a |
| 27 doHorizLowPass E a a* | 27 doHorizLowPass E a a* |
| 28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E ac ac |
| 29 deRing | 29 deRing |
| 30 RKAlgo1 E a a* | |
| 31 X1 a E E* | |
| 32 | |
| 30 | 33 |
| 31 * i dont have a 3dnow CPU -> its untested | 34 * i dont have a 3dnow CPU -> its untested |
| 32 E = Exact implementation | 35 E = Exact implementation |
| 33 e = allmost exact implementation | 36 e = allmost exact implementation |
| 34 a = alternative / approximate impl | 37 a = alternative / approximate impl |
| 39 TODO: | 42 TODO: |
| 40 verify that everything workes as it should | 43 verify that everything workes as it should |
| 41 reduce the time wasted on the mem transfer | 44 reduce the time wasted on the mem transfer |
| 42 implement dering | 45 implement dering |
| 43 implement everything in C at least (done at the moment but ...) | 46 implement everything in C at least (done at the moment but ...) |
| 44 figure range of QP out (assuming <256 for now) | |
| 45 unroll stuff if instructions depend too much on the prior one | 47 unroll stuff if instructions depend too much on the prior one |
| 46 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 48 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
| 47 move YScale thing to the end instead of fixing QP | 49 move YScale thing to the end instead of fixing QP |
| 48 write a faster and higher quality deblocking filter :) | 50 write a faster and higher quality deblocking filter :) |
| 51 do something about the speed of the horizontal filters | |
| 52 make the mainloop more flexible (variable number of blocks at once | |
| 53 (the if/else stuff per block is slowing things down) | |
| 49 ... | 54 ... |
| 50 | 55 |
| 51 Notes: | 56 Notes: |
| 52 | 57 |
| 53 */ | 58 */ |
| 54 | 59 |
| 55 /* | 60 /* |
| 56 Changelog: | 61 Changelog: |
| 62 0.1.3 | |
| 63 bugfixes: last 3 lines not brightness/contrast corrected | |
| 64 brightness statistics messed up with initial black pic | |
| 65 changed initial values of the brightness statistics | |
| 66 C++ -> C conversation | |
| 67 QP range question solved (very likely 1<=QP<=32 according to arpi) | |
| 68 new experimental vertical deblocking filter | |
| 69 RK filter has 3dNow support now (untested) | |
| 57 0.1.2 | 70 0.1.2 |
| 58 fixed a bug in the horizontal default filter | 71 fixed a bug in the horizontal default filter |
| 59 3dnow version of the Horizontal & Vertical Lowpass filters | 72 3dnow version of the Horizontal & Vertical Lowpass filters |
| 60 mmx version of the Horizontal Default filter | 73 mmx version of the Horizontal Default filter |
| 61 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar | 74 mmx2 & C versions of a simple filter described in a paper from ramkishor & karandikar |
| 64 */ | 77 */ |
| 65 | 78 |
| 66 | 79 |
| 67 #include <inttypes.h> | 80 #include <inttypes.h> |
| 68 #include <stdio.h> | 81 #include <stdio.h> |
| 82 #include <stdlib.h> | |
| 69 #include "../config.h" | 83 #include "../config.h" |
| 70 //#undef HAVE_MMX2 | 84 //#undef HAVE_MMX2 |
| 71 //#define HAVE_3DNOW | 85 //#define HAVE_3DNOW |
| 72 //#undef HAVE_MMX | 86 //#undef HAVE_MMX |
| 73 #include "postprocess.h" | 87 #include "postprocess.h" |
| 158 | 172 |
| 159 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 173 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
| 160 /** | 174 /** |
| 161 * Check if the middle 8x8 Block in the given 8x10 block is flat | 175 * Check if the middle 8x8 Block in the given 8x10 block is flat |
| 162 */ | 176 */ |
| 163 static inline bool isVertDC(uint8_t src[], int stride){ | 177 static inline int isVertDC(uint8_t src[], int stride){ |
| 164 // return true; | 178 // return true; |
| 165 int numEq= 0; | 179 int numEq= 0; |
| 180 int y; | |
| 166 src+= stride; // src points to begin of the 8x8 Block | 181 src+= stride; // src points to begin of the 8x8 Block |
| 167 #ifdef HAVE_MMX | 182 #ifdef HAVE_MMX |
| 168 asm volatile( | 183 asm volatile( |
| 169 // "int $3 \n\t" | 184 // "int $3 \n\t" |
| 170 "pushl %1\n\t" | 185 "pushl %1\n\t" |
| 240 // int asmEq= numEq; | 255 // int asmEq= numEq; |
| 241 // numEq=0; | 256 // numEq=0; |
| 242 // uint8_t *temp= src; | 257 // uint8_t *temp= src; |
| 243 | 258 |
| 244 #else | 259 #else |
| 245 for(int y=0; y<BLOCK_SIZE-1; y++) | 260 for(y=0; y<BLOCK_SIZE-1; y++) |
| 246 { | 261 { |
| 247 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; | 262 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; |
| 248 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; | 263 if(((src[1] - src[1+stride] + 1)&0xFFFF) < 3) numEq++; |
| 249 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; | 264 if(((src[2] - src[2+stride] + 1)&0xFFFF) < 3) numEq++; |
| 250 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; | 265 if(((src[3] - src[3+stride] + 1)&0xFFFF) < 3) numEq++; |
| 266 } | 281 } |
| 267 printf("\n"); | 282 printf("\n"); |
| 268 } | 283 } |
| 269 } | 284 } |
| 270 */ | 285 */ |
| 271 return numEq > vFlatnessThreshold; | 286 // for(int i=0; i<numEq/8; i++) src[i]=255; |
| 287 return (numEq > vFlatnessThreshold) ? 1 : 0; | |
| 272 } | 288 } |
| 273 | 289 |
| 274 static inline bool isVertMinMaxOk(uint8_t src[], int stride, int QP) | 290 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) |
| 275 { | 291 { |
| 276 #ifdef HAVE_MMX | 292 #ifdef HAVE_MMX |
| 277 int isOk; | 293 int isOk; |
| 278 asm volatile( | 294 asm volatile( |
| 279 // "int $3 \n\t" | 295 // "int $3 \n\t" |
| 293 // "movd %%mm0, (%1, %2, 4)\n\t" | 309 // "movd %%mm0, (%1, %2, 4)\n\t" |
| 294 "movd %%mm0, %0 \n\t" | 310 "movd %%mm0, %0 \n\t" |
| 295 : "=r" (isOk) | 311 : "=r" (isOk) |
| 296 : "r" (src), "r" (stride) | 312 : "r" (src), "r" (stride) |
| 297 ); | 313 ); |
| 298 return isOk; | 314 return isOk ? 1 : 0; |
| 299 #else | 315 #else |
| 300 | 316 |
| 301 int isOk2= true; | 317 int isOk2= 1; |
| 302 for(int x=0; x<BLOCK_SIZE; x++) | 318 int x; |
| 319 for(x=0; x<BLOCK_SIZE; x++) | |
| 303 { | 320 { |
| 304 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=false; | 321 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; |
| 305 } | 322 } |
| 306 /* if(isOk && !isOk2 || !isOk && isOk2) | 323 /* if(isOk && !isOk2 || !isOk && isOk2) |
| 307 { | 324 { |
| 308 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); | 325 printf("\nasm:%d c:%d QP:%d\n", isOk, isOk2, QP); |
| 309 for(int y=0; y<9; y++) | 326 for(int y=0; y<9; y++) |
| 482 const int l5= stride + l4; | 499 const int l5= stride + l4; |
| 483 const int l6= stride + l5; | 500 const int l6= stride + l5; |
| 484 const int l7= stride + l6; | 501 const int l7= stride + l6; |
| 485 const int l8= stride + l7; | 502 const int l8= stride + l7; |
| 486 const int l9= stride + l8; | 503 const int l9= stride + l8; |
| 487 | 504 int x; |
| 488 for(int x=0; x<BLOCK_SIZE; x++) | 505 for(x=0; x<BLOCK_SIZE; x++) |
| 489 { | 506 { |
| 490 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | 507 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; |
| 491 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | 508 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; |
| 492 | 509 |
| 493 int sums[9]; | 510 int sums[9]; |
| 527 x/8 = 1 | 544 x/8 = 1 |
| 528 1 12 12 23 | 545 1 12 12 23 |
| 529 */ | 546 */ |
| 530 static inline void vertRKFilter(uint8_t *src, int stride, int QP) | 547 static inline void vertRKFilter(uint8_t *src, int stride, int QP) |
| 531 { | 548 { |
| 532 #ifdef HAVE_MMX2 | 549 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 533 // FIXME rounding | 550 // FIXME rounding |
| 534 asm volatile( | 551 asm volatile( |
| 535 "pxor %%mm7, %%mm7 \n\t" // 0 | 552 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 536 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 553 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 537 "leal (%0, %1), %%eax \n\t" | 554 "leal (%0, %1), %%eax \n\t" |
| 547 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | 564 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 |
| 548 "movq (%%ebx), %%mm3 \n\t" // line 5 | 565 "movq (%%ebx), %%mm3 \n\t" // line 5 |
| 549 "movq %%mm2, %%mm4 \n\t" // line 4 | 566 "movq %%mm2, %%mm4 \n\t" // line 4 |
| 550 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | 567 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 |
| 551 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 | 568 "pxor %%mm2, %%mm5 \n\t" // -line 4 - 1 |
| 552 "pavgb %%mm3, %%mm5 \n\t" | 569 PAVGB(%%mm3, %%mm5) |
| 553 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 | 570 "paddb %%mm6, %%mm5 \n\t" // (l5-l4)/2 |
| 554 "psubusb %%mm3, %%mm4 \n\t" | 571 "psubusb %%mm3, %%mm4 \n\t" |
| 555 "psubusb %%mm2, %%mm3 \n\t" | 572 "psubusb %%mm2, %%mm3 \n\t" |
| 556 "por %%mm3, %%mm4 \n\t" // |l4 - l5| | 573 "por %%mm3, %%mm4 \n\t" // |l4 - l5| |
| 557 "psubusb %%mm0, %%mm4 \n\t" | 574 "psubusb %%mm0, %%mm4 \n\t" |
| 598 const int l5= stride + l4; | 615 const int l5= stride + l4; |
| 599 const int l6= stride + l5; | 616 const int l6= stride + l5; |
| 600 const int l7= stride + l6; | 617 const int l7= stride + l6; |
| 601 const int l8= stride + l7; | 618 const int l8= stride + l7; |
| 602 const int l9= stride + l8; | 619 const int l9= stride + l8; |
| 603 for(int x=0; x<BLOCK_SIZE; x++) | 620 int x; |
| 621 for(x=0; x<BLOCK_SIZE; x++) | |
| 604 { | 622 { |
| 605 if(ABS(src[l4]-src[l5]) < QP + QP/4) | 623 if(ABS(src[l4]-src[l5]) < QP + QP/4) |
| 606 { | 624 { |
| 607 int x = src[l5] - src[l4]; | 625 int v = (src[l5] - src[l4]); |
| 608 | 626 |
| 609 src[l3] +=x/8; | 627 src[l3] +=v/8; |
| 610 src[l4] +=x/2; | 628 src[l4] +=v/2; |
| 611 src[l5] -=x/2; | 629 src[l5] -=v/2; |
| 612 src[l6] -=x/8; | 630 src[l6] -=v/8; |
| 631 | |
| 613 } | 632 } |
| 614 src++; | 633 src++; |
| 615 } | 634 } |
| 616 | 635 |
| 617 #endif | 636 #endif |
| 618 } | 637 } |
| 619 | 638 |
| 620 /** | 639 /** |
| 621 * Experimental Filter 1 | 640 * Experimental Filter 1 |
| 641 * will nor damage linear gradients | |
| 642 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
| 643 * MMX2 version does correct clipping C version doesnt | |
| 622 */ | 644 */ |
| 623 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 645 static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
| 624 { | 646 { |
| 625 #ifdef HAVE_MMX2X | 647 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 626 // FIXME | |
| 627 asm volatile( | 648 asm volatile( |
| 649 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 650 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 651 "leal (%0, %1), %%eax \n\t" | |
| 652 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 653 // 0 1 2 3 4 5 6 7 8 9 | |
| 654 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 655 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | |
| 656 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 | |
| 657 "movq %%mm1, %%mm2 \n\t" // line 4 | |
| 658 "psubusb %%mm0, %%mm1 \n\t" | |
| 659 "psubusb %%mm2, %%mm0 \n\t" | |
| 660 "por %%mm1, %%mm0 \n\t" // |l2 - l3| | |
| 661 "movq (%%ebx), %%mm3 \n\t" // line 5 | |
| 662 "movq (%%ebx, %1), %%mm4 \n\t" // line 6 | |
| 663 "movq %%mm3, %%mm5 \n\t" // line 5 | |
| 664 "psubusb %%mm4, %%mm3 \n\t" | |
| 665 "psubusb %%mm5, %%mm4 \n\t" | |
| 666 "por %%mm4, %%mm3 \n\t" // |l5 - l6| | |
| 667 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 | |
| 668 "movq %%mm2, %%mm1 \n\t" // line 4 | |
| 669 "psubusb %%mm5, %%mm2 \n\t" | |
| 670 "movq %%mm2, %%mm4 \n\t" | |
| 671 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | |
| 672 "psubusb %%mm1, %%mm5 \n\t" | |
| 673 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | |
| 674 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | |
| 675 "movq %%mm4, %%mm3 \n\t" // d | |
| 676 "psubusb pQPb, %%mm4 \n\t" | |
| 677 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | |
| 678 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | |
| 679 | |
| 680 PAVGB(%%mm7, %%mm3) // d/2 | |
| 681 | |
| 682 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | |
| 683 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | |
| 684 "psubusb %%mm3, %%mm0 \n\t" | |
| 685 "pxor %%mm2, %%mm0 \n\t" | |
| 686 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 | |
| 687 | |
| 688 "movq (%%ebx), %%mm0 \n\t" // line 5 | |
| 689 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | |
| 690 "paddusb %%mm3, %%mm0 \n\t" | |
| 691 "pxor %%mm2, %%mm0 \n\t" | |
| 692 "movq %%mm0, (%%ebx) \n\t" // line 5 | |
| 693 | |
| 694 PAVGB(%%mm7, %%mm3) // d/4 | |
| 695 | |
| 696 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | |
| 697 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | |
| 698 "psubusb %%mm3, %%mm0 \n\t" | |
| 699 "pxor %%mm2, %%mm0 \n\t" | |
| 700 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 | |
| 701 | |
| 702 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 | |
| 703 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | |
| 704 "paddusb %%mm3, %%mm0 \n\t" | |
| 705 "pxor %%mm2, %%mm0 \n\t" | |
| 706 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 | |
| 707 | |
| 708 PAVGB(%%mm7, %%mm3) // d/8 | |
| 709 | |
| 710 "movq (%%eax, %1), %%mm0 \n\t" // line 2 | |
| 711 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | |
| 712 "psubusb %%mm3, %%mm0 \n\t" | |
| 713 "pxor %%mm2, %%mm0 \n\t" | |
| 714 "movq %%mm0, (%%eax, %1) \n\t" // line 2 | |
| 715 | |
| 716 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 | |
| 717 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | |
| 718 "paddusb %%mm3, %%mm0 \n\t" | |
| 719 "pxor %%mm2, %%mm0 \n\t" | |
| 720 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 | |
| 628 | 721 |
| 629 : | 722 : |
| 630 : "r" (src), "r" (stride) | 723 : "r" (src), "r" (stride) |
| 631 : "%eax", "%ebx" | 724 : "%eax", "%ebx" |
| 632 ); | 725 ); |
| 633 #else | 726 #else |
| 727 | |
| 728 const int l1= stride; | |
| 729 const int l2= stride + l1; | |
| 730 const int l3= stride + l2; | |
| 731 const int l4= stride + l3; | |
| 732 const int l5= stride + l4; | |
| 733 const int l6= stride + l5; | |
| 734 const int l7= stride + l6; | |
| 735 const int l8= stride + l7; | |
| 736 const int l9= stride + l8; | |
| 737 int x; | |
| 738 for(x=0; x<BLOCK_SIZE; x++) | |
| 739 { | |
| 740 int a= src[l3] - src[l4]; | |
| 741 int b= src[l4] - src[l5]; | |
| 742 int c= src[l6] - src[l7]; | |
| 743 | |
| 744 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
| 745 | |
| 746 if(d < QP) | |
| 747 { | |
| 748 int v = d * SIGN(-b); | |
| 749 | |
| 750 src[l2] +=v/8; | |
| 751 src[l3] +=v/4; | |
| 752 src[l4] +=v/2; | |
| 753 src[l5] -=v/2; | |
| 754 src[l6] -=v/4; | |
| 755 src[l7] -=v/8; | |
| 756 | |
| 757 } | |
| 758 src++; | |
| 759 } | |
| 760 /* | |
| 634 const int l1= stride; | 761 const int l1= stride; |
| 635 const int l2= stride + l1; | 762 const int l2= stride + l1; |
| 636 const int l3= stride + l2; | 763 const int l3= stride + l2; |
| 637 const int l4= stride + l3; | 764 const int l4= stride + l3; |
| 638 const int l5= stride + l4; | 765 const int l5= stride + l4; |
| 656 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; | 783 src[l5] = (1*v2 + 2*v3 + 3*v4 + 4*v5 + 3*v6 + 3*v7)/16; |
| 657 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | 784 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; |
| 658 } | 785 } |
| 659 src++; | 786 src++; |
| 660 } | 787 } |
| 661 | 788 */ |
| 662 #endif | 789 #endif |
| 663 } | 790 } |
| 664 | 791 |
| 665 | 792 |
| 666 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 793 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
| 906 const int l5= stride + l4; | 1033 const int l5= stride + l4; |
| 907 const int l6= stride + l5; | 1034 const int l6= stride + l5; |
| 908 const int l7= stride + l6; | 1035 const int l7= stride + l6; |
| 909 const int l8= stride + l7; | 1036 const int l8= stride + l7; |
| 910 // const int l9= stride + l8; | 1037 // const int l9= stride + l8; |
| 911 | 1038 int x; |
| 912 for(int x=0; x<BLOCK_SIZE; x++) | 1039 for(x=0; x<BLOCK_SIZE; x++) |
| 913 { | 1040 { |
| 914 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | 1041 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
| 915 if(ABS(middleEnergy) < 8*QP) | 1042 if(ABS(middleEnergy) < 8*QP) |
| 916 { | 1043 { |
| 917 const int q=(src[l4] - src[l5])/2; | 1044 const int q=(src[l4] - src[l5])/2; |
| 945 | 1072 |
| 946 //FIXME? |255-0| = 1 | 1073 //FIXME? |255-0| = 1 |
| 947 /** | 1074 /** |
| 948 * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. | 1075 * Check if the given 8x8 Block is mostly "flat" and copy the unaliged data into tempBlock. |
| 949 */ | 1076 */ |
| 950 static inline bool isHorizDCAndCopy2Temp(uint8_t src[], int stride) | 1077 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) |
| 951 { | 1078 { |
| 952 // src++; | 1079 // src++; |
| 953 int numEq= 0; | 1080 int numEq= 0; |
| 954 #ifdef HAVE_MMX | 1081 #ifdef HAVE_MMX |
| 955 asm volatile ( | 1082 asm volatile ( |
| 1005 : "%eax" | 1132 : "%eax" |
| 1006 ); | 1133 ); |
| 1007 // printf("%d\n", numEq); | 1134 // printf("%d\n", numEq); |
| 1008 numEq= (256 - (numEq & 0xFF)) &0xFF; | 1135 numEq= (256 - (numEq & 0xFF)) &0xFF; |
| 1009 #else | 1136 #else |
| 1010 for(int y=0; y<BLOCK_SIZE; y++) | 1137 int y; |
| 1138 for(y=0; y<BLOCK_SIZE; y++) | |
| 1011 { | 1139 { |
| 1012 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | 1140 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; |
| 1013 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; | 1141 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; |
| 1014 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | 1142 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; |
| 1015 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | 1143 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; |
| 1042 */ | 1170 */ |
| 1043 // printf("%d\n", numEq); | 1171 // printf("%d\n", numEq); |
| 1044 return numEq > hFlatnessThreshold; | 1172 return numEq > hFlatnessThreshold; |
| 1045 } | 1173 } |
| 1046 | 1174 |
| 1047 static inline bool isHorizMinMaxOk(uint8_t src[], int stride, int QP) | 1175 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) |
| 1048 { | 1176 { |
| 1049 #ifdef MMX_FIXME | 1177 #ifdef MMX_FIXME |
| 1050 FIXME | 1178 FIXME |
| 1051 int isOk; | 1179 int isOk; |
| 1052 asm volatile( | 1180 asm volatile( |
| 1069 : "=r" (isOk) | 1197 : "=r" (isOk) |
| 1070 : "r" (src), "r" (stride) | 1198 : "r" (src), "r" (stride) |
| 1071 ); | 1199 ); |
| 1072 return isOk; | 1200 return isOk; |
| 1073 #else | 1201 #else |
| 1074 if(abs(src[0] - src[7]) > 2*QP) return false; | 1202 if(abs(src[0] - src[7]) > 2*QP) return 0; |
| 1075 | 1203 |
| 1076 return true; | 1204 return 1; |
| 1077 #endif | 1205 #endif |
| 1078 } | 1206 } |
| 1079 | 1207 |
| 1080 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | 1208 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) |
| 1081 { | 1209 { |
| 1171 : "%eax" | 1299 : "%eax" |
| 1172 ); | 1300 ); |
| 1173 #else | 1301 #else |
| 1174 uint8_t *src= tempBlock; | 1302 uint8_t *src= tempBlock; |
| 1175 | 1303 |
| 1176 for(int y=0; y<BLOCK_SIZE; y++) | 1304 int y; |
| 1305 for(y=0; y<BLOCK_SIZE; y++) | |
| 1177 { | 1306 { |
| 1178 dst[0] = src[0]; | 1307 dst[0] = src[0]; |
| 1179 dst[1] = src[1]; | 1308 dst[1] = src[1]; |
| 1180 dst[2] = src[2]; | 1309 dst[2] = src[2]; |
| 1181 dst[3] = src[3]; | 1310 dst[3] = src[3]; |
| 1373 : "%eax", "%ebx" | 1502 : "%eax", "%ebx" |
| 1374 ); | 1503 ); |
| 1375 | 1504 |
| 1376 #else | 1505 #else |
| 1377 uint8_t *temp= tempBlock; | 1506 uint8_t *temp= tempBlock; |
| 1378 for(int y=0; y<BLOCK_SIZE; y++) | 1507 int y; |
| 1508 for(y=0; y<BLOCK_SIZE; y++) | |
| 1379 { | 1509 { |
| 1380 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; | 1510 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; |
| 1381 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | 1511 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; |
| 1382 | 1512 |
| 1383 int sums[9]; | 1513 int sums[9]; |
| 1500 while( (rdtsc() - T)/1000 < 4000); | 1630 while( (rdtsc() - T)/1000 < 4000); |
| 1501 | 1631 |
| 1502 return; | 1632 return; |
| 1503 */ | 1633 */ |
| 1504 postProcess(src[0], src_stride, dst[0], dst_stride, | 1634 postProcess(src[0], src_stride, dst[0], dst_stride, |
| 1505 horizontal_size, vertical_size, QP_store, QP_stride, false, mode); | 1635 horizontal_size, vertical_size, QP_store, QP_stride, 0, mode); |
| 1506 | 1636 |
| 1507 horizontal_size >>= 1; | 1637 horizontal_size >>= 1; |
| 1508 vertical_size >>= 1; | 1638 vertical_size >>= 1; |
| 1509 src_stride >>= 1; | 1639 src_stride >>= 1; |
| 1510 dst_stride >>= 1; | 1640 dst_stride >>= 1; |
| 1511 | 1641 |
| 1512 if(1) | 1642 if(1) |
| 1513 { | 1643 { |
| 1514 postProcess(src[1], src_stride, dst[1], dst_stride, | 1644 postProcess(src[1], src_stride, dst[1], dst_stride, |
| 1515 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); | 1645 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
| 1516 postProcess(src[2], src_stride, dst[2], dst_stride, | 1646 postProcess(src[2], src_stride, dst[2], dst_stride, |
| 1517 horizontal_size, vertical_size, QP_store, QP_stride, true, mode >>4); | 1647 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); |
| 1518 } | 1648 } |
| 1519 else | 1649 else |
| 1520 { | 1650 { |
| 1521 memcpy(dst[1], src[1], src_stride*horizontal_size); | 1651 memcpy(dst[1], src[1], src_stride*horizontal_size); |
| 1522 memcpy(dst[2], src[2], src_stride*horizontal_size); | 1652 memcpy(dst[2], src[2], src_stride*horizontal_size); |
| 1541 | 1671 |
| 1542 } // extern "C" | 1672 } // extern "C" |
| 1543 | 1673 |
| 1544 /** | 1674 /** |
| 1545 * Copies a block from src to dst and fixes the blacklevel | 1675 * Copies a block from src to dst and fixes the blacklevel |
| 1676 * numLines must be a multiple of 4 | |
| 1677 * levelFix == 0 -> dont touch the brighness & contrast | |
| 1546 */ | 1678 */ |
| 1547 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride) | 1679 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 1680 int numLines, int levelFix) | |
| 1548 { | 1681 { |
| 1682 int i; | |
| 1683 if(levelFix) | |
| 1684 { | |
| 1549 #ifdef HAVE_MMX | 1685 #ifdef HAVE_MMX |
| 1550 asm volatile( | 1686 asm volatile( |
| 1687 "movl %4, %%eax \n\t" | |
| 1688 "movl %%eax, temp0\n\t" | |
| 1551 "pushl %0 \n\t" | 1689 "pushl %0 \n\t" |
| 1552 "pushl %1 \n\t" | 1690 "pushl %1 \n\t" |
| 1553 "leal (%2,%2), %%eax \n\t" | 1691 "leal (%2,%2), %%eax \n\t" |
| 1554 "leal (%3,%3), %%ebx \n\t" | 1692 "leal (%3,%3), %%ebx \n\t" |
| 1555 "movq packedYOffset, %%mm2 \n\t" | 1693 "movq packedYOffset, %%mm2 \n\t" |
| 1556 "movq packedYScale, %%mm3 \n\t" | 1694 "movq packedYScale, %%mm3 \n\t" |
| 1557 | |
| 1558 #define SIMPLE_CPY \ | |
| 1559 "movq (%0), %%mm0 \n\t"\ | |
| 1560 "movq (%0,%2), %%mm1 \n\t"\ | |
| 1561 "psubusb %%mm2, %%mm0 \n\t"\ | |
| 1562 "psubusb %%mm2, %%mm1 \n\t"\ | |
| 1563 "movq %%mm0, (%1) \n\t"\ | |
| 1564 "movq %%mm1, (%1, %3) \n\t"\ | |
| 1565 | 1695 |
| 1566 #define SCALED_CPY \ | 1696 #define SCALED_CPY \ |
| 1567 "movq (%0), %%mm0 \n\t"\ | 1697 "movq (%0), %%mm0 \n\t"\ |
| 1568 "movq (%0,%2), %%mm1 \n\t"\ | 1698 "movq (%0,%2), %%mm1 \n\t"\ |
| 1569 "psubusb %%mm2, %%mm0 \n\t"\ | 1699 "psubusb %%mm2, %%mm0 \n\t"\ |
| 1583 "pmulhuw %%mm3, %%mm4 \n\t"\ | 1713 "pmulhuw %%mm3, %%mm4 \n\t"\ |
| 1584 "pmulhuw %%mm3, %%mm5 \n\t"\ | 1714 "pmulhuw %%mm3, %%mm5 \n\t"\ |
| 1585 "packuswb %%mm5, %%mm4 \n\t"\ | 1715 "packuswb %%mm5, %%mm4 \n\t"\ |
| 1586 "movq %%mm4, (%1, %3) \n\t"\ | 1716 "movq %%mm4, (%1, %3) \n\t"\ |
| 1587 | 1717 |
| 1588 | 1718 "1: \n\t" |
| 1589 #define CPY SCALED_CPY | 1719 SCALED_CPY |
| 1590 //#define CPY SIMPLE_CPY | |
| 1591 // "prefetchnta 8(%0)\n\t" | |
| 1592 CPY | |
| 1593 "addl %%eax, %0 \n\t" | 1720 "addl %%eax, %0 \n\t" |
| 1594 "addl %%ebx, %1 \n\t" | 1721 "addl %%ebx, %1 \n\t" |
| 1595 CPY | 1722 SCALED_CPY |
| 1596 "addl %%eax, %0 \n\t" | 1723 "addl %%eax, %0 \n\t" |
| 1597 "addl %%ebx, %1 \n\t" | 1724 "addl %%ebx, %1 \n\t" |
| 1598 CPY | 1725 "decl temp0 \n\t" |
| 1599 "addl %%eax, %0 \n\t" | 1726 "jnz 1b \n\t" |
| 1600 "addl %%ebx, %1 \n\t" | 1727 |
| 1601 CPY | |
| 1602 "popl %1 \n\t" | 1728 "popl %1 \n\t" |
| 1603 "popl %0 \n\t" | 1729 "popl %0 \n\t" |
| 1604 : : "r" (src), | 1730 : : "r" (src), |
| 1605 "r" (dst), | 1731 "r" (dst), |
| 1606 "r" (srcStride), | 1732 "r" (srcStride), |
| 1607 "r" (dstStride) | 1733 "r" (dstStride), |
| 1734 "m" (numLines>>2) | |
| 1608 : "%eax", "%ebx" | 1735 : "%eax", "%ebx" |
| 1609 ); | 1736 ); |
| 1610 #else | 1737 #else |
| 1611 for(int i=0; i<BLOCK_SIZE; i++) // last 10x8 Block is copied allready so +2 | 1738 for(i=0; i<numLines; i++) |
| 1612 memcpy( &(dst[dstStride*i]), | 1739 memcpy( &(dst[dstStride*i]), |
| 1613 &(src[srcStride*i]), BLOCK_SIZE); | 1740 &(src[srcStride*i]), BLOCK_SIZE); |
| 1614 #endif | 1741 #endif |
| 1742 } | |
| 1743 else | |
| 1744 { | |
| 1745 #ifdef HAVE_MMX | |
| 1746 asm volatile( | |
| 1747 "movl %4, %%eax \n\t" | |
| 1748 "movl %%eax, temp0\n\t" | |
| 1749 "pushl %0 \n\t" | |
| 1750 "pushl %1 \n\t" | |
| 1751 "leal (%2,%2), %%eax \n\t" | |
| 1752 "leal (%3,%3), %%ebx \n\t" | |
| 1753 "movq packedYOffset, %%mm2 \n\t" | |
| 1754 "movq packedYScale, %%mm3 \n\t" | |
| 1755 | |
| 1756 #define SIMPLE_CPY \ | |
| 1757 "movq (%0), %%mm0 \n\t"\ | |
| 1758 "movq (%0,%2), %%mm1 \n\t"\ | |
| 1759 "movq %%mm0, (%1) \n\t"\ | |
| 1760 "movq %%mm1, (%1, %3) \n\t"\ | |
| 1761 | |
| 1762 "1: \n\t" | |
| 1763 SIMPLE_CPY | |
| 1764 "addl %%eax, %0 \n\t" | |
| 1765 "addl %%ebx, %1 \n\t" | |
| 1766 SIMPLE_CPY | |
| 1767 "addl %%eax, %0 \n\t" | |
| 1768 "addl %%ebx, %1 \n\t" | |
| 1769 "decl temp0 \n\t" | |
| 1770 "jnz 1b \n\t" | |
| 1771 | |
| 1772 "popl %1 \n\t" | |
| 1773 "popl %0 \n\t" | |
| 1774 : : "r" (src), | |
| 1775 "r" (dst), | |
| 1776 "r" (srcStride), | |
| 1777 "r" (dstStride), | |
| 1778 "m" (numLines>>2) | |
| 1779 : "%eax", "%ebx" | |
| 1780 ); | |
| 1781 #else | |
| 1782 for(i=0; i<numLines; i++) | |
| 1783 memcpy( &(dst[dstStride*i]), | |
| 1784 &(src[srcStride*i]), BLOCK_SIZE); | |
| 1785 #endif | |
| 1786 } | |
| 1615 } | 1787 } |
| 1616 | 1788 |
| 1617 | 1789 |
| 1618 /** | 1790 /** |
| 1619 * Filters array of bytes (Y or U or V values) | 1791 * Filters array of bytes (Y or U or V values) |
| 1620 */ | 1792 */ |
| 1621 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 1793 void postProcess(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 1622 QP_STORE_T QPs[], int QPStride, bool isColor, int mode) | 1794 QP_STORE_T QPs[], int QPStride, int isColor, int mode) |
| 1623 { | 1795 { |
| 1796 int x,y; | |
| 1797 /* we need 64bit here otherwise we'll going to have a problem | |
| 1798 after watching a black picture for 5 hours*/ | |
| 1799 static uint64_t *yHistogram= NULL; | |
| 1800 int black=0, white=255; // blackest black and whitest white in the picture | |
| 1624 | 1801 |
| 1625 #ifdef TIMEING | 1802 #ifdef TIMEING |
| 1626 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | 1803 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; |
| 1627 sumTime= rdtsc(); | 1804 sumTime= rdtsc(); |
| 1628 #endif | 1805 #endif |
| 1629 | 1806 |
| 1630 /* we need 64bit here otherwise we'll going to have a problem | |
| 1631 after watching a black picture for 5 hours*/ | |
| 1632 static uint64_t *yHistogram= NULL; | |
| 1633 if(!yHistogram) | 1807 if(!yHistogram) |
| 1634 { | 1808 { |
| 1635 yHistogram= new uint64_t[256]; | 1809 int i; |
| 1636 for(int i=0; i<256; i++) yHistogram[i]= width*height/64/256; | 1810 yHistogram= (uint64_t*)malloc(8*256); |
| 1811 for(i=0; i<256; i++) yHistogram[i]= width*height/64*15/256; | |
| 1637 } | 1812 } |
| 1638 | 1813 |
| 1639 int black=0, white=255; // blackest black and whitest white in the picture | |
| 1640 if(!isColor) | 1814 if(!isColor) |
| 1641 { | 1815 { |
| 1642 uint64_t sum= 0; | 1816 uint64_t sum= 0; |
| 1643 for(int i=0; i<256; i++) | 1817 int i; |
| 1818 static int framenum= -1; | |
| 1819 uint64_t maxClipped; | |
| 1820 uint64_t clipped; | |
| 1821 double scale; | |
| 1822 | |
| 1823 framenum++; | |
| 1824 if(framenum == 1) yHistogram[0]= width*height/64*15/256; | |
| 1825 | |
| 1826 for(i=0; i<256; i++) | |
| 1827 { | |
| 1644 sum+= yHistogram[i]; | 1828 sum+= yHistogram[i]; |
| 1645 | 1829 // printf("%d ", yHistogram[i]); |
| 1646 uint64_t maxClipped= (uint64_t)(sum * maxClippedThreshold); | 1830 } |
| 1647 | 1831 // printf("\n\n"); |
| 1648 uint64_t clipped= sum; | 1832 |
| 1833 /* we allways get a completly black picture first */ | |
| 1834 | |
| 1835 maxClipped= (uint64_t)(sum * maxClippedThreshold); | |
| 1836 | |
| 1837 clipped= sum; | |
| 1649 for(black=255; black>0; black--) | 1838 for(black=255; black>0; black--) |
| 1650 { | 1839 { |
| 1651 if(clipped < maxClipped) break; | 1840 if(clipped < maxClipped) break; |
| 1652 clipped-= yHistogram[black]; | 1841 clipped-= yHistogram[black]; |
| 1653 } | 1842 } |
| 1663 packedYOffset= MAX(black - minAllowedY, 0); | 1852 packedYOffset= MAX(black - minAllowedY, 0); |
| 1664 packedYOffset|= packedYOffset<<32; | 1853 packedYOffset|= packedYOffset<<32; |
| 1665 packedYOffset|= packedYOffset<<16; | 1854 packedYOffset|= packedYOffset<<16; |
| 1666 packedYOffset|= packedYOffset<<8; | 1855 packedYOffset|= packedYOffset<<8; |
| 1667 | 1856 |
| 1668 double scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); | 1857 scale= (double)(maxAllowedY - minAllowedY) / (double)(white-black); |
| 1669 | 1858 |
| 1670 packedYScale= uint16_t(scale*256.0 + 0.5); | 1859 packedYScale= (uint16_t)(scale*256.0 + 0.5); |
| 1671 packedYScale|= packedYScale<<32; | 1860 packedYScale|= packedYScale<<32; |
| 1672 packedYScale|= packedYScale<<16; | 1861 packedYScale|= packedYScale<<16; |
| 1673 } | 1862 } |
| 1674 else | 1863 else |
| 1675 { | 1864 { |
| 1676 packedYScale= 0x0100010001000100LL; | 1865 packedYScale= 0x0100010001000100LL; |
| 1677 packedYOffset= 0; | 1866 packedYOffset= 0; |
| 1678 } | 1867 } |
| 1679 | 1868 |
| 1680 for(int x=0; x<width; x+=BLOCK_SIZE) | 1869 for(x=0; x<width; x+=BLOCK_SIZE) |
| 1681 blockCopy(dst + x, dstStride, src + x, srcStride); | 1870 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); |
| 1682 | 1871 |
| 1683 for(int y=0; y<height; y+=BLOCK_SIZE) | 1872 for(y=0; y<height; y+=BLOCK_SIZE) |
| 1684 { | 1873 { |
| 1685 //1% speedup if these are here instead of the inner loop | 1874 //1% speedup if these are here instead of the inner loop |
| 1686 uint8_t *srcBlock= &(src[y*srcStride]); | 1875 uint8_t *srcBlock= &(src[y*srcStride]); |
| 1687 uint8_t *dstBlock= &(dst[y*dstStride]); | 1876 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 1688 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start | 1877 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start |
| 1689 uint8_t *vertBlock= &(dstBlock[dstStride*3]); | 1878 uint8_t *vertBlock= &(dstBlock[dstStride*3]); |
| 1690 | 1879 |
| 1691 // finish 1 block before the next otherwise we'll might have a problem | 1880 // finish 1 block before the next otherwise we'll might have a problem |
| 1692 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 1881 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
| 1693 for(int x=0; x<width; x+=BLOCK_SIZE) | 1882 for(x=0; x<width; x+=BLOCK_SIZE) |
| 1694 { | 1883 { |
| 1884 const int stride= dstStride; | |
| 1695 int QP= isColor ? | 1885 int QP= isColor ? |
| 1696 QPs[(y>>3)*QPStride + (x>>3)]: | 1886 QPs[(y>>3)*QPStride + (x>>3)]: |
| 1697 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; | 1887 (QPs[(y>>4)*QPStride + (x>>4)] * (packedYScale &0xFFFF))>>8; |
| 1698 #ifdef HAVE_MMX | 1888 #ifdef HAVE_MMX |
| 1699 asm volatile( | 1889 asm volatile( |
| 1705 : : "r" (QP) | 1895 : : "r" (QP) |
| 1706 ); | 1896 ); |
| 1707 #endif | 1897 #endif |
| 1708 | 1898 |
| 1709 | 1899 |
| 1710 const int stride= dstStride; | |
| 1711 if(y + 12 < height) | 1900 if(y + 12 < height) |
| 1712 { | 1901 { |
| 1713 #ifdef MORE_TIMEING | 1902 #ifdef MORE_TIMEING |
| 1714 T0= rdtsc(); | 1903 T0= rdtsc(); |
| 1715 #endif | 1904 #endif |
| 1728 */ | 1917 */ |
| 1729 #endif | 1918 #endif |
| 1730 if(!isColor) yHistogram[ srcBlock[0] ]++; | 1919 if(!isColor) yHistogram[ srcBlock[0] ]++; |
| 1731 | 1920 |
| 1732 blockCopy(vertBlock + dstStride*2, dstStride, | 1921 blockCopy(vertBlock + dstStride*2, dstStride, |
| 1733 vertSrcBlock + srcStride*2, srcStride); | 1922 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); |
| 1734 | 1923 |
| 1735 | 1924 |
| 1736 #ifdef MORE_TIMEING | 1925 #ifdef MORE_TIMEING |
| 1737 T1= rdtsc(); | 1926 T1= rdtsc(); |
| 1738 memcpyTime+= T1-T0; | 1927 memcpyTime+= T1-T0; |
| 1740 #endif | 1929 #endif |
| 1741 if(mode & V_DEBLOCK) | 1930 if(mode & V_DEBLOCK) |
| 1742 { | 1931 { |
| 1743 if(mode & RK_FILTER) | 1932 if(mode & RK_FILTER) |
| 1744 vertRKFilter(vertBlock, stride, QP); | 1933 vertRKFilter(vertBlock, stride, QP); |
| 1745 else if(0) | 1934 else if(mode & X1_FILTER) |
| 1746 vertX1Filter(vertBlock, stride, QP); | 1935 vertX1Filter(vertBlock, stride, QP); |
| 1747 else | 1936 else |
| 1748 { | 1937 { |
| 1749 if( isVertDC(vertBlock, stride)) | 1938 if( isVertDC(vertBlock, stride)) |
| 1750 { | 1939 { |
| 1760 vertTime+= T1-T0; | 1949 vertTime+= T1-T0; |
| 1761 T0=T1; | 1950 T0=T1; |
| 1762 #endif | 1951 #endif |
| 1763 } | 1952 } |
| 1764 else | 1953 else |
| 1765 { | 1954 blockCopy(vertBlock + dstStride*1, dstStride, |
| 1766 for(int i=2; i<BLOCK_SIZE/2+1; i++) // last 10x8 Block is copied allready so +2 | 1955 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); |
| 1767 memcpy( &(vertBlock[dstStride*i]), | 1956 |
| 1768 &(vertSrcBlock[srcStride*i]), BLOCK_SIZE); | |
| 1769 | |
| 1770 } | |
| 1771 | 1957 |
| 1772 if(x - 8 >= 0 && x<width) | 1958 if(x - 8 >= 0 && x<width) |
| 1773 { | 1959 { |
| 1774 #ifdef MORE_TIMEING | 1960 #ifdef MORE_TIMEING |
| 1775 T0= rdtsc(); | 1961 T0= rdtsc(); |
| 1811 #ifdef TIMEING | 1997 #ifdef TIMEING |
| 1812 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) | 1998 // FIXME diff is mostly the time spent for rdtsc (should subtract that but ...) |
| 1813 sumTime= rdtsc() - sumTime; | 1999 sumTime= rdtsc() - sumTime; |
| 1814 if(!isColor) | 2000 if(!isColor) |
| 1815 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", | 2001 printf("cpy:%4dk, vert:%4dk, horiz:%4dk, sum:%4dk, diff:%4dk, color: %d/%d \r", |
| 1816 int(memcpyTime/1000), int(vertTime/1000), int(horizTime/1000), | 2002 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), |
| 1817 int(sumTime/1000), int((sumTime-memcpyTime-vertTime-horizTime)/1000) | 2003 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) |
| 1818 , black, white); | 2004 , black, white); |
| 1819 #endif | 2005 #endif |
| 1820 } | 2006 } |
| 1821 | 2007 |
| 1822 | 2008 |
