Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 99:4f072fa99ccf libavcodec
fixed a rounding bug thing in the X1 Filter
changed the X1 Filter slightly to make flat blocks look like in the 9tap lpf
minor change to the -pp numbers & added decimal numbers in comments
new experimental horizontal deblocking filter
| author | michael |
|---|---|
| date | Sat, 13 Oct 2001 02:31:15 +0000 |
| parents | eaae16507d9b |
| children | 1d1182345591 |
comparison
equal
deleted
inserted
replaced
| 98:eaae16507d9b | 99:4f072fa99ccf |
|---|---|
| 25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
| 26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a |
| 27 doHorizLowPass E a a* | 27 doHorizLowPass E a a* |
| 28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E ac ac |
| 29 deRing | 29 deRing |
| 30 RKAlgo1 E a a* | 30 Vertical RKAlgo1 E a a* |
| 31 X1 a E E* | 31 Vertical X1 a E E* |
| 32 Horizontal X1 a E E* | |
| 32 | 33 |
| 33 | 34 |
| 34 * i dont have a 3dnow CPU -> its untested | 35 * i dont have a 3dnow CPU -> its untested |
| 35 E = Exact implementation | 36 E = Exact implementation |
| 36 e = allmost exact implementation | 37 e = allmost exact implementation |
| 38 c = checked against the other implementations (-vo md5) | 39 c = checked against the other implementations (-vo md5) |
| 39 */ | 40 */ |
| 40 | 41 |
| 41 /* | 42 /* |
| 42 TODO: | 43 TODO: |
| 43 verify that everything workes as it should | 44 verify that everything workes as it should (how?) |
| 44 reduce the time wasted on the mem transfer | 45 reduce the time wasted on the mem transfer |
| 45 implement dering | 46 implement dering |
| 46 implement everything in C at least (done at the moment but ...) | 47 implement everything in C at least (done at the moment but ...) |
| 47 unroll stuff if instructions depend too much on the prior one | 48 unroll stuff if instructions depend too much on the prior one |
| 48 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 49 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
| 49 move YScale thing to the end instead of fixing QP | 50 move YScale thing to the end instead of fixing QP |
| 50 write a faster and higher quality deblocking filter :) | 51 write a faster and higher quality deblocking filter :) |
| 51 do something about the speed of the horizontal filters | 52 do something about the speed of the horizontal filters |
| 52 make the mainloop more flexible (variable number of blocks at once | 53 make the mainloop more flexible (variable number of blocks at once |
| 53 (the if/else stuff per block is slowing things down) | 54 (the if/else stuff per block is slowing things down) |
| 55 compare the quality & speed of all filters | |
| 56 implement a few simple deinterlacing filters | |
| 57 split this huge file | |
| 54 ... | 58 ... |
| 55 | 59 |
| 56 Notes: | 60 Notes: |
| 57 | 61 |
| 58 */ | 62 */ |
| 59 | 63 |
| 60 /* | 64 /* |
| 61 Changelog: | 65 Changelog: use the CVS log |
| 62 0.1.3 | 66 0.1.3 |
| 63 bugfixes: last 3 lines not brightness/contrast corrected | 67 bugfixes: last 3 lines not brightness/contrast corrected |
| 64 brightness statistics messed up with initial black pic | 68 brightness statistics messed up with initial black pic |
| 65 changed initial values of the brightness statistics | 69 changed initial values of the brightness statistics |
| 66 C++ -> C conversation | 70 C++ -> C conversation |
| 97 static uint64_t bm00001000= 0x00000000FF000000LL; | 101 static uint64_t bm00001000= 0x00000000FF000000LL; |
| 98 static uint64_t bm10000000= 0xFF00000000000000LL; | 102 static uint64_t bm10000000= 0xFF00000000000000LL; |
| 99 static uint64_t bm10000001= 0xFF000000000000FFLL; | 103 static uint64_t bm10000001= 0xFF000000000000FFLL; |
| 100 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; | 104 static uint64_t bm11000011= 0xFFFF00000000FFFFLL; |
| 101 static uint64_t bm00000011= 0x000000000000FFFFLL; | 105 static uint64_t bm00000011= 0x000000000000FFFFLL; |
| 106 static uint64_t bm11111110= 0xFFFFFFFFFFFFFF00LL; | |
| 102 static uint64_t bm11000000= 0xFFFF000000000000LL; | 107 static uint64_t bm11000000= 0xFFFF000000000000LL; |
| 103 static uint64_t bm00011000= 0x000000FFFF000000LL; | 108 static uint64_t bm00011000= 0x000000FFFF000000LL; |
| 104 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; | 109 static uint64_t bm00110011= 0x0000FFFF0000FFFFLL; |
| 105 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; | 110 static uint64_t bm11001100= 0xFFFF0000FFFF0000LL; |
| 106 static uint64_t b00= 0x0000000000000000LL; | 111 static uint64_t b00= 0x0000000000000000LL; |
| 112 static uint64_t b01= 0x0101010101010101LL; | |
| 107 static uint64_t b02= 0x0202020202020202LL; | 113 static uint64_t b02= 0x0202020202020202LL; |
| 108 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; | 114 static uint64_t b0F= 0x0F0F0F0F0F0F0F0FLL; |
| 109 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; | 115 static uint64_t bFF= 0xFFFFFFFFFFFFFFFFLL; |
| 110 static uint64_t b20= 0x2020202020202020LL; | 116 static uint64_t b20= 0x2020202020202020LL; |
| 111 static uint64_t b80= 0x8080808080808080LL; | 117 static uint64_t b80= 0x8080808080808080LL; |
| 542 x = 8 | 548 x = 8 |
| 543 x/2 = 4 | 549 x/2 = 4 |
| 544 x/8 = 1 | 550 x/8 = 1 |
| 545 1 12 12 23 | 551 1 12 12 23 |
| 546 */ | 552 */ |
| 547 static inline void vertRKFilter(uint8_t *src, int stride, int QP) | 553 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
| 548 { | 554 { |
| 549 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 550 // FIXME rounding | 556 // FIXME rounding |
| 551 asm volatile( | 557 asm volatile( |
| 552 "pxor %%mm7, %%mm7 \n\t" // 0 | 558 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 636 #endif | 642 #endif |
| 637 } | 643 } |
| 638 | 644 |
| 639 /** | 645 /** |
| 640 * Experimental Filter 1 | 646 * Experimental Filter 1 |
| 641 * will nor damage linear gradients | 647 * will not damage linear gradients |
| 648 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
| 642 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | 649 * can only smooth blocks at the expected locations (it cant smooth them if they did move) |
| 643 * MMX2 version does correct clipping C version doesnt | 650 * MMX2 version does correct clipping C version doesnt |
| 644 */ | 651 */ |
| 645 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 652 static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
| 646 { | 653 { |
| 673 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | 680 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
| 674 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | 681 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
| 675 "movq %%mm4, %%mm3 \n\t" // d | 682 "movq %%mm4, %%mm3 \n\t" // d |
| 676 "psubusb pQPb, %%mm4 \n\t" | 683 "psubusb pQPb, %%mm4 \n\t" |
| 677 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | 684 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 685 "psubusb b01, %%mm3 \n\t" | |
| 678 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | 686 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
| 679 | 687 |
| 680 PAVGB(%%mm7, %%mm3) // d/2 | 688 PAVGB(%%mm7, %%mm3) // d/2 |
| 689 "movq %%mm3, %%mm1 \n\t" // d/2 | |
| 690 PAVGB(%%mm7, %%mm3) // d/4 | |
| 691 PAVGB(%%mm1, %%mm3) // 3*d/8 | |
| 681 | 692 |
| 682 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 | 693 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
| 683 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 694 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 684 "psubusb %%mm3, %%mm0 \n\t" | 695 "psubusb %%mm3, %%mm0 \n\t" |
| 685 "pxor %%mm2, %%mm0 \n\t" | 696 "pxor %%mm2, %%mm0 \n\t" |
| 689 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 700 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 690 "paddusb %%mm3, %%mm0 \n\t" | 701 "paddusb %%mm3, %%mm0 \n\t" |
| 691 "pxor %%mm2, %%mm0 \n\t" | 702 "pxor %%mm2, %%mm0 \n\t" |
| 692 "movq %%mm0, (%%ebx) \n\t" // line 5 | 703 "movq %%mm0, (%%ebx) \n\t" // line 5 |
| 693 | 704 |
| 694 PAVGB(%%mm7, %%mm3) // d/4 | 705 PAVGB(%%mm7, %%mm1) // d/4 |
| 695 | 706 |
| 696 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | 707 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
| 697 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 | 708 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
| 698 "psubusb %%mm3, %%mm0 \n\t" | 709 "psubusb %%mm1, %%mm0 \n\t" |
| 699 "pxor %%mm2, %%mm0 \n\t" | 710 "pxor %%mm2, %%mm0 \n\t" |
| 700 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 | 711 "movq %%mm0, (%%eax, %1, 2) \n\t" // line 3 |
| 701 | 712 |
| 702 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 | 713 "movq (%%ebx, %1), %%mm0 \n\t" // line 6 |
| 703 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 | 714 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
| 704 "paddusb %%mm3, %%mm0 \n\t" | 715 "paddusb %%mm1, %%mm0 \n\t" |
| 705 "pxor %%mm2, %%mm0 \n\t" | 716 "pxor %%mm2, %%mm0 \n\t" |
| 706 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 | 717 "movq %%mm0, (%%ebx, %1) \n\t" // line 6 |
| 707 | 718 |
| 708 PAVGB(%%mm7, %%mm3) // d/8 | 719 PAVGB(%%mm7, %%mm1) // d/8 |
| 709 | 720 |
| 710 "movq (%%eax, %1), %%mm0 \n\t" // line 2 | 721 "movq (%%eax, %1), %%mm0 \n\t" // line 2 |
| 711 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 | 722 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
| 712 "psubusb %%mm3, %%mm0 \n\t" | 723 "psubusb %%mm1, %%mm0 \n\t" |
| 713 "pxor %%mm2, %%mm0 \n\t" | 724 "pxor %%mm2, %%mm0 \n\t" |
| 714 "movq %%mm0, (%%eax, %1) \n\t" // line 2 | 725 "movq %%mm0, (%%eax, %1) \n\t" // line 2 |
| 715 | 726 |
| 716 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 | 727 "movq (%%ebx, %1, 2), %%mm0 \n\t" // line 7 |
| 717 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 | 728 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
| 718 "paddusb %%mm3, %%mm0 \n\t" | 729 "paddusb %%mm1, %%mm0 \n\t" |
| 719 "pxor %%mm2, %%mm0 \n\t" | 730 "pxor %%mm2, %%mm0 \n\t" |
| 720 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 | 731 "movq %%mm0, (%%ebx, %1, 2) \n\t" // line 7 |
| 721 | 732 |
| 722 : | 733 : |
| 723 : "r" (src), "r" (stride) | 734 : "r" (src), "r" (stride) |
| 737 int x; | 748 int x; |
| 738 for(x=0; x<BLOCK_SIZE; x++) | 749 for(x=0; x<BLOCK_SIZE; x++) |
| 739 { | 750 { |
| 740 int a= src[l3] - src[l4]; | 751 int a= src[l3] - src[l4]; |
| 741 int b= src[l4] - src[l5]; | 752 int b= src[l4] - src[l5]; |
| 742 int c= src[l6] - src[l7]; | 753 int c= src[l5] - src[l6]; |
| 743 | 754 |
| 744 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | 755 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); |
| 745 | 756 |
| 746 if(d < QP) | 757 if(d < QP) |
| 747 { | 758 { |
| 748 int v = d * SIGN(-b); | 759 int v = d * SIGN(-b); |
| 749 | 760 |
| 750 src[l2] +=v/8; | 761 src[l2] +=v/8; |
| 751 src[l3] +=v/4; | 762 src[l3] +=v/4; |
| 752 src[l4] +=v/2; | 763 src[l4] +=3*v/8; |
| 753 src[l5] -=v/2; | 764 src[l5] -=3*v/8; |
| 754 src[l6] -=v/4; | 765 src[l6] -=v/4; |
| 755 src[l7] -=v/8; | 766 src[l7] -=v/8; |
| 756 | 767 |
| 757 } | 768 } |
| 758 src++; | 769 src++; |
| 784 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; | 795 src[l6] = ( 1*v3 + 2*v4 + 3*v5 + 4*v6 + 6*v7)/16; |
| 785 } | 796 } |
| 786 src++; | 797 src++; |
| 787 } | 798 } |
| 788 */ | 799 */ |
| 800 #endif | |
| 801 } | |
| 802 | |
| 803 /** | |
| 804 * Experimental Filter 1 (Horizontal) | |
| 805 * will not damage linear gradients | |
| 806 * Flat blocks should look like they where passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
| 807 * can only smooth blocks at the expected locations (it cant smooth them if they did move) | |
| 808 * MMX2 version does correct clipping C version doesnt | |
| 809 * not identical with the vertical one | |
| 810 */ | |
| 811 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
| 812 { | |
| 813 int y; | |
| 814 static uint64_t *lut= NULL; | |
| 815 if(lut==NULL) | |
| 816 { | |
| 817 int i; | |
| 818 lut= (uint64_t*)memalign(8, 256*8); | |
| 819 for(i=0; i<256; i++) | |
| 820 { | |
| 821 int v= i < 128 ? 2*i : 2*(i-256); | |
| 822 /* | |
| 823 //Simulate 112242211 9-Tap filter | |
| 824 uint64_t a= (v/16) & 0xFF; | |
| 825 uint64_t b= (v/8) & 0xFF; | |
| 826 uint64_t c= (v/4) & 0xFF; | |
| 827 uint64_t d= (3*v/8) & 0xFF; | |
| 828 */ | |
| 829 //Simulate piecewise linear interpolation | |
| 830 uint64_t a= (v/16) & 0xFF; | |
| 831 uint64_t b= (v*3/16) & 0xFF; | |
| 832 uint64_t c= (v*5/16) & 0xFF; | |
| 833 uint64_t d= (7*v/16) & 0xFF; | |
| 834 uint64_t A= (0x100 - a)&0xFF; | |
| 835 uint64_t B= (0x100 - b)&0xFF; | |
| 836 uint64_t C= (0x100 - c)&0xFF; | |
| 837 uint64_t D= (0x100 - c)&0xFF; | |
| 838 | |
| 839 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | | |
| 840 (D<<24) | (C<<16) | (B<<8) | (A); | |
| 841 //lut[i] = (v<<32) | (v<<24); | |
| 842 } | |
| 843 } | |
| 844 | |
| 845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | |
| 846 asm volatile( | |
| 847 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 848 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 849 "leal (%0, %1), %%eax \n\t" | |
| 850 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 851 | |
| 852 "movq b80, %%mm6 \n\t" | |
| 853 "movd %2, %%mm5 \n\t" // QP | |
| 854 "movq %%mm5, %%mm4 \n\t" | |
| 855 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
| 856 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
| 857 "pxor %%mm5, %%mm5 \n\t" // 0 | |
| 858 "psubb %%mm4, %%mm5 \n\t" // -3QP | |
| 859 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP | |
| 860 "psllq $24, %%mm5 \n\t" | |
| 861 | |
| 862 // 0 1 2 3 4 5 6 7 8 9 | |
| 863 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 864 | |
| 865 #define HX1old(a) \ | |
| 866 "movd " #a ", %%mm0 \n\t"\ | |
| 867 "movd 4" #a ", %%mm1 \n\t"\ | |
| 868 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 869 "movq %%mm0, %%mm1 \n\t"\ | |
| 870 "movq %%mm0, %%mm2 \n\t"\ | |
| 871 "psrlq $8, %%mm1 \n\t"\ | |
| 872 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 873 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 874 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 875 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 876 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
| 877 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 878 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 879 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 880 "paddb %%mm5, %%mm1 \n\t"\ | |
| 881 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 882 PAVGB(%%mm7, %%mm1)\ | |
| 883 "pxor %%mm2, %%mm1 \n\t"\ | |
| 884 "psubb %%mm2, %%mm1 \n\t"\ | |
| 885 "psrlq $24, %%mm1 \n\t"\ | |
| 886 "movd %%mm1, %%ecx \n\t"\ | |
| 887 "paddb %%mm6, %%mm0 \n\t"\ | |
| 888 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
| 889 "paddb %%mm6, %%mm0 \n\t"\ | |
| 890 "movq %%mm0, " #a " \n\t"\ | |
| 891 | |
| 892 /* | |
| 893 HX1old((%0)) | |
| 894 HX1old((%%eax)) | |
| 895 HX1old((%%eax, %1)) | |
| 896 HX1old((%%eax, %1, 2)) | |
| 897 HX1old((%0, %1, 4)) | |
| 898 HX1old((%%ebx)) | |
| 899 HX1old((%%ebx, %1)) | |
| 900 HX1old((%%ebx, %1, 2)) | |
| 901 */ | |
| 902 | |
| 903 //FIXME add some comments, its unreadable ... | |
| 904 #define HX1b(a, c, b, d) \ | |
| 905 "movd " #a ", %%mm0 \n\t"\ | |
| 906 "movd 4" #a ", %%mm1 \n\t"\ | |
| 907 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 908 "movd " #b ", %%mm4 \n\t"\ | |
| 909 "movq %%mm0, %%mm1 \n\t"\ | |
| 910 "movq %%mm0, %%mm2 \n\t"\ | |
| 911 "psrlq $8, %%mm1 \n\t"\ | |
| 912 "movd 4" #b ", %%mm3 \n\t"\ | |
| 913 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 914 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 915 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 916 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 917 "punpckldq %%mm3, %%mm4 \n\t"\ | |
| 918 "movq %%mm1, %%mm3 \n\t"\ | |
| 919 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
| 920 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 921 "paddb %%mm6, %%mm0 \n\t"\ | |
| 922 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 923 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 924 "movq %%mm4, %%mm3 \n\t"\ | |
| 925 "paddb %%mm5, %%mm1 \n\t"\ | |
| 926 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 927 "psrlq $8, %%mm3 \n\t"\ | |
| 928 PAVGB(%%mm7, %%mm1)\ | |
| 929 "pxor %%mm2, %%mm1 \n\t"\ | |
| 930 "psubb %%mm2, %%mm1 \n\t"\ | |
| 931 "movq %%mm4, %%mm2 \n\t"\ | |
| 932 "psrlq $24, %%mm1 \n\t"\ | |
| 933 "psubusb %%mm3, %%mm2 \n\t"\ | |
| 934 "movd %%mm1, %%ecx \n\t"\ | |
| 935 "psubusb %%mm4, %%mm3 \n\t"\ | |
| 936 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
| 937 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 938 "paddb %%mm6, %%mm0 \n\t"\ | |
| 939 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 940 "movq %%mm3, %%mm1 \n\t"\ | |
| 941 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\ | |
| 942 "movq %%mm0, " #a " \n\t"\ | |
| 943 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 944 "paddb %%mm6, %%mm4 \n\t"\ | |
| 945 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 946 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 947 "paddb %%mm5, %%mm3 \n\t"\ | |
| 948 "psubusb %%mm5, %%mm3 \n\t"\ | |
| 949 PAVGB(%%mm7, %%mm3)\ | |
| 950 "pxor %%mm2, %%mm3 \n\t"\ | |
| 951 "psubb %%mm2, %%mm3 \n\t"\ | |
| 952 "psrlq $24, %%mm3 \n\t"\ | |
| 953 "movd " #c ", %%mm0 \n\t"\ | |
| 954 "movd 4" #c ", %%mm1 \n\t"\ | |
| 955 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 956 "paddb %%mm6, %%mm0 \n\t"\ | |
| 957 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
| 958 "paddb %%mm6, %%mm0 \n\t"\ | |
| 959 "movq %%mm0, " #c " \n\t"\ | |
| 960 "movd %%mm3, %%ecx \n\t"\ | |
| 961 "movd " #d ", %%mm0 \n\t"\ | |
| 962 "paddsb (%3, %%ecx, 8), %%mm4 \n\t"\ | |
| 963 "movd 4" #d ", %%mm1 \n\t"\ | |
| 964 "paddb %%mm6, %%mm4 \n\t"\ | |
| 965 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 966 "movq %%mm4, " #b " \n\t"\ | |
| 967 "paddb %%mm6, %%mm0 \n\t"\ | |
| 968 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
| 969 "paddb %%mm6, %%mm0 \n\t"\ | |
| 970 "movq %%mm0, " #d " \n\t"\ | |
| 971 | |
| 972 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) | |
| 973 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |
| 974 | |
| 975 | |
| 976 : | |
| 977 : "r" (src), "r" (stride), "r" (QP), "r" (lut) | |
| 978 : "%eax", "%ebx", "%ecx" | |
| 979 ); | |
| 980 #else | |
| 981 | |
| 982 //FIXME (has little in common with the mmx2 version) | |
| 983 for(y=0; y<BLOCK_SIZE; y++) | |
| 984 { | |
| 985 int a= src[1] - src[2]; | |
| 986 int b= src[3] - src[4]; | |
| 987 int c= src[5] - src[6]; | |
| 988 | |
| 989 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
| 990 | |
| 991 if(d < QP) | |
| 992 { | |
| 993 int v = d * SIGN(-b); | |
| 994 | |
| 995 src[1] +=v/8; | |
| 996 src[2] +=v/4; | |
| 997 src[3] +=3*v/8; | |
| 998 src[4] -=3*v/8; | |
| 999 src[5] -=v/4; | |
| 1000 src[6] -=v/8; | |
| 1001 | |
| 1002 } | |
| 1003 src+=stride; | |
| 1004 } | |
| 789 #endif | 1005 #endif |
| 790 } | 1006 } |
| 791 | 1007 |
| 792 | 1008 |
| 793 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 1009 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
| 1636 | 1852 |
| 1637 horizontal_size >>= 1; | 1853 horizontal_size >>= 1; |
| 1638 vertical_size >>= 1; | 1854 vertical_size >>= 1; |
| 1639 src_stride >>= 1; | 1855 src_stride >>= 1; |
| 1640 dst_stride >>= 1; | 1856 dst_stride >>= 1; |
| 1857 mode= ((mode&0xFF)>>4) | (mode&0xFFFFFF00); | |
| 1641 | 1858 |
| 1642 if(1) | 1859 if(1) |
| 1643 { | 1860 { |
| 1644 postProcess(src[1], src_stride, dst[1], dst_stride, | 1861 postProcess(src[1], src_stride, dst[1], dst_stride, |
| 1645 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); | 1862 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
| 1646 postProcess(src[2], src_stride, dst[2], dst_stride, | 1863 postProcess(src[2], src_stride, dst[2], dst_stride, |
| 1647 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode >>4); | 1864 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
| 1648 } | 1865 } |
| 1649 else | 1866 else |
| 1650 { | 1867 { |
| 1651 memcpy(dst[1], src[1], src_stride*horizontal_size); | 1868 memcpy(dst[1], src[1], src_stride*horizontal_size); |
| 1652 memcpy(dst[2], src[2], src_stride*horizontal_size); | 1869 memcpy(dst[2], src[2], src_stride*horizontal_size); |
| 1927 memcpyTime+= T1-T0; | 2144 memcpyTime+= T1-T0; |
| 1928 T0=T1; | 2145 T0=T1; |
| 1929 #endif | 2146 #endif |
| 1930 if(mode & V_DEBLOCK) | 2147 if(mode & V_DEBLOCK) |
| 1931 { | 2148 { |
| 1932 if(mode & RK_FILTER) | 2149 if(mode & V_RK1_FILTER) |
| 1933 vertRKFilter(vertBlock, stride, QP); | 2150 vertRK1Filter(vertBlock, stride, QP); |
| 1934 else if(mode & X1_FILTER) | 2151 else if(mode & V_X1_FILTER) |
| 1935 vertX1Filter(vertBlock, stride, QP); | 2152 vertX1Filter(vertBlock, stride, QP); |
| 1936 else | 2153 else |
| 1937 { | 2154 { |
| 1938 if( isVertDC(vertBlock, stride)) | 2155 if( isVertDC(vertBlock, stride)) |
| 1939 { | 2156 { |
| 1960 #ifdef MORE_TIMEING | 2177 #ifdef MORE_TIMEING |
| 1961 T0= rdtsc(); | 2178 T0= rdtsc(); |
| 1962 #endif | 2179 #endif |
| 1963 if(mode & H_DEBLOCK) | 2180 if(mode & H_DEBLOCK) |
| 1964 { | 2181 { |
| 1965 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | 2182 if(mode & H_X1_FILTER) |
| 2183 horizX1Filter(dstBlock-4, stride, QP); | |
| 2184 else | |
| 1966 { | 2185 { |
| 1967 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | 2186 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) |
| 1968 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | 2187 { |
| 2188 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | |
| 2189 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | |
| 2190 } | |
| 2191 else | |
| 2192 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
| 1969 } | 2193 } |
| 1970 else | |
| 1971 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | |
| 1972 } | 2194 } |
| 1973 #ifdef MORE_TIMEING | 2195 #ifdef MORE_TIMEING |
| 1974 T1= rdtsc(); | 2196 T1= rdtsc(); |
| 1975 horizTime+= T1-T0; | 2197 horizTime+= T1-T0; |
| 1976 T0=T1; | 2198 T0=T1; |
