comparison libpostproc/postprocess_template.c @ 164:dedb3aef2bee libavcodec
cleanup
precopy fewer lines from src to dst if possible
speedup (due to cleanup of blockcopy)
| author | michael |
|---|---|
| date | Tue, 20 Nov 2001 17:47:52 +0000 |
| parents | 32e7f17a04a7 |
| children | ea3b49451497 |
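In short, this changeset replaces the fixed pre-copy of 8 lines below the current block with a per-mode `copyAhead` value, so postProcess() only pre-copies (and prefetches) the lines that the enabled filters will actually read. Below is a minimal C sketch of that selection, mirroring the hunk added inside postProcess() in the comparison further down; `selectCopyAhead` is a hypothetical helper name, and the mode-flag macros are the ones the file already uses (assumed to be defined by the surrounding headers):

```c
/* Sketch of the copy-ahead selection introduced by this changeset.  Each filter
 * reads a certain number of lines below the top of the current 8x8 block, so
 * only that many lines need to be pre-copied from src to dst instead of a
 * fixed amount.  Mirrors the logic added inside postProcess(). */
static int selectCopyAhead(int mode)
{
    int copyAhead;

    if     (mode & CUBIC_IPOL_DEINT_FILTER)   copyAhead = 16;
    else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead = 14;
    else if(  (mode & V_DEBLOCK)
           || (mode & LINEAR_IPOL_DEINT_FILTER)
           || (mode & MEDIAN_DEINT_FILTER))   copyAhead = 13;
    else if(mode & V_X1_FILTER)               copyAhead = 11;
    else if(mode & V_RK1_FILTER)              copyAhead = 10;
    else if(mode & DERING)                    copyAhead =  9;
    else                                      copyAhead =  8;

    copyAhead -= 8; /* as in the hunk: the result is what blockCopy() and the
                       MMX2 prefetch code add to the block pointers */
    return copyAhead;
}
```

The resulting value is then used for the blockCopy() source/destination offsets, the MMX2 prefetch distance (the new "addl %5, %%eax" operand), and the edge handling near the bottom of the picture.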
comparison
| 163:32e7f17a04a7 | 164:dedb3aef2bee |
|---|---|
| 60 split this huge file | 60 split this huge file |
| 61 border remover | 61 border remover |
| 62 optimize c versions | 62 optimize c versions |
| 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
| 64 smart blur | 64 smart blur |
| 65 commandline option for the deblock thresholds | |
| 65 ... | 66 ... |
| 66 */ | 67 */ |
| 67 | 68 |
| 68 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
| 69 | 70 |
| 856 } | 857 } |
| 857 */ | 858 */ |
| 858 #endif | 859 #endif |
| 859 } | 860 } |
| 860 | 861 |
| 861 /** | |
| 862 * Experimental Filter 1 (Horizontal) | |
| 863 * will not damage linear gradients | |
| 864 * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter | |
| 865 * can only smooth blocks at the expected locations (it can't smooth them if they did move) | |
| 866 * MMX2 version does correct clipping, C version doesn't | |
| 867 * not identical with the vertical one | |
| 868 */ | |
| 869 static inline void horizX1Filter(uint8_t *src, int stride, int QP) | |
| 870 { | |
| 871 int y; | |
| 872 static uint64_t *lut= NULL; | |
| 873 if(lut==NULL) | |
| 874 { | |
| 875 int i; | |
| 876 lut= (uint64_t*)memalign(8, 256*8); | |
| 877 for(i=0; i<256; i++) | |
| 878 { | |
| 879 int v= i < 128 ? 2*i : 2*(i-256); | |
| 880 /* | |
| 881 //Simulate 112242211 9-Tap filter | |
| 882 uint64_t a= (v/16) & 0xFF; | |
| 883 uint64_t b= (v/8) & 0xFF; | |
| 884 uint64_t c= (v/4) & 0xFF; | |
| 885 uint64_t d= (3*v/8) & 0xFF; | |
| 886 */ | |
| 887 //Simulate piecewise linear interpolation | |
| 888 uint64_t a= (v/16) & 0xFF; | |
| 889 uint64_t b= (v*3/16) & 0xFF; | |
| 890 uint64_t c= (v*5/16) & 0xFF; | |
| 891 uint64_t d= (7*v/16) & 0xFF; | |
| 892 uint64_t A= (0x100 - a)&0xFF; | |
| 893 uint64_t B= (0x100 - b)&0xFF; | |
| 894 uint64_t C= (0x100 - c)&0xFF; | |
| 895 uint64_t D= (0x100 - c)&0xFF; | |
| 896 | |
| 897 lut[i] = (a<<56) | (b<<48) | (c<<40) | (d<<32) | | |
| 898 (D<<24) | (C<<16) | (B<<8) | (A); | |
| 899 //lut[i] = (v<<32) | (v<<24); | |
| 900 } | |
| 901 } | |
| 902 | |
| 903 #if 0 | |
| 904 asm volatile( | |
| 905 "pxor %%mm7, %%mm7 \n\t" // 0 | |
| 906 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | |
| 907 "leal (%0, %1), %%eax \n\t" | |
| 908 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 909 | |
| 910 "movq b80, %%mm6 \n\t" | |
| 911 "movd pQPb, %%mm5 \n\t" // QP | |
| 912 "movq %%mm5, %%mm4 \n\t" | |
| 913 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
| 914 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
| 915 "pxor %%mm5, %%mm5 \n\t" // 0 | |
| 916 "psubb %%mm4, %%mm5 \n\t" // -3QP | |
| 917 "por bm11111110, %%mm5 \n\t" // ...,FF,FF,-3QP | |
| 918 "psllq $24, %%mm5 \n\t" | |
| 919 | |
| 920 // 0 1 2 3 4 5 6 7 8 9 | |
| 921 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 922 | |
| 923 #define HX1old(a) \ | |
| 924 "movd " #a ", %%mm0 \n\t"\ | |
| 925 "movd 4" #a ", %%mm1 \n\t"\ | |
| 926 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 927 "movq %%mm0, %%mm1 \n\t"\ | |
| 928 "movq %%mm0, %%mm2 \n\t"\ | |
| 929 "psrlq $8, %%mm1 \n\t"\ | |
| 930 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 931 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 932 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 933 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 934 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
| 935 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 936 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 937 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 938 "paddb %%mm5, %%mm1 \n\t"\ | |
| 939 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 940 PAVGB(%%mm7, %%mm1)\ | |
| 941 "pxor %%mm2, %%mm1 \n\t"\ | |
| 942 "psubb %%mm2, %%mm1 \n\t"\ | |
| 943 "psrlq $24, %%mm1 \n\t"\ | |
| 944 "movd %%mm1, %%ecx \n\t"\ | |
| 945 "paddb %%mm6, %%mm0 \n\t"\ | |
| 946 "paddsb (%3, %%ecx, 8), %%mm0 \n\t"\ | |
| 947 "paddb %%mm6, %%mm0 \n\t"\ | |
| 948 "movq %%mm0, " #a " \n\t"\ | |
| 949 | |
| 950 /* | |
| 951 HX1old((%0)) | |
| 952 HX1old((%%eax)) | |
| 953 HX1old((%%eax, %1)) | |
| 954 HX1old((%%eax, %1, 2)) | |
| 955 HX1old((%0, %1, 4)) | |
| 956 HX1old((%%ebx)) | |
| 957 HX1old((%%ebx, %1)) | |
| 958 HX1old((%%ebx, %1, 2)) | |
| 959 */ | |
| 960 | |
| 961 //FIXME add some comments, it's unreadable ... | |
| 962 #define HX1b(a, c, b, d) \ | |
| 963 "movd " #a ", %%mm0 \n\t"\ | |
| 964 "movd 4" #a ", %%mm1 \n\t"\ | |
| 965 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 966 "movd " #b ", %%mm4 \n\t"\ | |
| 967 "movq %%mm0, %%mm1 \n\t"\ | |
| 968 "movq %%mm0, %%mm2 \n\t"\ | |
| 969 "psrlq $8, %%mm1 \n\t"\ | |
| 970 "movd 4" #b ", %%mm3 \n\t"\ | |
| 971 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 972 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 973 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 974 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 975 "punpckldq %%mm3, %%mm4 \n\t"\ | |
| 976 "movq %%mm1, %%mm3 \n\t"\ | |
| 977 "psllq $32, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
| 978 PAVGB(%%mm1, %%mm3) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 979 "paddb %%mm6, %%mm0 \n\t"\ | |
| 980 "psrlq $16, %%mm3 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 981 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 982 "movq %%mm4, %%mm3 \n\t"\ | |
| 983 "paddb %%mm5, %%mm1 \n\t"\ | |
| 984 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 985 "psrlq $8, %%mm3 \n\t"\ | |
| 986 PAVGB(%%mm7, %%mm1)\ | |
| 987 "pxor %%mm2, %%mm1 \n\t"\ | |
| 988 "psubb %%mm2, %%mm1 \n\t"\ | |
| 989 "movq %%mm4, %%mm2 \n\t"\ | |
| 990 "psrlq $24, %%mm1 \n\t"\ | |
| 991 "psubusb %%mm3, %%mm2 \n\t"\ | |
| 992 "movd %%mm1, %%ecx \n\t"\ | |
| 993 "psubusb %%mm4, %%mm3 \n\t"\ | |
| 994 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
| 995 "por %%mm2, %%mm3 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 996 "paddb %%mm6, %%mm0 \n\t"\ | |
| 997 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 998 "movq %%mm3, %%mm1 \n\t"\ | |
| 999 "psllq $32, %%mm1 \n\t" /* p´5 = |p1 - p2| */\ | |
| 1000 "movq %%mm0, " #a " \n\t"\ | |
| 1001 PAVGB(%%mm3, %%mm1) /* p´5 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 1002 "paddb %%mm6, %%mm4 \n\t"\ | |
| 1003 "psrlq $16, %%mm1 \n\t" /* p´3 = (|p2-p1| + |p6-p5|)/2 */\ | |
| 1004 "psubusb %%mm1, %%mm3 \n\t" /* |p3-p4|-(|p2-p1| + |p6-p5|)/2 */\ | |
| 1005 "paddb %%mm5, %%mm3 \n\t"\ | |
| 1006 "psubusb %%mm5, %%mm3 \n\t"\ | |
| 1007 PAVGB(%%mm7, %%mm3)\ | |
| 1008 "pxor %%mm2, %%mm3 \n\t"\ | |
| 1009 "psubb %%mm2, %%mm3 \n\t"\ | |
| 1010 "psrlq $24, %%mm3 \n\t"\ | |
| 1011 "movd " #c ", %%mm0 \n\t"\ | |
| 1012 "movd 4" #c ", %%mm1 \n\t"\ | |
| 1013 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 1014 "paddb %%mm6, %%mm0 \n\t"\ | |
| 1015 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
| 1016 "paddb %%mm6, %%mm0 \n\t"\ | |
| 1017 "movq %%mm0, " #c " \n\t"\ | |
| 1018 "movd %%mm3, %%ecx \n\t"\ | |
| 1019 "movd " #d ", %%mm0 \n\t"\ | |
| 1020 "paddsb (%2, %%ecx, 8), %%mm4 \n\t"\ | |
| 1021 "movd 4" #d ", %%mm1 \n\t"\ | |
| 1022 "paddb %%mm6, %%mm4 \n\t"\ | |
| 1023 "punpckldq %%mm1, %%mm0 \n\t"\ | |
| 1024 "movq %%mm4, " #b " \n\t"\ | |
| 1025 "paddb %%mm6, %%mm0 \n\t"\ | |
| 1026 "paddsb (%2, %%ecx, 8), %%mm0 \n\t"\ | |
| 1027 "paddb %%mm6, %%mm0 \n\t"\ | |
| 1028 "movq %%mm0, " #d " \n\t"\ | |
| 1029 | |
| 1030 HX1b((%0),(%%eax),(%%eax, %1),(%%eax, %1, 2)) | |
| 1031 HX1b((%0, %1, 4),(%%ebx),(%%ebx, %1),(%%ebx, %1, 2)) | |
| 1032 | |
| 1033 | |
| 1034 : | |
| 1035 : "r" (src), "r" (stride), "r" (lut) | |
| 1036 : "%eax", "%ebx", "%ecx" | |
| 1037 ); | |
| 1038 #else | |
| 1039 | |
| 1040 //FIXME (has little in common with the mmx2 version) | |
| 1041 for(y=0; y<BLOCK_SIZE; y++) | |
| 1042 { | |
| 1043 int a= src[1] - src[2]; | |
| 1044 int b= src[3] - src[4]; | |
| 1045 int c= src[5] - src[6]; | |
| 1046 | |
| 1047 int d= MAX(ABS(b) - (ABS(a) + ABS(c))/2, 0); | |
| 1048 | |
| 1049 if(d < QP) | |
| 1050 { | |
| 1051 int v = d * SIGN(-b); | |
| 1052 | |
| 1053 src[1] +=v/8; | |
| 1054 src[2] +=v/4; | |
| 1055 src[3] +=3*v/8; | |
| 1056 src[4] -=3*v/8; | |
| 1057 src[5] -=v/4; | |
| 1058 src[6] -=v/8; | |
| 1059 | |
| 1060 } | |
| 1061 src+=stride; | |
| 1062 } | |
| 1063 #endif | |
| 1064 } | |
| 1065 | |
| 1066 | |
| 1067 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 862 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
| 1068 { | 863 { |
| 1069 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 864 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1070 /* | 865 /* |
| 1071 uint8_t tmp[16]; | 866 uint8_t tmp[16]; |
| 1637 | 1432 |
| 1638 src[l4]-= d; | 1433 src[l4]-= d; |
| 1639 src[l5]+= d; | 1434 src[l5]+= d; |
| 1640 } | 1435 } |
| 1641 src++; | 1436 src++; |
| 1642 } | |
| 1643 #endif | |
| 1644 } | |
| 1645 | |
| 1646 //FIXME? |255-0| = 1 | |
| 1647 /** | |
| 1648 * Check if the given 8x8 Block is mostly "flat" | |
| 1649 */ | |
| 1650 static inline int isHorizDC(uint8_t src[], int stride) | |
| 1651 { | |
| 1652 // src++; | |
| 1653 int numEq= 0; | |
| 1654 #if 0 | |
| 1655 asm volatile ( | |
| 1656 // "int $3 \n\t" | |
| 1657 "leal (%1, %2), %%ecx \n\t" | |
| 1658 "leal (%%ecx, %2, 4), %%ebx \n\t" | |
| 1659 // 0 1 2 3 4 5 6 7 8 9 | |
| 1660 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | |
| 1661 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | |
| 1662 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | |
| 1663 "pxor %%mm0, %%mm0 \n\t" | |
| 1664 "movl %1, %%eax \n\t" | |
| 1665 "andl $0x1F, %%eax \n\t" | |
| 1666 "cmpl $24, %%eax \n\t" | |
| 1667 "leal tempBlock, %%eax \n\t" | |
| 1668 "jb 1f \n\t" | |
| 1669 | |
| 1670 #define HDC_CHECK_AND_CPY(src, dst) \ | |
| 1671 "movd " #src ", %%mm2 \n\t"\ | |
| 1672 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\ | |
| 1673 "movq %%mm2, %%mm1 \n\t"\ | |
| 1674 "psrlq $8, %%mm2 \n\t"\ | |
| 1675 "psubb %%mm1, %%mm2 \n\t"\ | |
| 1676 "paddb %%mm7, %%mm2 \n\t"\ | |
| 1677 "pcmpgtb %%mm6, %%mm2 \n\t"\ | |
| 1678 "paddb %%mm2, %%mm0 \n\t"\ | |
| 1679 "movq %%mm1," #dst "(%%eax) \n\t" | |
| 1680 | |
| 1681 HDC_CHECK_AND_CPY((%1),0) | |
| 1682 HDC_CHECK_AND_CPY((%%ecx),8) | |
| 1683 HDC_CHECK_AND_CPY((%%ecx, %2),16) | |
| 1684 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24) | |
| 1685 HDC_CHECK_AND_CPY((%1, %2, 4),32) | |
| 1686 HDC_CHECK_AND_CPY((%%ebx),40) | |
| 1687 HDC_CHECK_AND_CPY((%%ebx, %2),48) | |
| 1688 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56) | |
| 1689 "jmp 2f \n\t" | |
| 1690 "1: \n\t" | |
| 1691 // src does not cross a 32-byte cache line so don't waste time with alignment | |
| 1692 #define HDC_CHECK_AND_CPY2(src, dst) \ | |
| 1693 "movq " #src ", %%mm2 \n\t"\ | |
| 1694 "movq " #src ", %%mm1 \n\t"\ | |
| 1695 "psrlq $8, %%mm2 \n\t"\ | |
| 1696 "psubb %%mm1, %%mm2 \n\t"\ | |
| 1697 "paddb %%mm7, %%mm2 \n\t"\ | |
| 1698 "pcmpgtb %%mm6, %%mm2 \n\t"\ | |
| 1699 "paddb %%mm2, %%mm0 \n\t"\ | |
| 1700 "movq %%mm1," #dst "(%%eax) \n\t" | |
| 1701 | |
| 1702 HDC_CHECK_AND_CPY2((%1),0) | |
| 1703 HDC_CHECK_AND_CPY2((%%ecx),8) | |
| 1704 HDC_CHECK_AND_CPY2((%%ecx, %2),16) | |
| 1705 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24) | |
| 1706 HDC_CHECK_AND_CPY2((%1, %2, 4),32) | |
| 1707 HDC_CHECK_AND_CPY2((%%ebx),40) | |
| 1708 HDC_CHECK_AND_CPY2((%%ebx, %2),48) | |
| 1709 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56) | |
| 1710 "2: \n\t" | |
| 1711 "psllq $8, %%mm0 \n\t" // remove dummy value | |
| 1712 "movq %%mm0, %%mm1 \n\t" | |
| 1713 "psrlw $8, %%mm0 \n\t" | |
| 1714 "paddb %%mm1, %%mm0 \n\t" | |
| 1715 "movq %%mm0, %%mm1 \n\t" | |
| 1716 "psrlq $16, %%mm0 \n\t" | |
| 1717 "paddb %%mm1, %%mm0 \n\t" | |
| 1718 "movq %%mm0, %%mm1 \n\t" | |
| 1719 "psrlq $32, %%mm0 \n\t" | |
| 1720 "paddb %%mm1, %%mm0 \n\t" | |
| 1721 "movd %%mm0, %0 \n\t" | |
| 1722 : "=r" (numEq) | |
| 1723 : "r" (src), "r" (stride) | |
| 1724 : "%eax", "%ebx", "%ecx" | |
| 1725 ); | |
| 1726 // printf("%d\n", numEq); | |
| 1727 numEq= (256 - numEq) &0xFF; | |
| 1728 #else | |
| 1729 int y; | |
| 1730 for(y=0; y<BLOCK_SIZE; y++) | |
| 1731 { | |
| 1732 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | |
| 1733 if(((src[1] - src[2] + 1) & 0xFFFF) < 3) numEq++; | |
| 1734 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | |
| 1735 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | |
| 1736 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | |
| 1737 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | |
| 1738 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | |
| 1739 src+= stride; | |
| 1740 } | |
| 1741 #endif | |
| 1742 /* if(abs(numEq - asmEq) > 0) | |
| 1743 { | |
| 1744 // printf("\nasm:%d c:%d\n", asmEq, numEq); | |
| 1745 for(int y=0; y<8; y++) | |
| 1746 { | |
| 1747 for(int x=0; x<8; x++) | |
| 1748 { | |
| 1749 printf("%d ", src[x + y*stride]); | |
| 1750 } | |
| 1751 printf("\n"); | |
| 1752 } | |
| 1753 } | |
| 1754 */ | |
| 1755 // printf("%d\n", numEq); | |
| 1756 return numEq > hFlatnessThreshold; | |
| 1757 } | |
| 1758 | |
| 1759 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | |
| 1760 { | |
| 1761 if(abs(src[0] - src[7]) > 2*QP) return 0; | |
| 1762 | |
| 1763 return 1; | |
| 1764 } | |
| 1765 | |
| 1766 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) | |
| 1767 { | |
| 1768 #if 0 | |
| 1769 asm volatile( | |
| 1770 "leal (%0, %1), %%ecx \n\t" | |
| 1771 "leal (%%ecx, %1, 4), %%ebx \n\t" | |
| 1772 // 0 1 2 3 4 5 6 7 8 9 | |
| 1773 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 1774 "pxor %%mm7, %%mm7 \n\t" | |
| 1775 "movq bm00001000, %%mm6 \n\t" | |
| 1776 "movd %2, %%mm5 \n\t" // QP | |
| 1777 "movq %%mm5, %%mm4 \n\t" | |
| 1778 "paddusb %%mm5, %%mm5 \n\t" // 2QP | |
| 1779 "paddusb %%mm5, %%mm4 \n\t" // 3QP | |
| 1780 "psllq $24, %%mm4 \n\t" | |
| 1781 "pxor %%mm5, %%mm5 \n\t" // 0 | |
| 1782 "psubb %%mm4, %%mm5 \n\t" // -QP | |
| 1783 "leal tempBlock, %%eax \n\t" | |
| 1784 | |
| 1785 //FIXME? "unroll by 2" and mix | |
| 1786 #ifdef HAVE_MMX2 | |
| 1787 #define HDF(src, dst) \ | |
| 1788 "movq " #src "(%%eax), %%mm0 \n\t"\ | |
| 1789 "movq " #src "(%%eax), %%mm1 \n\t"\ | |
| 1790 "movq " #src "(%%eax), %%mm2 \n\t"\ | |
| 1791 "psrlq $8, %%mm1 \n\t"\ | |
| 1792 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 1793 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 1794 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 1795 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 1796 "pshufw $0x00, %%mm1, %%mm3 \n\t" /* p´5 = |p1 - p2| */\ | |
| 1797 "pminub %%mm1, %%mm3 \n\t" /* p´5 = min(|p2-p1|, |p6-p5|)*/\ | |
| 1798 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
| 1799 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
| 1800 "paddb %%mm5, %%mm1 \n\t"\ | |
| 1801 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 1802 "psrlw $2, %%mm1 \n\t"\ | |
| 1803 "pxor %%mm2, %%mm1 \n\t"\ | |
| 1804 "psubb %%mm2, %%mm1 \n\t"\ | |
| 1805 "pand %%mm6, %%mm1 \n\t"\ | |
| 1806 "psubb %%mm1, %%mm0 \n\t"\ | |
| 1807 "psllq $8, %%mm1 \n\t"\ | |
| 1808 "paddb %%mm1, %%mm0 \n\t"\ | |
| 1809 "movd %%mm0, " #dst" \n\t"\ | |
| 1810 "psrlq $32, %%mm0 \n\t"\ | |
| 1811 "movd %%mm0, 4" #dst" \n\t" | |
| 1812 #else | |
| 1813 #define HDF(src, dst)\ | |
| 1814 "movq " #src "(%%eax), %%mm0 \n\t"\ | |
| 1815 "movq %%mm0, %%mm1 \n\t"\ | |
| 1816 "movq %%mm0, %%mm2 \n\t"\ | |
| 1817 "psrlq $8, %%mm1 \n\t"\ | |
| 1818 "psubusb %%mm1, %%mm2 \n\t"\ | |
| 1819 "psubusb %%mm0, %%mm1 \n\t"\ | |
| 1820 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | |
| 1821 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | |
| 1822 "movq %%mm1, %%mm3 \n\t"\ | |
| 1823 "psllq $32, %%mm3 \n\t"\ | |
| 1824 "movq %%mm3, %%mm4 \n\t"\ | |
| 1825 "psubusb %%mm1, %%mm4 \n\t"\ | |
| 1826 "psubb %%mm4, %%mm3 \n\t"\ | |
| 1827 "psrlq $16, %%mm3 \n\t" /* p´3 = min(|p2-p1|, |p6-p5|)*/\ | |
| 1828 "psubusb %%mm3, %%mm1 \n\t" /* |p3-p4|-min(|p1-p2|,|p5-p6|) */\ | |
| 1829 "paddb %%mm5, %%mm1 \n\t"\ | |
| 1830 "psubusb %%mm5, %%mm1 \n\t"\ | |
| 1831 "psrlw $2, %%mm1 \n\t"\ | |
| 1832 "pxor %%mm2, %%mm1 \n\t"\ | |
| 1833 "psubb %%mm2, %%mm1 \n\t"\ | |
| 1834 "pand %%mm6, %%mm1 \n\t"\ | |
| 1835 "psubb %%mm1, %%mm0 \n\t"\ | |
| 1836 "psllq $8, %%mm1 \n\t"\ | |
| 1837 "paddb %%mm1, %%mm0 \n\t"\ | |
| 1838 "movd %%mm0, " #dst " \n\t"\ | |
| 1839 "psrlq $32, %%mm0 \n\t"\ | |
| 1840 "movd %%mm0, 4" #dst " \n\t" | |
| 1841 #endif | |
| 1842 HDF(0,(%0)) | |
| 1843 HDF(8,(%%ecx)) | |
| 1844 HDF(16,(%%ecx, %1)) | |
| 1845 HDF(24,(%%ecx, %1, 2)) | |
| 1846 HDF(32,(%0, %1, 4)) | |
| 1847 HDF(40,(%%ebx)) | |
| 1848 HDF(48,(%%ebx, %1)) | |
| 1849 HDF(56,(%%ebx, %1, 2)) | |
| 1850 : | |
| 1851 : "r" (dst), "r" (stride), "r" (QP) | |
| 1852 : "%eax", "%ebx", "%ecx" | |
| 1853 ); | |
| 1854 #else | |
| 1855 int y; | |
| 1856 for(y=0; y<BLOCK_SIZE; y++) | |
| 1857 { | |
| 1858 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); | |
| 1859 | |
| 1860 if(ABS(middleEnergy) < 8*QP) | |
| 1861 { | |
| 1862 const int q=(dst[3] - dst[4])/2; | |
| 1863 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); | |
| 1864 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); | |
| 1865 | |
| 1866 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | |
| 1867 d= MAX(d, 0); | |
| 1868 | |
| 1869 d= (5*d + 32) >> 6; | |
| 1870 d*= SIGN(-middleEnergy); | |
| 1871 | |
| 1872 if(q>0) | |
| 1873 { | |
| 1874 d= d<0 ? 0 : d; | |
| 1875 d= d>q ? q : d; | |
| 1876 } | |
| 1877 else | |
| 1878 { | |
| 1879 d= d>0 ? 0 : d; | |
| 1880 d= d<q ? q : d; | |
| 1881 } | |
| 1882 | |
| 1883 dst[3]-= d; | |
| 1884 dst[4]+= d; | |
| 1885 } | |
| 1886 dst+= stride; | |
| 1887 } | |
| 1888 #endif | |
| 1889 } | |
| 1890 | |
| 1891 /** | |
| 1892 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) | |
| 1893 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | |
| 1894 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | |
| 1895 */ | |
| 1896 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) | |
| 1897 { | |
| 1898 | |
| 1899 #if 0 | |
| 1900 asm volatile( | |
| 1901 "leal (%0, %1), %%ecx \n\t" | |
| 1902 "leal (%%ecx, %1, 4), %%ebx \n\t" | |
| 1903 // 0 1 2 3 4 5 6 7 8 9 | |
| 1904 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 1905 "pxor %%mm7, %%mm7 \n\t" | |
| 1906 "leal tempBlock, %%eax \n\t" | |
| 1907 /* | |
| 1908 #define HLP1 "movq (%0), %%mm0 \n\t"\ | |
| 1909 "movq %%mm0, %%mm1 \n\t"\ | |
| 1910 "psllq $8, %%mm0 \n\t"\ | |
| 1911 PAVGB(%%mm1, %%mm0)\ | |
| 1912 "psrlw $8, %%mm0 \n\t"\ | |
| 1913 "pxor %%mm1, %%mm1 \n\t"\ | |
| 1914 "packuswb %%mm1, %%mm0 \n\t"\ | |
| 1915 "movq %%mm0, %%mm1 \n\t"\ | |
| 1916 "movq %%mm0, %%mm2 \n\t"\ | |
| 1917 "psllq $32, %%mm0 \n\t"\ | |
| 1918 "paddb %%mm0, %%mm1 \n\t"\ | |
| 1919 "psllq $16, %%mm2 \n\t"\ | |
| 1920 PAVGB(%%mm2, %%mm0)\ | |
| 1921 "movq %%mm0, %%mm3 \n\t"\ | |
| 1922 "pand bm11001100, %%mm0 \n\t"\ | |
| 1923 "paddusb %%mm0, %%mm3 \n\t"\ | |
| 1924 "psrlq $8, %%mm3 \n\t"\ | |
| 1925 PAVGB(%%mm1, %%mm4)\ | |
| 1926 PAVGB(%%mm3, %%mm2)\ | |
| 1927 "psrlq $16, %%mm2 \n\t"\ | |
| 1928 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
| 1929 "movq %%mm2, (%0) \n\t"\ | |
| 1930 | |
| 1931 #define HLP2 "movq (%0), %%mm0 \n\t"\ | |
| 1932 "movq %%mm0, %%mm1 \n\t"\ | |
| 1933 "psllq $8, %%mm0 \n\t"\ | |
| 1934 PAVGB(%%mm1, %%mm0)\ | |
| 1935 "psrlw $8, %%mm0 \n\t"\ | |
| 1936 "pxor %%mm1, %%mm1 \n\t"\ | |
| 1937 "packuswb %%mm1, %%mm0 \n\t"\ | |
| 1938 "movq %%mm0, %%mm2 \n\t"\ | |
| 1939 "psllq $32, %%mm0 \n\t"\ | |
| 1940 "psllq $16, %%mm2 \n\t"\ | |
| 1941 PAVGB(%%mm2, %%mm0)\ | |
| 1942 "movq %%mm0, %%mm3 \n\t"\ | |
| 1943 "pand bm11001100, %%mm0 \n\t"\ | |
| 1944 "paddusb %%mm0, %%mm3 \n\t"\ | |
| 1945 "psrlq $8, %%mm3 \n\t"\ | |
| 1946 PAVGB(%%mm3, %%mm2)\ | |
| 1947 "psrlq $16, %%mm2 \n\t"\ | |
| 1948 "punpcklbw %%mm2, %%mm2 \n\t"\ | |
| 1949 "movq %%mm2, (%0) \n\t"\ | |
| 1950 */ | |
| 1951 // approximately a 7-Tap Filter with Vector (1,2,3,4,3,2,1)/16 | |
| 1952 /* | |
| 1953 Implemented Exact 7-Tap | |
| 1954 9421 A321 | |
| 1955 36421 64321 | |
| 1956 334321 = | |
| 1957 1234321 = | |
| 1958 1234321 = | |
| 1959 123433 = | |
| 1960 12463 12346 | |
| 1961 1249 123A | |
| 1962 | |
| 1963 */ | |
| 1964 | |
| 1965 #ifdef HAVE_MMX2 | |
| 1966 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 1967 "movq %%mm0, %%mm1 \n\t"\ | |
| 1968 "movq %%mm0, %%mm2 \n\t"\ | |
| 1969 "movq %%mm0, %%mm3 \n\t"\ | |
| 1970 "movq %%mm0, %%mm4 \n\t"\ | |
| 1971 "psllq $8, %%mm1 \n\t"\ | |
| 1972 "psrlq $8, %%mm2 \n\t"\ | |
| 1973 "pand bm00000001, %%mm3 \n\t"\ | |
| 1974 "pand bm10000000, %%mm4 \n\t"\ | |
| 1975 "por %%mm3, %%mm1 \n\t"\ | |
| 1976 "por %%mm4, %%mm2 \n\t"\ | |
| 1977 PAVGB(%%mm2, %%mm1)\ | |
| 1978 PAVGB(%%mm1, %%mm0)\ | |
| 1979 \ | |
| 1980 "pshufw $0xF9, %%mm0, %%mm3 \n\t"\ | |
| 1981 "pshufw $0x90, %%mm0, %%mm4 \n\t"\ | |
| 1982 PAVGB(%%mm3, %%mm4)\ | |
| 1983 PAVGB(%%mm4, %%mm0)\ | |
| 1984 "movd %%mm0, (%0) \n\t"\ | |
| 1985 "psrlq $32, %%mm0 \n\t"\ | |
| 1986 "movd %%mm0, 4(%0) \n\t" | |
| 1987 #else | |
| 1988 #define HLP3(i) "movq " #i "(%%eax), %%mm0 \n\t"\ | |
| 1989 "movq %%mm0, %%mm1 \n\t"\ | |
| 1990 "movq %%mm0, %%mm2 \n\t"\ | |
| 1991 "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |
| 1992 "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |
| 1993 "psllq $8, %%mm1 \n\t"\ | |
| 1994 "psrlq $8, %%mm2 \n\t"\ | |
| 1995 "psrlq $24, %%mm3 \n\t"\ | |
| 1996 "psllq $56, %%mm4 \n\t"\ | |
| 1997 "por %%mm3, %%mm1 \n\t"\ | |
| 1998 "por %%mm4, %%mm2 \n\t"\ | |
| 1999 PAVGB(%%mm2, %%mm1)\ | |
| 2000 PAVGB(%%mm1, %%mm0)\ | |
| 2001 \ | |
| 2002 "movq %%mm0, %%mm3 \n\t"\ | |
| 2003 "movq %%mm0, %%mm4 \n\t"\ | |
| 2004 "movq %%mm0, %%mm5 \n\t"\ | |
| 2005 "psrlq $16, %%mm3 \n\t"\ | |
| 2006 "psllq $16, %%mm4 \n\t"\ | |
| 2007 "pand bm11000000, %%mm5 \n\t"\ | |
| 2008 "por %%mm5, %%mm3 \n\t"\ | |
| 2009 "movq %%mm0, %%mm5 \n\t"\ | |
| 2010 "pand bm00000011, %%mm5 \n\t"\ | |
| 2011 "por %%mm5, %%mm4 \n\t"\ | |
| 2012 PAVGB(%%mm3, %%mm4)\ | |
| 2013 PAVGB(%%mm4, %%mm0)\ | |
| 2014 "movd %%mm0, (%0) \n\t"\ | |
| 2015 "psrlq $32, %%mm0 \n\t"\ | |
| 2016 "movd %%mm0, 4(%0) \n\t" | |
| 2017 #endif | |
| 2018 | |
| 2019 /* uses the 7-Tap Filter: 1112111 */ | |
| 2020 #define NEW_HLP(src, dst)\ | |
| 2021 "movq " #src "(%%eax), %%mm1 \n\t"\ | |
| 2022 "movq " #src "(%%eax), %%mm2 \n\t"\ | |
| 2023 "psllq $8, %%mm1 \n\t"\ | |
| 2024 "psrlq $8, %%mm2 \n\t"\ | |
| 2025 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\ | |
| 2026 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\ | |
| 2027 "psrlq $24, %%mm3 \n\t"\ | |
| 2028 "psllq $56, %%mm4 \n\t"\ | |
| 2029 "por %%mm3, %%mm1 \n\t"\ | |
| 2030 "por %%mm4, %%mm2 \n\t"\ | |
| 2031 "movq %%mm1, %%mm5 \n\t"\ | |
| 2032 PAVGB(%%mm2, %%mm1)\ | |
| 2033 "movq " #src "(%%eax), %%mm0 \n\t"\ | |
| 2034 PAVGB(%%mm1, %%mm0)\ | |
| 2035 "psllq $8, %%mm5 \n\t"\ | |
| 2036 "psrlq $8, %%mm2 \n\t"\ | |
| 2037 "por %%mm3, %%mm5 \n\t"\ | |
| 2038 "por %%mm4, %%mm2 \n\t"\ | |
| 2039 "movq %%mm5, %%mm1 \n\t"\ | |
| 2040 PAVGB(%%mm2, %%mm5)\ | |
| 2041 "psllq $8, %%mm1 \n\t"\ | |
| 2042 "psrlq $8, %%mm2 \n\t"\ | |
| 2043 "por %%mm3, %%mm1 \n\t"\ | |
| 2044 "por %%mm4, %%mm2 \n\t"\ | |
| 2045 PAVGB(%%mm2, %%mm1)\ | |
| 2046 PAVGB(%%mm1, %%mm5)\ | |
| 2047 PAVGB(%%mm5, %%mm0)\ | |
| 2048 "movd %%mm0, " #dst " \n\t"\ | |
| 2049 "psrlq $32, %%mm0 \n\t"\ | |
| 2050 "movd %%mm0, 4" #dst " \n\t" | |
| 2051 | |
| 2052 /* uses the 9-Tap Filter: 112242211 */ | |
| 2053 #define NEW_HLP2(i)\ | |
| 2054 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ | |
| 2055 "movq %%mm0, %%mm1 \n\t" /*0001000*/\ | |
| 2056 "movq %%mm0, %%mm2 \n\t" /*0001000*/\ | |
| 2057 "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |
| 2058 "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |
| 2059 "psllq $8, %%mm1 \n\t"\ | |
| 2060 "psrlq $8, %%mm2 \n\t"\ | |
| 2061 "psrlq $24, %%mm3 \n\t"\ | |
| 2062 "psllq $56, %%mm4 \n\t"\ | |
| 2063 "por %%mm3, %%mm1 \n\t" /*0010000*/\ | |
| 2064 "por %%mm4, %%mm2 \n\t" /*0000100*/\ | |
| 2065 "movq %%mm1, %%mm5 \n\t" /*0010000*/\ | |
| 2066 PAVGB(%%mm2, %%mm1) /*0010100*/\ | |
| 2067 PAVGB(%%mm1, %%mm0) /*0012100*/\ | |
| 2068 "psllq $8, %%mm5 \n\t"\ | |
| 2069 "psrlq $8, %%mm2 \n\t"\ | |
| 2070 "por %%mm3, %%mm5 \n\t" /*0100000*/\ | |
| 2071 "por %%mm4, %%mm2 \n\t" /*0000010*/\ | |
| 2072 "movq %%mm5, %%mm1 \n\t" /*0100000*/\ | |
| 2073 PAVGB(%%mm2, %%mm5) /*0100010*/\ | |
| 2074 "psllq $8, %%mm1 \n\t"\ | |
| 2075 "psrlq $8, %%mm2 \n\t"\ | |
| 2076 "por %%mm3, %%mm1 \n\t" /*1000000*/\ | |
| 2077 "por %%mm4, %%mm2 \n\t" /*0000001*/\ | |
| 2078 "movq %%mm1, %%mm6 \n\t" /*1000000*/\ | |
| 2079 PAVGB(%%mm2, %%mm1) /*1000001*/\ | |
| 2080 "psllq $8, %%mm6 \n\t"\ | |
| 2081 "psrlq $8, %%mm2 \n\t"\ | |
| 2082 "por %%mm3, %%mm6 \n\t"/*100000000*/\ | |
| 2083 "por %%mm4, %%mm2 \n\t"/*000000001*/\ | |
| 2084 PAVGB(%%mm2, %%mm6) /*100000001*/\ | |
| 2085 PAVGB(%%mm6, %%mm1) /*110000011*/\ | |
| 2086 PAVGB(%%mm1, %%mm5) /*112000211*/\ | |
| 2087 PAVGB(%%mm5, %%mm0) /*112242211*/\ | |
| 2088 "movd %%mm0, (%0) \n\t"\ | |
| 2089 "psrlq $32, %%mm0 \n\t"\ | |
| 2090 "movd %%mm0, 4(%0) \n\t" | |
| 2091 | |
| 2092 #define HLP(src, dst) NEW_HLP(src, dst) | |
| 2093 | |
| 2094 HLP(0, (%0)) | |
| 2095 HLP(8, (%%ecx)) | |
| 2096 HLP(16, (%%ecx, %1)) | |
| 2097 HLP(24, (%%ecx, %1, 2)) | |
| 2098 HLP(32, (%0, %1, 4)) | |
| 2099 HLP(40, (%%ebx)) | |
| 2100 HLP(48, (%%ebx, %1)) | |
| 2101 HLP(56, (%%ebx, %1, 2)) | |
| 2102 | |
| 2103 : | |
| 2104 : "r" (dst), "r" (stride) | |
| 2105 : "%eax", "%ebx", "%ecx" | |
| 2106 ); | |
| 2107 | |
| 2108 #else | |
| 2109 int y; | |
| 2110 for(y=0; y<BLOCK_SIZE; y++) | |
| 2111 { | |
| 2112 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; | |
| 2113 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | |
| 2114 | |
| 2115 int sums[9]; | |
| 2116 sums[0] = first + dst[0]; | |
| 2117 sums[1] = dst[0] + dst[1]; | |
| 2118 sums[2] = dst[1] + dst[2]; | |
| 2119 sums[3] = dst[2] + dst[3]; | |
| 2120 sums[4] = dst[3] + dst[4]; | |
| 2121 sums[5] = dst[4] + dst[5]; | |
| 2122 sums[6] = dst[5] + dst[6]; | |
| 2123 sums[7] = dst[6] + dst[7]; | |
| 2124 sums[8] = dst[7] + last; | |
| 2125 | |
| 2126 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | |
| 2127 dst[1]= ((dst[1]<<2) + ((first + sums[0] + sums[3])<<1) + sums[5] + 8)>>4; | |
| 2128 dst[2]= ((dst[2]<<2) + ((first + sums[1] + sums[4])<<1) + sums[6] + 8)>>4; | |
| 2129 dst[3]= ((dst[3]<<2) + ((sums[2] + sums[5])<<1) + sums[0] + sums[7] + 8)>>4; | |
| 2130 dst[4]= ((dst[4]<<2) + ((sums[3] + sums[6])<<1) + sums[1] + sums[8] + 8)>>4; | |
| 2131 dst[5]= ((dst[5]<<2) + ((last + sums[7] + sums[4])<<1) + sums[2] + 8)>>4; | |
| 2132 dst[6]= (((last + dst[6])<<2) + ((dst[7] + sums[5])<<1) + sums[3] + 8)>>4; | |
| 2133 dst[7]= ((sums[8]<<2) + ((last + sums[6])<<1) + sums[4] + 8)>>4; | |
| 2134 | |
| 2135 dst+= stride; | |
| 2136 } | 1437 } |
| 2137 #endif | 1438 #endif |
| 2138 } | 1439 } |
| 2139 | 1440 |
| 2140 static inline void dering(uint8_t src[], int stride, int QP) | 1441 static inline void dering(uint8_t src[], int stride, int QP) |
| 3531 | 2832 |
| 3532 horizontal_size >>= 1; | 2833 horizontal_size >>= 1; |
| 3533 vertical_size >>= 1; | 2834 vertical_size >>= 1; |
| 3534 src_stride >>= 1; | 2835 src_stride >>= 1; |
| 3535 dst_stride >>= 1; | 2836 dst_stride >>= 1; |
| 3536 // mode&= ~(LINEAR_IPOL_DEINT_FILTER | LINEAR_BLEND_DEINT_FILTER | | |
| 3537 // MEDIAN_DEINT_FILTER | CUBIC_IPOL_DEINT_FILTER); | |
| 3538 | 2837 |
| 3539 if(1) | 2838 if(1) |
| 3540 { | 2839 { |
| 3541 postProcess(src[1], src_stride, dst[1], dst_stride, | 2840 postProcess(src[1], src_stride, dst[1], dst_stride, |
| 3542 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); | 2841 horizontal_size, vertical_size, QP_store, QP_stride, 1, &ppMode); |
| 3636 * Copies a block from src to dst and fixes the blacklevel | 2935 * Copies a block from src to dst and fixes the blacklevel |
| 3637 * numLines must be a multiple of 4 | 2936 * numLines must be a multiple of 4 |
| 3638 * levelFix == 0 -> don't touch the brightness & contrast | 2937 * levelFix == 0 -> don't touch the brightness & contrast |
| 3639 */ | 2938 */ |
| 3640 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, | 2939 static inline void blockCopy(uint8_t dst[], int dstStride, uint8_t src[], int srcStride, |
| 3641 int numLines, int levelFix) | 2940 int levelFix) |
| 3642 { | 2941 { |
| 3643 #ifndef HAVE_MMX | 2942 #ifndef HAVE_MMX |
| 3644 int i; | 2943 int i; |
| 3645 #endif | 2944 #endif |
| 3646 if(levelFix) | 2945 if(levelFix) |
| 3693 :"r" (srcStride), | 2992 :"r" (srcStride), |
| 3694 "r" (dstStride) | 2993 "r" (dstStride) |
| 3695 : "%eax", "%ebx" | 2994 : "%eax", "%ebx" |
| 3696 ); | 2995 ); |
| 3697 #else | 2996 #else |
| 3698 for(i=0; i<numLines; i++) | 2997 for(i=0; i<8; i++) |
| 3699 memcpy( &(dst[dstStride*i]), | 2998 memcpy( &(dst[dstStride*i]), |
| 3700 &(src[srcStride*i]), BLOCK_SIZE); | 2999 &(src[srcStride*i]), BLOCK_SIZE); |
| 3701 #endif | 3000 #endif |
| 3702 } | 3001 } |
| 3703 else | 3002 else |
| 3704 { | 3003 { |
| 3705 #ifdef HAVE_MMX | 3004 #ifdef HAVE_MMX |
| 3706 asm volatile( | 3005 asm volatile( |
| 3707 "movl %4, %%eax \n\t" | |
| 3708 "movl %%eax, temp0\n\t" | |
| 3709 "pushl %0 \n\t" | 3006 "pushl %0 \n\t" |
| 3710 "pushl %1 \n\t" | 3007 "pushl %1 \n\t" |
| 3711 "leal (%2,%2), %%eax \n\t" | 3008 "leal (%2,%2), %%eax \n\t" |
| 3712 "leal (%3,%3), %%ebx \n\t" | 3009 "leal (%3,%3), %%ebx \n\t" |
| 3713 "movq packedYOffset, %%mm2 \n\t" | |
| 3714 "movq packedYScale, %%mm3 \n\t" | |
| 3715 | 3010 |
| 3716 #define SIMPLE_CPY \ | 3011 #define SIMPLE_CPY \ |
| 3717 "movq (%0), %%mm0 \n\t"\ | 3012 "movq (%0), %%mm0 \n\t"\ |
| 3718 "movq (%0,%2), %%mm1 \n\t"\ | 3013 "movq (%0,%2), %%mm1 \n\t"\ |
| 3719 "movq %%mm0, (%1) \n\t"\ | 3014 "movq %%mm0, (%1) \n\t"\ |
| 3720 "movq %%mm1, (%1, %3) \n\t"\ | 3015 "movq %%mm1, (%1, %3) \n\t"\ |
| 3721 | 3016 |
| 3722 "1: \n\t" | |
| 3723 SIMPLE_CPY | 3017 SIMPLE_CPY |
| 3724 "addl %%eax, %0 \n\t" | 3018 "addl %%eax, %0 \n\t" |
| 3725 "addl %%ebx, %1 \n\t" | 3019 "addl %%ebx, %1 \n\t" |
| 3726 SIMPLE_CPY | 3020 SIMPLE_CPY |
| 3727 "addl %%eax, %0 \n\t" | 3021 "addl %%eax, %0 \n\t" |
| 3728 "addl %%ebx, %1 \n\t" | 3022 "addl %%ebx, %1 \n\t" |
| 3729 "decl temp0 \n\t" | 3023 SIMPLE_CPY |
| 3730 "jnz 1b \n\t" | 3024 "addl %%eax, %0 \n\t" |
| 3025 "addl %%ebx, %1 \n\t" | |
| 3026 SIMPLE_CPY | |
| 3731 | 3027 |
| 3732 "popl %1 \n\t" | 3028 "popl %1 \n\t" |
| 3733 "popl %0 \n\t" | 3029 "popl %0 \n\t" |
| 3734 : : "r" (src), | 3030 : : "r" (src), |
| 3735 "r" (dst), | 3031 "r" (dst), |
| 3736 "r" (srcStride), | 3032 "r" (srcStride), |
| 3737 "r" (dstStride), | 3033 "r" (dstStride) |
| 3738 "m" (numLines>>2) | |
| 3739 : "%eax", "%ebx" | 3034 : "%eax", "%ebx" |
| 3740 ); | 3035 ); |
| 3741 #else | 3036 #else |
| 3742 for(i=0; i<numLines; i++) | 3037 for(i=0; i<8; i++) |
| 3743 memcpy( &(dst[dstStride*i]), | 3038 memcpy( &(dst[dstStride*i]), |
| 3744 &(src[srcStride*i]), BLOCK_SIZE); | 3039 &(src[srcStride*i]), BLOCK_SIZE); |
| 3745 #endif | 3040 #endif |
| 3746 } | 3041 } |
| 3747 } | 3042 } |
| 3772 | 3067 |
| 3773 /* Temporal noise reducing buffers */ | 3068 /* Temporal noise reducing buffers */ |
| 3774 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; | 3069 static uint8_t *tempBlured[3]= {NULL,NULL,NULL}; |
| 3775 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; | 3070 static uint32_t *tempBluredPast[3]= {NULL,NULL,NULL}; |
| 3776 | 3071 |
| 3072 int copyAhead; | |
| 3073 | |
| 3777 #ifdef PP_FUNNY_STRIDE | 3074 #ifdef PP_FUNNY_STRIDE |
| 3778 uint8_t *dstBlockPtrBackup; | 3075 uint8_t *dstBlockPtrBackup; |
| 3779 uint8_t *srcBlockPtrBackup; | 3076 uint8_t *srcBlockPtrBackup; |
| 3780 #endif | 3077 #endif |
| 3781 | 3078 |
| 3790 #ifdef HAVE_MMX | 3087 #ifdef HAVE_MMX |
| 3791 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; | 3088 maxTmpNoise[0]= ppMode->maxTmpNoise[0]; |
| 3792 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; | 3089 maxTmpNoise[1]= ppMode->maxTmpNoise[1]; |
| 3793 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; | 3090 maxTmpNoise[2]= ppMode->maxTmpNoise[2]; |
| 3794 #endif | 3091 #endif |
| 3092 | |
| 3093 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; | |
| 3094 else if(mode & LINEAR_BLEND_DEINT_FILTER) copyAhead=14; | |
| 3095 else if( (mode & V_DEBLOCK) | |
| 3096 || (mode & LINEAR_IPOL_DEINT_FILTER) | |
| 3097 || (mode & MEDIAN_DEINT_FILTER)) copyAhead=13; | |
| 3098 else if(mode & V_X1_FILTER) copyAhead=11; | |
| 3099 else if(mode & V_RK1_FILTER) copyAhead=10; | |
| 3100 else if(mode & DERING) copyAhead=9; | |
| 3101 else copyAhead=8; | |
| 3102 | |
| 3103 copyAhead-= 8; | |
| 3795 | 3104 |
| 3796 if(tempDst==NULL) | 3105 if(tempDst==NULL) |
| 3797 { | 3106 { |
| 3798 tempDst= (uint8_t*)memalign(8, 1024*24); | 3107 tempDst= (uint8_t*)memalign(8, 1024*24); |
| 3799 tempSrc= (uint8_t*)memalign(8, 1024*24); | 3108 tempSrc= (uint8_t*)memalign(8, 1024*24); |
| 3896 for(x=0; x<width; x+=BLOCK_SIZE) | 3205 for(x=0; x<width; x+=BLOCK_SIZE) |
| 3897 { | 3206 { |
| 3898 | 3207 |
| 3899 #ifdef HAVE_MMX2 | 3208 #ifdef HAVE_MMX2 |
| 3900 /* | 3209 /* |
| 3901 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3902 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3903 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3904 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3905 */ | |
| 3906 /* | |
| 3907 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3210 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
| 3908 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3211 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
| 3909 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3212 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
| 3910 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3213 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
| 3911 */ | 3214 */ |
| 3912 | 3215 |
| 3913 asm( | 3216 asm( |
| 3914 "movl %4, %%eax \n\t" | 3217 "movl %4, %%eax \n\t" |
| 3915 "shrl $2, %%eax \n\t" | 3218 "shrl $2, %%eax \n\t" |
| 3916 "andl $6, %%eax \n\t" | 3219 "andl $6, %%eax \n\t" |
| 3917 "addl $8, %%eax \n\t" | 3220 "addl %5, %%eax \n\t" |
| 3918 "movl %%eax, %%ebx \n\t" | 3221 "movl %%eax, %%ebx \n\t" |
| 3919 "imul %1, %%eax \n\t" | 3222 "imul %1, %%eax \n\t" |
| 3920 "imul %3, %%ebx \n\t" | 3223 "imul %3, %%ebx \n\t" |
| 3921 "prefetchnta 32(%%eax, %0) \n\t" | 3224 "prefetchnta 32(%%eax, %0) \n\t" |
| 3922 "prefetcht0 32(%%ebx, %2) \n\t" | 3225 "prefetcht0 32(%%ebx, %2) \n\t" |
| 3923 "addl %1, %%eax \n\t" | 3226 "addl %1, %%eax \n\t" |
| 3924 "addl %3, %%ebx \n\t" | 3227 "addl %3, %%ebx \n\t" |
| 3925 "prefetchnta 32(%%eax, %0) \n\t" | 3228 "prefetchnta 32(%%eax, %0) \n\t" |
| 3926 "prefetcht0 32(%%ebx, %2) \n\t" | 3229 "prefetcht0 32(%%ebx, %2) \n\t" |
| 3927 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 3230 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 3928 "m" (x) | 3231 "m" (x), "m" (copyAhead) |
| 3929 : "%eax", "%ebx" | 3232 : "%eax", "%ebx" |
| 3930 ); | 3233 ); |
| 3931 | 3234 |
| 3932 #elif defined(HAVE_3DNOW) | 3235 #elif defined(HAVE_3DNOW) |
| 3933 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | 3236 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... |
| 3936 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3239 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 3937 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3240 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 3938 */ | 3241 */ |
| 3939 #endif | 3242 #endif |
| 3940 | 3243 |
| 3941 blockCopy(dstBlock + dstStride*8, dstStride, | 3244 blockCopy(dstBlock + dstStride*copyAhead, dstStride, |
| 3942 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | 3245 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
| 3943 | 3246 |
| 3944 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3247 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 3945 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3248 deInterlaceInterpolateLinear(dstBlock, dstStride); |
| 3946 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3249 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 3947 deInterlaceBlendLinear(dstBlock, dstStride); | 3250 deInterlaceBlendLinear(dstBlock, dstStride); |
| 3953 deInterlaceBlendCubic(dstBlock, dstStride); | 3256 deInterlaceBlendCubic(dstBlock, dstStride); |
| 3954 */ | 3257 */ |
| 3955 dstBlock+=8; | 3258 dstBlock+=8; |
| 3956 srcBlock+=8; | 3259 srcBlock+=8; |
| 3957 } | 3260 } |
| 3958 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); | 3261 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, copyAhead*dstStride ); |
| 3959 } | 3262 } |
| 3960 | 3263 |
| 3961 for(y=0; y<height; y+=BLOCK_SIZE) | 3264 for(y=0; y<height; y+=BLOCK_SIZE) |
| 3962 { | 3265 { |
| 3963 //1% speedup if these are here instead of the inner loop | 3266 //1% speedup if these are here instead of the inner loop |
| 3974 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | 3277 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 3975 if not, then use a temporary buffer */ | 3278 if not, then use a temporary buffer */ |
| 3976 if(y+15 >= height) | 3279 if(y+15 >= height) |
| 3977 { | 3280 { |
| 3978 int i; | 3281 int i; |
| 3979 /* copy from line 8 to 15 of src, these will be copied with | 3282 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
| 3980 blockcopy to dst later */ | 3283 blockcopy to dst later */ |
| 3981 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, | 3284 memcpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
| 3982 srcStride*MAX(height-y-8, 0) ); | 3285 srcStride*MAX(height-y-copyAhead, 0) ); |
| 3983 | 3286 |
| 3984 /* duplicate last line of src to fill the void up to line 15 */ | 3287 /* duplicate last line of src to fill the void up to line (copyAhead+7) */ |
| 3985 for(i=MAX(height-y, 8); i<=15; i++) | 3288 for(i=MAX(height-y, 8); i<copyAhead+8; i++) |
| 3986 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); | 3289 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), srcStride); |
| 3987 | 3290 |
| 3988 /* copy up to 9 lines of dst (line -1 to 7)*/ | 3291 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
| 3989 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) ); | 3292 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, copyAhead+1) ); |
| 3990 | 3293 |
| 3991 /* duplicate last line of dst to fill the void up to line 8 */ | 3294 /* duplicate last line of dst to fill the void up to line (copyAhead) */ |
| 3992 for(i=height-y+1; i<=8; i++) | 3295 for(i=height-y+1; i<=copyAhead; i++) |
| 3993 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); | 3296 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), dstStride); |
| 3994 | 3297 |
| 3995 dstBlock= tempDst + dstStride; | 3298 dstBlock= tempDst + dstStride; |
| 3996 srcBlock= tempSrc; | 3299 srcBlock= tempSrc; |
| 3997 } | 3300 } |
| 4039 T0= rdtsc(); | 3342 T0= rdtsc(); |
| 4040 #endif | 3343 #endif |
| 4041 | 3344 |
| 4042 #ifdef HAVE_MMX2 | 3345 #ifdef HAVE_MMX2 |
| 4043 /* | 3346 /* |
| 4044 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 4045 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 4046 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 4047 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 4048 */ | |
| 4049 /* | |
| 4050 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3347 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
| 4051 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3348 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
| 4052 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3349 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
| 4053 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3350 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
| 4054 */ | 3351 */ |
| 4055 | 3352 |
| 4056 asm( | 3353 asm( |
| 4057 "movl %4, %%eax \n\t" | 3354 "movl %4, %%eax \n\t" |
| 4058 "shrl $2, %%eax \n\t" | 3355 "shrl $2, %%eax \n\t" |
| 4059 "andl $6, %%eax \n\t" | 3356 "andl $6, %%eax \n\t" |
| 4060 "addl $8, %%eax \n\t" | 3357 "addl %5, %%eax \n\t" |
| 4061 "movl %%eax, %%ebx \n\t" | 3358 "movl %%eax, %%ebx \n\t" |
| 4062 "imul %1, %%eax \n\t" | 3359 "imul %1, %%eax \n\t" |
| 4063 "imul %3, %%ebx \n\t" | 3360 "imul %3, %%ebx \n\t" |
| 4064 "prefetchnta 32(%%eax, %0) \n\t" | 3361 "prefetchnta 32(%%eax, %0) \n\t" |
| 4065 "prefetcht0 32(%%ebx, %2) \n\t" | 3362 "prefetcht0 32(%%ebx, %2) \n\t" |
| 4066 "addl %1, %%eax \n\t" | 3363 "addl %1, %%eax \n\t" |
| 4067 "addl %3, %%ebx \n\t" | 3364 "addl %3, %%ebx \n\t" |
| 4068 "prefetchnta 32(%%eax, %0) \n\t" | 3365 "prefetchnta 32(%%eax, %0) \n\t" |
| 4069 "prefetcht0 32(%%ebx, %2) \n\t" | 3366 "prefetcht0 32(%%ebx, %2) \n\t" |
| 4070 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | 3367 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), |
| 4071 "m" (x) | 3368 "m" (x), "m" (copyAhead) |
| 4072 : "%eax", "%ebx" | 3369 : "%eax", "%ebx" |
| 4073 ); | 3370 ); |
| 4074 | 3371 |
| 4075 #elif defined(HAVE_3DNOW) | 3372 #elif defined(HAVE_3DNOW) |
| 4076 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | 3373 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... |
| 4098 dstBlock= tempDstBlock; | 3395 dstBlock= tempDstBlock; |
| 4099 srcBlock= tempSrcBlock; | 3396 srcBlock= tempSrcBlock; |
| 4100 } | 3397 } |
| 4101 #endif | 3398 #endif |
| 4102 | 3399 |
| 4103 blockCopy(dstBlock + dstStride*8, dstStride, | 3400 blockCopy(dstBlock + dstStride*copyAhead, dstStride, |
| 4104 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | 3401 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX); |
| 4105 | 3402 |
| 4106 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3403 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 4107 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3404 deInterlaceInterpolateLinear(dstBlock, dstStride); |
| 4108 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3405 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 4109 deInterlaceBlendLinear(dstBlock, dstStride); | 3406 deInterlaceBlendLinear(dstBlock, dstStride); |
| 4158 vertRK1Filter(tempBlock1, 16, QP); | 3455 vertRK1Filter(tempBlock1, 16, QP); |
| 4159 else if(mode & H_X1_FILTER) | 3456 else if(mode & H_X1_FILTER) |
| 4160 vertX1Filter(tempBlock1, 16, QP); | 3457 vertX1Filter(tempBlock1, 16, QP); |
| 4161 else if(mode & H_DEBLOCK) | 3458 else if(mode & H_DEBLOCK) |
| 4162 { | 3459 { |
| 4163 if( isVertDC(tempBlock1, 16)) | 3460 if( isVertDC(tempBlock1, 16) ) |
| 4164 { | 3461 { |
| 4165 if(isVertMinMaxOk(tempBlock1, 16, QP)) | 3462 if(isVertMinMaxOk(tempBlock1, 16, QP)) |
| 4166 doVertLowPass(tempBlock1, 16, QP); | 3463 doVertLowPass(tempBlock1, 16, QP); |
| 4167 } | 3464 } |
| 4168 else | 3465 else |
| 4250 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | 3547 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
| 4251 } | 3548 } |
| 4252 /* | 3549 /* |
| 4253 for(x=0; x<width; x+=32) | 3550 for(x=0; x<width; x+=32) |
| 4254 { | 3551 { |
| 4255 int i; | 3552 volatile int i; |
| 4256 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] | 3553 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
| 4257 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] | 3554 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] |
| 4258 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride] | 3555 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 4259 + dstBlock[x +13*dstStride] + dstBlock[x +14*dstStride] | 3556 // + dstBlock[x +13*dstStride] |
| 4260 + dstBlock[x +15*dstStride]; | 3557 // + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
| 4261 } | 3558 }*/ |
| 4262 */ } | 3559 } |
| 4263 #ifdef HAVE_3DNOW | 3560 #ifdef HAVE_3DNOW |
| 4264 asm volatile("femms"); | 3561 asm volatile("femms"); |
| 4265 #elif defined (HAVE_MMX) | 3562 #elif defined (HAVE_MMX) |
| 4266 asm volatile("emms"); | 3563 asm volatile("emms"); |
| 4267 #endif | 3564 #endif |
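The "cleanup of blockcopy" part of the commit message shows up in the blockCopy() hunk above: the numLines parameter and the MMX loop counter (temp0) are gone, and exactly one 8-line block is copied by a fully unrolled sequence of SIMPLE_CPY steps. For reference, here is a plain-C sketch of what the simplified fast path (levelFix == 0) now amounts to, equivalent to the non-MMX fallback shown in the hunk (the function name is hypothetical):

```c
#include <stdint.h>
#include <string.h>

#define BLOCK_SIZE 8   /* block size used throughout postprocess_template.c */

/* Plain-C equivalent of the simplified blockCopy() fast path (levelFix == 0):
 * always copies exactly BLOCK_SIZE lines of BLOCK_SIZE bytes, which is what
 * lets the MMX version drop its run-time loop counter and unroll completely. */
static inline void blockCopySketch(uint8_t *dst, int dstStride,
                                   const uint8_t *src, int srcStride)
{
    int i;
    for(i = 0; i < BLOCK_SIZE; i++)
        memcpy(dst + i*dstStride, src + i*srcStride, BLOCK_SIZE);
}
```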
