Mercurial > libavcodec.hg
comparison libpostproc/postprocess.c @ 167:2d97f0157a79 libavcodec
faster dering
| author | michael |
|---|---|
| date | Sat, 24 Nov 2001 01:38:30 +0000 |
| parents | ec349ac7869b |
| children | 712c7a115164 |
comparison
equal
deleted
inserted
replaced
| 166:ec349ac7869b | 167:2d97f0157a79 |
|---|---|
| 45 c = checked against the other implementations (-vo md5) | 45 c = checked against the other implementations (-vo md5) |
| 46 */ | 46 */ |
| 47 | 47 |
| 48 /* | 48 /* |
| 49 TODO: | 49 TODO: |
| 50 verify that everything works as it should (how?) | |
| 51 reduce the time wasted on the mem transfer | 50 reduce the time wasted on the mem transfer |
| 52 implement everything in C at least (done at the moment but ...) | 51 implement everything in C at least (done at the moment but ...) |
| 53 unroll stuff if instructions depend too much on the prior one | 52 unroll stuff if instructions depend too much on the prior one |
| 54 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? | 53 we use 8x8 blocks for the horizontal filters, opendivx seems to use 8x4? |
| 55 move YScale thing to the end instead of fixing QP | 54 move YScale thing to the end instead of fixing QP |
| 60 split this huge file | 59 split this huge file |
| 61 border remover | 60 border remover |
| 62 optimize c versions | 61 optimize c versions |
| 63 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks | 62 try to unroll inner for(x=0 ... loop to avoid these damn if(x ... checks |
| 64 smart blur | 63 smart blur |
| 65 commandline option for the deblock thresholds | 64 commandline option for the deblock / dering thresholds |
| 65 memcpy chrominance if no chroma filtering is done | |
| 66 ... | 66 ... |
| 67 */ | 67 */ |
| 68 | 68 |
| 69 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
| 70 | 70 |
| 160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code | 160 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code |
| 161 #endif | 161 #endif |
| 162 | 162 |
| 163 int hFlatnessThreshold= 56 - 16; | 163 int hFlatnessThreshold= 56 - 16; |
| 164 int vFlatnessThreshold= 56 - 16; | 164 int vFlatnessThreshold= 56 - 16; |
| 165 int deringThreshold= 20; | |
| 165 | 166 |
| 166 //amount of "black" you are willing to lose to get a brightness corrected picture | 167 //amount of "black" you are willing to lose to get a brightness corrected picture |
| 167 double maxClippedThreshold= 0.01; | 168 double maxClippedThreshold= 0.01; |
| 168 | 169 |
| 169 int maxAllowedY=234; | 170 int maxAllowedY=234; |
| 308 "paddb %%mm7, %%mm2 \n\t" | 309 "paddb %%mm7, %%mm2 \n\t" |
| 309 "pcmpgtb %%mm6, %%mm2 \n\t" | 310 "pcmpgtb %%mm6, %%mm2 \n\t" |
| 310 "paddb %%mm2, %%mm0 \n\t" | 311 "paddb %%mm2, %%mm0 \n\t" |
| 311 | 312 |
| 312 " \n\t" | 313 " \n\t" |
| 314 #ifdef HAVE_MMX2 | |
| 315 "pxor %%mm7, %%mm7 \n\t" | |
| 316 "psadbw %%mm7, %%mm0 \n\t" | |
| 317 #else | |
| 313 "movq %%mm0, %%mm1 \n\t" | 318 "movq %%mm0, %%mm1 \n\t" |
| 314 "psrlw $8, %%mm0 \n\t" | 319 "psrlw $8, %%mm0 \n\t" |
| 315 "paddb %%mm1, %%mm0 \n\t" | 320 "paddb %%mm1, %%mm0 \n\t" |
| 316 #ifdef HAVE_MMX2 | |
| 317 "pshufw $0xF9, %%mm0, %%mm1 \n\t" | |
| 318 "paddb %%mm1, %%mm0 \n\t" | |
| 319 "pshufw $0xFE, %%mm0, %%mm1 \n\t" | |
| 320 #else | |
| 321 "movq %%mm0, %%mm1 \n\t" | 321 "movq %%mm0, %%mm1 \n\t" |
| 322 "psrlq $16, %%mm0 \n\t" | 322 "psrlq $16, %%mm0 \n\t" |
| 323 "paddb %%mm1, %%mm0 \n\t" | 323 "paddb %%mm1, %%mm0 \n\t" |
| 324 "movq %%mm0, %%mm1 \n\t" | 324 "movq %%mm0, %%mm1 \n\t" |
| 325 "psrlq $32, %%mm0 \n\t" | 325 "psrlq $32, %%mm0 \n\t" |
| 326 #endif | |
| 327 "paddb %%mm1, %%mm0 \n\t" | 326 "paddb %%mm1, %%mm0 \n\t" |
| 327 #endif | |
| 328 "movd %%mm0, %0 \n\t" | 328 "movd %%mm0, %0 \n\t" |
| 329 : "=r" (numEq) | 329 : "=r" (numEq) |
| 330 : "r" (src), "r" (stride) | 330 : "r" (src), "r" (stride) |
| 331 : "%eax", "%ebx" | 331 : "%ebx" |
| 332 ); | 332 ); |
| 333 | 333 numEq= (-numEq) &0xFF; |
| 334 numEq= (256 - numEq) &0xFF; | |
| 335 | 334 |
| 336 #else | 335 #else |
| 337 for(y=0; y<BLOCK_SIZE-1; y++) | 336 for(y=0; y<BLOCK_SIZE-1; y++) |
| 338 { | 337 { |
| 339 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; | 338 if(((src[0] - src[0+stride] + 1)&0xFFFF) < 3) numEq++; |
| 1589 "leal (%0, %1), %%eax \n\t" | 1588 "leal (%0, %1), %%eax \n\t" |
| 1590 "leal (%%eax, %1, 4), %%ebx \n\t" | 1589 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 1591 // 0 1 2 3 4 5 6 7 8 9 | 1590 // 0 1 2 3 4 5 6 7 8 9 |
| 1592 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1591 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1593 | 1592 |
| 1594 "pcmpeqb %%mm6, %%mm6 \n\t" | 1593 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 1595 "pxor %%mm7, %%mm7 \n\t" | 1594 "pxor %%mm6, %%mm6 \n\t" |
| 1596 #ifdef HAVE_MMX2 | 1595 #ifdef HAVE_MMX2 |
| 1597 #define FIND_MIN_MAX(addr)\ | 1596 #define FIND_MIN_MAX(addr)\ |
| 1598 "movq " #addr ", %%mm0 \n\t"\ | 1597 "movq " #addr ", %%mm0 \n\t"\ |
| 1599 "pminub %%mm0, %%mm6 \n\t"\ | 1598 "pminub %%mm0, %%mm7 \n\t"\ |
| 1600 "pmaxub %%mm0, %%mm7 \n\t" | 1599 "pmaxub %%mm0, %%mm6 \n\t" |
| 1601 #else | 1600 #else |
| 1602 #define FIND_MIN_MAX(addr)\ | 1601 #define FIND_MIN_MAX(addr)\ |
| 1603 "movq " #addr ", %%mm0 \n\t"\ | 1602 "movq " #addr ", %%mm0 \n\t"\ |
| 1604 "movq %%mm6, %%mm1 \n\t"\ | 1603 "movq %%mm7, %%mm1 \n\t"\ |
| 1605 "psubusb %%mm0, %%mm7 \n\t"\ | 1604 "psubusb %%mm0, %%mm6 \n\t"\ |
| 1606 "paddb %%mm0, %%mm7 \n\t"\ | 1605 "paddb %%mm0, %%mm6 \n\t"\ |
| 1607 "psubusb %%mm0, %%mm1 \n\t"\ | 1606 "psubusb %%mm0, %%mm1 \n\t"\ |
| 1608 "psubb %%mm1, %%mm6 \n\t" | 1607 "psubb %%mm1, %%mm7 \n\t" |
| 1609 #endif | 1608 #endif |
| 1610 | 1609 |
| 1611 FIND_MIN_MAX((%%eax)) | 1610 FIND_MIN_MAX((%%eax)) |
| 1612 FIND_MIN_MAX((%%eax, %1)) | 1611 FIND_MIN_MAX((%%eax, %1)) |
| 1613 FIND_MIN_MAX((%%eax, %1, 2)) | 1612 FIND_MIN_MAX((%%eax, %1, 2)) |
| 1615 FIND_MIN_MAX((%%ebx)) | 1614 FIND_MIN_MAX((%%ebx)) |
| 1616 FIND_MIN_MAX((%%ebx, %1)) | 1615 FIND_MIN_MAX((%%ebx, %1)) |
| 1617 FIND_MIN_MAX((%%ebx, %1, 2)) | 1616 FIND_MIN_MAX((%%ebx, %1, 2)) |
| 1618 FIND_MIN_MAX((%0, %1, 8)) | 1617 FIND_MIN_MAX((%0, %1, 8)) |
| 1619 | 1618 |
| 1619 "movq %%mm7, %%mm4 \n\t" | |
| 1620 "psrlq $8, %%mm7 \n\t" | |
| 1621 #ifdef HAVE_MMX2 | |
| 1622 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1623 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1624 "pminub %%mm4, %%mm7 \n\t" // min of pixels | |
| 1625 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1626 "pminub %%mm4, %%mm7 \n\t" | |
| 1627 #else | |
| 1628 "movq %%mm7, %%mm1 \n\t" | |
| 1629 "psubusb %%mm4, %%mm1 \n\t" | |
| 1630 "psubb %%mm1, %%mm7 \n\t" | |
| 1631 "movq %%mm7, %%mm4 \n\t" | |
| 1632 "psrlq $16, %%mm7 \n\t" | |
| 1633 "movq %%mm7, %%mm1 \n\t" | |
| 1634 "psubusb %%mm4, %%mm1 \n\t" | |
| 1635 "psubb %%mm1, %%mm7 \n\t" | |
| 1636 "movq %%mm7, %%mm4 \n\t" | |
| 1637 "psrlq $32, %%mm7 \n\t" | |
| 1638 "movq %%mm7, %%mm1 \n\t" | |
| 1639 "psubusb %%mm4, %%mm1 \n\t" | |
| 1640 "psubb %%mm1, %%mm7 \n\t" | |
| 1641 #endif | |
| 1642 | |
| 1643 | |
| 1620 "movq %%mm6, %%mm4 \n\t" | 1644 "movq %%mm6, %%mm4 \n\t" |
| 1621 "psrlq $8, %%mm6 \n\t" | 1645 "psrlq $8, %%mm6 \n\t" |
| 1622 #ifdef HAVE_MMX2 | 1646 #ifdef HAVE_MMX2 |
| 1623 "pminub %%mm4, %%mm6 \n\t" // min of pixels | 1647 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
| 1624 "pshufw $0xF9, %%mm6, %%mm4 \n\t" | 1648 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 1625 "pminub %%mm4, %%mm6 \n\t" // min of pixels | 1649 "pmaxub %%mm4, %%mm6 \n\t" |
| 1626 "pshufw $0xFE, %%mm6, %%mm4 \n\t" | 1650 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 1627 "pminub %%mm4, %%mm6 \n\t" | 1651 "pmaxub %%mm4, %%mm6 \n\t" |
| 1628 #else | 1652 #else |
| 1629 "movq %%mm6, %%mm1 \n\t" | 1653 "psubusb %%mm4, %%mm6 \n\t" |
| 1630 "psubusb %%mm4, %%mm1 \n\t" | 1654 "paddb %%mm4, %%mm6 \n\t" |
| 1631 "psubb %%mm1, %%mm6 \n\t" | |
| 1632 "movq %%mm6, %%mm4 \n\t" | 1655 "movq %%mm6, %%mm4 \n\t" |
| 1633 "psrlq $16, %%mm6 \n\t" | 1656 "psrlq $16, %%mm6 \n\t" |
| 1634 "movq %%mm6, %%mm1 \n\t" | 1657 "psubusb %%mm4, %%mm6 \n\t" |
| 1635 "psubusb %%mm4, %%mm1 \n\t" | 1658 "paddb %%mm4, %%mm6 \n\t" |
| 1636 "psubb %%mm1, %%mm6 \n\t" | |
| 1637 "movq %%mm6, %%mm4 \n\t" | 1659 "movq %%mm6, %%mm4 \n\t" |
| 1638 "psrlq $32, %%mm6 \n\t" | 1660 "psrlq $32, %%mm6 \n\t" |
| 1639 "movq %%mm6, %%mm1 \n\t" | 1661 "psubusb %%mm4, %%mm6 \n\t" |
| 1640 "psubusb %%mm4, %%mm1 \n\t" | 1662 "paddb %%mm4, %%mm6 \n\t" |
| 1641 "psubb %%mm1, %%mm6 \n\t" | 1663 #endif |
| 1642 #endif | 1664 "movq %%mm6, %%mm0 \n\t" // max |
| 1643 | 1665 "psubb %%mm7, %%mm6 \n\t" // max - min |
| 1644 | 1666 "movd %%mm6, %%ecx \n\t" |
| 1645 "movq %%mm7, %%mm4 \n\t" | 1667 "cmpb deringThreshold, %%cl \n\t" |
| 1646 "psrlq $8, %%mm7 \n\t" | 1668 " jb 1f \n\t" |
| 1647 #ifdef HAVE_MMX2 | 1669 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
| 1648 "pmaxub %%mm4, %%mm7 \n\t" // max of pixels | |
| 1649 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | |
| 1650 "pmaxub %%mm4, %%mm7 \n\t" | |
| 1651 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | |
| 1652 "pmaxub %%mm4, %%mm7 \n\t" | |
| 1653 #else | |
| 1654 "psubusb %%mm4, %%mm7 \n\t" | |
| 1655 "paddb %%mm4, %%mm7 \n\t" | |
| 1656 "movq %%mm7, %%mm4 \n\t" | |
| 1657 "psrlq $16, %%mm7 \n\t" | |
| 1658 "psubusb %%mm4, %%mm7 \n\t" | |
| 1659 "paddb %%mm4, %%mm7 \n\t" | |
| 1660 "movq %%mm7, %%mm4 \n\t" | |
| 1661 "psrlq $32, %%mm7 \n\t" | |
| 1662 "psubusb %%mm4, %%mm7 \n\t" | |
| 1663 "paddb %%mm4, %%mm7 \n\t" | |
| 1664 #endif | |
| 1665 PAVGB(%%mm6, %%mm7) // a=(max + min)/2 | |
| 1666 "punpcklbw %%mm7, %%mm7 \n\t" | 1670 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1667 "punpcklbw %%mm7, %%mm7 \n\t" | 1671 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1668 "punpcklbw %%mm7, %%mm7 \n\t" | 1672 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1669 "movq %%mm7, temp0 \n\t" | 1673 "movq %%mm7, temp0 \n\t" |
| 1670 | 1674 |
| 1783 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1787 DERING_CORE((%%ebx),(%%ebx, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
| 1784 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) | 1788 DERING_CORE((%%ebx, %1), (%%ebx, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
| 1785 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) | 1789 DERING_CORE((%%ebx, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
| 1786 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) | 1790 DERING_CORE((%0, %1, 8),(%%ebx, %1, 4) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
| 1787 | 1791 |
| 1788 | 1792 "1: \n\t" |
| 1789 : : "r" (src), "r" (stride), "r" (QP) | 1793 : : "r" (src), "r" (stride), "r" (QP) |
| 1790 : "%eax", "%ebx" | 1794 : "%eax", "%ebx", "%ecx" |
| 1791 ); | 1795 ); |
| 1792 #else | 1796 #else |
| 1793 int y; | 1797 int y; |
| 1794 int min=255; | 1798 int min=255; |
| 1795 int max=0; | 1799 int max=0; |
| 1807 if(*p > max) max= *p; | 1811 if(*p > max) max= *p; |
| 1808 if(*p < min) min= *p; | 1812 if(*p < min) min= *p; |
| 1809 } | 1813 } |
| 1810 } | 1814 } |
| 1811 avg= (min + max + 1)/2; | 1815 avg= (min + max + 1)/2; |
| 1816 | |
| 1817 if(max - min <deringThreshold) return; | |
| 1812 | 1818 |
| 1813 for(y=0; y<10; y++) | 1819 for(y=0; y<10; y++) |
| 1814 { | 1820 { |
| 1815 int x; | 1821 int x; |
| 1816 int t = 0; | 1822 int t = 0; |
| 1840 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) | 1846 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) |
| 1841 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) | 1847 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) |
| 1842 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); | 1848 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); |
| 1843 f= (f + 8)>>4; | 1849 f= (f + 8)>>4; |
| 1844 | 1850 |
| 1851 #ifdef DEBUG_DERING_THRESHOLD | |
| 1852 asm volatile("emms\n\t":); | |
| 1853 { | |
| 1854 static long long numPixels=0; | |
| 1855 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; | |
| 1856 // if((max-min)<20 || (max-min)*QP<200) | |
| 1857 // if((max-min)*QP < 500) | |
| 1858 // if(max-min<QP/2) | |
| 1859 if(max-min < 20) | |
| 1860 { | |
| 1861 static int numSkiped=0; | |
| 1862 static int errorSum=0; | |
| 1863 static int worstQP=0; | |
| 1864 static int worstRange=0; | |
| 1865 static int worstDiff=0; | |
| 1866 int diff= (f - *p); | |
| 1867 int absDiff= ABS(diff); | |
| 1868 int error= diff*diff; | |
| 1869 | |
| 1870 if(x==1 || x==8 || y==1 || y==8) continue; | |
| 1871 | |
| 1872 numSkiped++; | |
| 1873 if(absDiff > worstDiff) | |
| 1874 { | |
| 1875 worstDiff= absDiff; | |
| 1876 worstQP= QP; | |
| 1877 worstRange= max-min; | |
| 1878 } | |
| 1879 errorSum+= error; | |
| 1880 | |
| 1881 if(1024LL*1024LL*1024LL % numSkiped == 0) | |
| 1882 { | |
| 1883 printf( "sum:%1.3f, skip:%d, wQP:%d, " | |
| 1884 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", | |
| 1885 (float)errorSum/numSkiped, numSkiped, worstQP, worstRange, | |
| 1886 worstDiff, (float)numSkiped/numPixels); | |
| 1887 } | |
| 1888 } | |
| 1889 } | |
| 1890 #endif | |
| 1845 if (*p + 2*QP < f) *p= *p + 2*QP; | 1891 if (*p + 2*QP < f) *p= *p + 2*QP; |
| 1846 else if(*p - 2*QP > f) *p= *p - 2*QP; | 1892 else if(*p - 2*QP > f) *p= *p - 2*QP; |
| 1847 else *p=f; | 1893 else *p=f; |
| 1848 } | 1894 } |
| 1849 } | 1895 } |
| 1850 } | 1896 } |
| 1851 | 1897 #ifdef DEBUG_DERING_THRESHOLD |
| 1898 if(max-min < 20) | |
| 1899 { | |
| 1900 for(y=1; y<9; y++) | |
| 1901 { | |
| 1902 int x; | |
| 1903 int t = 0; | |
| 1904 p= src + stride*y; | |
| 1905 for(x=1; x<9; x++) | |
| 1906 { | |
| 1907 p++; | |
| 1908 *p = MIN(*p + 20, 255); | |
| 1909 } | |
| 1910 } | |
| 1911 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | |
| 1912 } | |
| 1913 #endif | |
| 1852 #endif | 1914 #endif |
| 1853 } | 1915 } |
| 1854 | 1916 |
| 1855 /** | 1917 /** |
| 1856 * Deinterlaces the given block | 1918 * Deinterlaces the given block |
