comparison libpostproc/postprocess.c @ 121:3ecf2a90c65e libavcodec

more speed
author michael
date Tue, 23 Oct 2001 15:55:54 +0000
parents b0b89f5d0288
children 55f57883bbf5
comparison
equal deleted inserted replaced
120:b0b89f5d0288 121:3ecf2a90c65e
58 (the if/else stuff per block is slowing things down) 58 (the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters 59 compare the quality & speed of all filters
60 split this huge file 60 split this huge file
61 fix warnings (unused vars, ...) 61 fix warnings (unused vars, ...)
62 noise reduction filters 62 noise reduction filters
63 write an exact implementation of the horizontal deblocking filter
63 ... 64 ...
64 65
65 Notes: 66 Notes:
66 67
67 */ 68 */
1448 1449
1449 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) 1450 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP)
1450 { 1451 {
1451 #ifdef HAVE_MMX 1452 #ifdef HAVE_MMX
1452 asm volatile( 1453 asm volatile(
1453 "pushl %0 \n\t" 1454 "leal (%0, %1), %%ecx \n\t"
1455 "leal (%%ecx, %1, 4), %%ebx \n\t"
1456 // 0 1 2 3 4 5 6 7 8 9
1457 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1454 "pxor %%mm7, %%mm7 \n\t" 1458 "pxor %%mm7, %%mm7 \n\t"
1455 "movq bm00001000, %%mm6 \n\t" 1459 "movq bm00001000, %%mm6 \n\t"
1456 "movd %2, %%mm5 \n\t" // QP 1460 "movd %2, %%mm5 \n\t" // QP
1457 "movq %%mm5, %%mm4 \n\t" 1461 "movq %%mm5, %%mm4 \n\t"
1458 "paddusb %%mm5, %%mm5 \n\t" // 2QP 1462 "paddusb %%mm5, %%mm5 \n\t" // 2QP
1462 "psubb %%mm4, %%mm5 \n\t" // -QP 1466 "psubb %%mm4, %%mm5 \n\t" // -QP
1463 "leal tempBlock, %%eax \n\t" 1467 "leal tempBlock, %%eax \n\t"
1464 1468
1465 //FIXME? "unroll by 2" and mix 1469 //FIXME? "unroll by 2" and mix
1466 #ifdef HAVE_MMX2 1470 #ifdef HAVE_MMX2
1467 #define HDF(i) \ 1471 #define HDF(src, dst) \
1468 "movq " #i "(%%eax), %%mm0 \n\t"\ 1472 "movq " #src "(%%eax), %%mm0 \n\t"\
1469 "movq %%mm0, %%mm1 \n\t"\ 1473 "movq " #src "(%%eax), %%mm1 \n\t"\
1470 "movq %%mm0, %%mm2 \n\t"\ 1474 "movq " #src "(%%eax), %%mm2 \n\t"\
1471 "psrlq $8, %%mm1 \n\t"\ 1475 "psrlq $8, %%mm1 \n\t"\
1472 "psubusb %%mm1, %%mm2 \n\t"\ 1476 "psubusb %%mm1, %%mm2 \n\t"\
1473 "psubusb %%mm0, %%mm1 \n\t"\ 1477 "psubusb %%mm0, %%mm1 \n\t"\
1474 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ 1478 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\
1475 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ 1479 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\
1484 "psubb %%mm2, %%mm1 \n\t"\ 1488 "psubb %%mm2, %%mm1 \n\t"\
1485 "pand %%mm6, %%mm1 \n\t"\ 1489 "pand %%mm6, %%mm1 \n\t"\
1486 "psubb %%mm1, %%mm0 \n\t"\ 1490 "psubb %%mm1, %%mm0 \n\t"\
1487 "psllq $8, %%mm1 \n\t"\ 1491 "psllq $8, %%mm1 \n\t"\
1488 "paddb %%mm1, %%mm0 \n\t"\ 1492 "paddb %%mm1, %%mm0 \n\t"\
1489 "movd %%mm0, (%0) \n\t"\ 1493 "movd %%mm0, " #dst" \n\t"\
1490 "psrlq $32, %%mm0 \n\t"\ 1494 "psrlq $32, %%mm0 \n\t"\
1491 "movd %%mm0, 4(%0) \n\t" 1495 "movd %%mm0, 4" #dst" \n\t"
1492 #else 1496 #else
1493 #define HDF(i)\ 1497 #define HDF(src, dst)\
1494 "movq " #i "(%%eax), %%mm0 \n\t"\ 1498 "movq " #src "(%%eax), %%mm0 \n\t"\
1495 "movq %%mm0, %%mm1 \n\t"\ 1499 "movq %%mm0, %%mm1 \n\t"\
1496 "movq %%mm0, %%mm2 \n\t"\ 1500 "movq %%mm0, %%mm2 \n\t"\
1497 "psrlq $8, %%mm1 \n\t"\ 1501 "psrlq $8, %%mm1 \n\t"\
1498 "psubusb %%mm1, %%mm2 \n\t"\ 1502 "psubusb %%mm1, %%mm2 \n\t"\
1499 "psubusb %%mm0, %%mm1 \n\t"\ 1503 "psubusb %%mm0, %%mm1 \n\t"\
1513 "psubb %%mm2, %%mm1 \n\t"\ 1517 "psubb %%mm2, %%mm1 \n\t"\
1514 "pand %%mm6, %%mm1 \n\t"\ 1518 "pand %%mm6, %%mm1 \n\t"\
1515 "psubb %%mm1, %%mm0 \n\t"\ 1519 "psubb %%mm1, %%mm0 \n\t"\
1516 "psllq $8, %%mm1 \n\t"\ 1520 "psllq $8, %%mm1 \n\t"\
1517 "paddb %%mm1, %%mm0 \n\t"\ 1521 "paddb %%mm1, %%mm0 \n\t"\
1518 "movd %%mm0, (%0) \n\t"\ 1522 "movd %%mm0, " #dst " \n\t"\
1519 "psrlq $32, %%mm0 \n\t"\ 1523 "psrlq $32, %%mm0 \n\t"\
1520 "movd %%mm0, 4(%0) \n\t" 1524 "movd %%mm0, 4" #dst " \n\t"
1521 #endif 1525 #endif
1522 HDF(0) 1526 HDF(0,(%0))
1523 "addl %1, %0 \n\t" 1527 HDF(8,(%%ecx))
1524 HDF(8) 1528 HDF(16,(%%ecx, %1))
1525 "addl %1, %0 \n\t" 1529 HDF(24,(%%ecx, %1, 2))
1526 HDF(16) 1530 HDF(32,(%0, %1, 4))
1527 "addl %1, %0 \n\t" 1531 HDF(40,(%%ebx))
1528 HDF(24) 1532 HDF(48,(%%ebx, %1))
1529 "addl %1, %0 \n\t" 1533 HDF(56,(%%ebx, %1, 2))
1530 HDF(32)
1531 "addl %1, %0 \n\t"
1532 HDF(40)
1533 "addl %1, %0 \n\t"
1534 HDF(48)
1535 "addl %1, %0 \n\t"
1536 HDF(56)
1537 "popl %0 \n\t"
1538 : 1534 :
1539 : "r" (dst), "r" (stride), "r" (QP) 1535 : "r" (dst), "r" (stride), "r" (QP)
1540 : "%eax" 1536 : "%eax", "%ebx", "%ecx"
1541 ); 1537 );
1542 #else 1538 #else
1543 uint8_t *src= tempBlock; 1539 uint8_t *src= tempBlock;
1544 1540
1545 int y; 1541 int y;
1595 */ 1591 */
1596 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) 1592 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP)
1597 { 1593 {
1598 //return; 1594 //return;
1599 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1595 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1600 asm volatile( //"movv %0 %1 %2\n\t" 1596 asm volatile(
1601 "pushl %0\n\t" 1597 "leal (%0, %1), %%ecx \n\t"
1598 "leal (%%ecx, %1, 4), %%ebx \n\t"
1599 // 0 1 2 3 4 5 6 7 8 9
1600 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1602 "pxor %%mm7, %%mm7 \n\t" 1601 "pxor %%mm7, %%mm7 \n\t"
1603 "leal tempBlock, %%eax \n\t" 1602 "leal tempBlock, %%eax \n\t"
1604 /* 1603 /*
1605 #define HLP1 "movq (%0), %%mm0 \n\t"\ 1604 #define HLP1 "movq (%0), %%mm0 \n\t"\
1606 "movq %%mm0, %%mm1 \n\t"\ 1605 "movq %%mm0, %%mm1 \n\t"\
1712 "psrlq $32, %%mm0 \n\t"\ 1711 "psrlq $32, %%mm0 \n\t"\
1713 "movd %%mm0, 4(%0) \n\t" 1712 "movd %%mm0, 4(%0) \n\t"
1714 #endif 1713 #endif
1715 1714
1716 /* uses the 7-Tap Filter: 1112111 */ 1715 /* uses the 7-Tap Filter: 1112111 */
1717 #define NEW_HLP(i)\ 1716 #define NEW_HLP(src, dst)\
1718 "movq " #i "(%%eax), %%mm0 \n\t"\ 1717 "movq " #src "(%%eax), %%mm1 \n\t"\
1719 "movq %%mm0, %%mm1 \n\t"\ 1718 "movq " #src "(%%eax), %%mm2 \n\t"\
1720 "movq %%mm0, %%mm2 \n\t"\
1721 "movd -4(%0), %%mm3 \n\t" /*0001000*/\
1722 "movd 8(%0), %%mm4 \n\t" /*0001000*/\
1723 "psllq $8, %%mm1 \n\t"\ 1719 "psllq $8, %%mm1 \n\t"\
1724 "psrlq $8, %%mm2 \n\t"\ 1720 "psrlq $8, %%mm2 \n\t"\
1721 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\
1722 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\
1725 "psrlq $24, %%mm3 \n\t"\ 1723 "psrlq $24, %%mm3 \n\t"\
1726 "psllq $56, %%mm4 \n\t"\ 1724 "psllq $56, %%mm4 \n\t"\
1727 "por %%mm3, %%mm1 \n\t"\ 1725 "por %%mm3, %%mm1 \n\t"\
1728 "por %%mm4, %%mm2 \n\t"\ 1726 "por %%mm4, %%mm2 \n\t"\
1729 "movq %%mm1, %%mm5 \n\t"\ 1727 "movq %%mm1, %%mm5 \n\t"\
1730 PAVGB(%%mm2, %%mm1)\ 1728 PAVGB(%%mm2, %%mm1)\
1729 "movq " #src "(%%eax), %%mm0 \n\t"\
1731 PAVGB(%%mm1, %%mm0)\ 1730 PAVGB(%%mm1, %%mm0)\
1732 "psllq $8, %%mm5 \n\t"\ 1731 "psllq $8, %%mm5 \n\t"\
1733 "psrlq $8, %%mm2 \n\t"\ 1732 "psrlq $8, %%mm2 \n\t"\
1734 "por %%mm3, %%mm5 \n\t"\ 1733 "por %%mm3, %%mm5 \n\t"\
1735 "por %%mm4, %%mm2 \n\t"\ 1734 "por %%mm4, %%mm2 \n\t"\
1740 "por %%mm3, %%mm1 \n\t"\ 1739 "por %%mm3, %%mm1 \n\t"\
1741 "por %%mm4, %%mm2 \n\t"\ 1740 "por %%mm4, %%mm2 \n\t"\
1742 PAVGB(%%mm2, %%mm1)\ 1741 PAVGB(%%mm2, %%mm1)\
1743 PAVGB(%%mm1, %%mm5)\ 1742 PAVGB(%%mm1, %%mm5)\
1744 PAVGB(%%mm5, %%mm0)\ 1743 PAVGB(%%mm5, %%mm0)\
1745 "movd %%mm0, (%0) \n\t"\ 1744 "movd %%mm0, " #dst " \n\t"\
1746 "psrlq $32, %%mm0 \n\t"\ 1745 "psrlq $32, %%mm0 \n\t"\
1747 "movd %%mm0, 4(%0) \n\t" 1746 "movd %%mm0, 4" #dst " \n\t"
1748 1747
1749 /* uses the 9-Tap Filter: 112242211 */ 1748 /* uses the 9-Tap Filter: 112242211 */
1750 #define NEW_HLP2(i)\ 1749 #define NEW_HLP2(i)\
1751 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ 1750 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\
1752 "movq %%mm0, %%mm1 \n\t" /*0001000*/\ 1751 "movq %%mm0, %%mm1 \n\t" /*0001000*/\
1784 PAVGB(%%mm5, %%mm0) /*112242211*/\ 1783 PAVGB(%%mm5, %%mm0) /*112242211*/\
1785 "movd %%mm0, (%0) \n\t"\ 1784 "movd %%mm0, (%0) \n\t"\
1786 "psrlq $32, %%mm0 \n\t"\ 1785 "psrlq $32, %%mm0 \n\t"\
1787 "movd %%mm0, 4(%0) \n\t" 1786 "movd %%mm0, 4(%0) \n\t"
1788 1787
1789 #define HLP(i) NEW_HLP(i) 1788 #define HLP(src, dst) NEW_HLP(src, dst)
1790 1789
1791 HLP(0) 1790 HLP(0, (%0))
1792 "addl %1, %0 \n\t" 1791 HLP(8, (%%ecx))
1793 HLP(8) 1792 HLP(16, (%%ecx, %1))
1794 "addl %1, %0 \n\t" 1793 HLP(24, (%%ecx, %1, 2))
1795 HLP(16) 1794 HLP(32, (%0, %1, 4))
1796 "addl %1, %0 \n\t" 1795 HLP(40, (%%ebx))
1797 HLP(24) 1796 HLP(48, (%%ebx, %1))
1798 "addl %1, %0 \n\t" 1797 HLP(56, (%%ebx, %1, 2))
1799 HLP(32) 1798
1800 "addl %1, %0 \n\t"
1801 HLP(40)
1802 "addl %1, %0 \n\t"
1803 HLP(48)
1804 "addl %1, %0 \n\t"
1805 HLP(56)
1806
1807 "popl %0\n\t"
1808 : 1799 :
1809 : "r" (dst), "r" (stride) 1800 : "r" (dst), "r" (stride)
1810 : "%eax", "%ebx" 1801 : "%eax", "%ebx", "%ecx"
1811 ); 1802 );
1812 1803
1813 #else 1804 #else
1814 uint8_t *temp= tempBlock; 1805 uint8_t *temp= tempBlock;
1815 int y; 1806 int y;
2741 // finish 1 block before the next otherwise we might have a problem 2732 // finish 1 block before the next otherwise we might have a problem
2742 // with the L1 Cache of the P4 ... or only a few blocks at a time or something 2733 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2743 for(x=0; x<width; x+=BLOCK_SIZE) 2734 for(x=0; x<width; x+=BLOCK_SIZE)
2744 { 2735 {
2745 const int stride= dstStride; 2736 const int stride= dstStride;
2746 int QP= isColor ? 2737 int QP;
2747 QPs[(y>>3)*QPStride + (x>>3)]: 2738 if(isColor)
2748 QPs[(y>>4)*QPStride + (x>>4)]; 2739 {
2749 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; 2740 QP=QPs[(y>>3)*QPStride + (x>>3)];
2741 }
2742 else
2743 {
2744 QP= QPs[(y>>4)*QPStride + (x>>4)];
2745 if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8;
2746 yHistogram[ srcBlock[srcStride*5] ]++;
2747 }
2750 #ifdef HAVE_MMX 2748 #ifdef HAVE_MMX
2751 asm volatile( 2749 asm volatile(
2752 "movd %0, %%mm7 \n\t" 2750 "movd %0, %%mm7 \n\t"
2753 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 2751 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
2754 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 2752 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
2773 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 2771 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32);
2774 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 2772 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32);
2775 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 2773 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32);
2776 */ 2774 */
2777 #endif 2775 #endif
2778
2779 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++;
2780 2776
2781 #ifdef PP_FUNNY_STRIDE 2777 #ifdef PP_FUNNY_STRIDE
2782 //can we mess with a 8x16 block, if not use a temp buffer, yes again 2778 //can we mess with a 8x16 block, if not use a temp buffer, yes again
2783 if(x+7 >= width) 2779 if(x+7 >= width)
2784 { 2780 {