Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 121:3ecf2a90c65e libavcodec
more speed
| author | michael |
|---|---|
| date | Tue, 23 Oct 2001 15:55:54 +0000 |
| parents | b0b89f5d0288 |
| children | 55f57883bbf5 |
comparison
equal
deleted
inserted
replaced
| 120:b0b89f5d0288 | 121:3ecf2a90c65e |
|---|---|
| 58 (the if/else stuff per block is slowing things down) | 58 (the if/else stuff per block is slowing things down) |
| 59 compare the quality & speed of all filters | 59 compare the quality & speed of all filters |
| 60 split this huge file | 60 split this huge file |
| 61 fix warnings (unused vars, ...) | 61 fix warnings (unused vars, ...) |
| 62 noise reduction filters | 62 noise reduction filters |
| 63 write an exact implementation of the horizontal deblocking filter | |
| 63 ... | 64 ... |
| 64 | 65 |
| 65 Notes: | 66 Notes: |
| 66 | 67 |
| 67 */ | 68 */ |
| 1448 | 1449 |
| 1449 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | 1450 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) |
| 1450 { | 1451 { |
| 1451 #ifdef HAVE_MMX | 1452 #ifdef HAVE_MMX |
| 1452 asm volatile( | 1453 asm volatile( |
| 1453 "pushl %0 \n\t" | 1454 "leal (%0, %1), %%ecx \n\t" |
| | 1455 "leal (%%ecx, %1, 4), %%ebx \n\t" |
| | 1456 // 0 1 2 3 4 5 6 7 8 9 |
| | 1457 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1454 "pxor %%mm7, %%mm7 \n\t" | 1458 "pxor %%mm7, %%mm7 \n\t" |
| 1455 "movq bm00001000, %%mm6 \n\t" | 1459 "movq bm00001000, %%mm6 \n\t" |
| 1456 "movd %2, %%mm5 \n\t" // QP | 1460 "movd %2, %%mm5 \n\t" // QP |
| 1457 "movq %%mm5, %%mm4 \n\t" | 1461 "movq %%mm5, %%mm4 \n\t" |
| 1458 "paddusb %%mm5, %%mm5 \n\t" // 2QP | 1462 "paddusb %%mm5, %%mm5 \n\t" // 2QP |
| 1462 "psubb %%mm4, %%mm5 \n\t" // -QP | 1466 "psubb %%mm4, %%mm5 \n\t" // -QP |
| 1463 "leal tempBlock, %%eax \n\t" | 1467 "leal tempBlock, %%eax \n\t" |
| 1464 | 1468 |
| 1465 //FIXME? "unroll by 2" and mix | 1469 //FIXME? "unroll by 2" and mix |
| 1466 #ifdef HAVE_MMX2 | 1470 #ifdef HAVE_MMX2 |
| 1467 #define HDF(i) \ | 1471 #define HDF(src, dst) \ |
| 1468 "movq " #i "(%%eax), %%mm0 \n\t"\ | 1472 "movq " #src "(%%eax), %%mm0 \n\t"\ |
| 1469 "movq %%mm0, %%mm1 \n\t"\ | 1473 "movq " #src "(%%eax), %%mm1 \n\t"\ |
| 1470 "movq %%mm0, %%mm2 \n\t"\ | 1474 "movq " #src "(%%eax), %%mm2 \n\t"\ |
| 1471 "psrlq $8, %%mm1 \n\t"\ | 1475 "psrlq $8, %%mm1 \n\t"\ |
| 1472 "psubusb %%mm1, %%mm2 \n\t"\ | 1476 "psubusb %%mm1, %%mm2 \n\t"\ |
| 1473 "psubusb %%mm0, %%mm1 \n\t"\ | 1477 "psubusb %%mm0, %%mm1 \n\t"\ |
| 1474 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ | 1478 "por %%mm2, %%mm1 \n\t" /* p´x = |px - p(x+1)| */\ |
| 1475 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ | 1479 "pcmpeqb %%mm7, %%mm2 \n\t" /* p´x = sgn[px - p(x+1)] */\ |
| 1484 "psubb %%mm2, %%mm1 \n\t"\ | 1488 "psubb %%mm2, %%mm1 \n\t"\ |
| 1485 "pand %%mm6, %%mm1 \n\t"\ | 1489 "pand %%mm6, %%mm1 \n\t"\ |
| 1486 "psubb %%mm1, %%mm0 \n\t"\ | 1490 "psubb %%mm1, %%mm0 \n\t"\ |
| 1487 "psllq $8, %%mm1 \n\t"\ | 1491 "psllq $8, %%mm1 \n\t"\ |
| 1488 "paddb %%mm1, %%mm0 \n\t"\ | 1492 "paddb %%mm1, %%mm0 \n\t"\ |
| 1489 "movd %%mm0, (%0) \n\t"\ | 1493 "movd %%mm0, " #dst" \n\t"\ |
| 1490 "psrlq $32, %%mm0 \n\t"\ | 1494 "psrlq $32, %%mm0 \n\t"\ |
| 1491 "movd %%mm0, 4(%0) \n\t" | 1495 "movd %%mm0, 4" #dst" \n\t" |
| 1492 #else | 1496 #else |
| 1493 #define HDF(i)\ | 1497 #define HDF(src, dst)\ |
| 1494 "movq " #i "(%%eax), %%mm0 \n\t"\ | 1498 "movq " #src "(%%eax), %%mm0 \n\t"\ |
| 1495 "movq %%mm0, %%mm1 \n\t"\ | 1499 "movq %%mm0, %%mm1 \n\t"\ |
| 1496 "movq %%mm0, %%mm2 \n\t"\ | 1500 "movq %%mm0, %%mm2 \n\t"\ |
| 1497 "psrlq $8, %%mm1 \n\t"\ | 1501 "psrlq $8, %%mm1 \n\t"\ |
| 1498 "psubusb %%mm1, %%mm2 \n\t"\ | 1502 "psubusb %%mm1, %%mm2 \n\t"\ |
| 1499 "psubusb %%mm0, %%mm1 \n\t"\ | 1503 "psubusb %%mm0, %%mm1 \n\t"\ |
| 1513 "psubb %%mm2, %%mm1 \n\t"\ | 1517 "psubb %%mm2, %%mm1 \n\t"\ |
| 1514 "pand %%mm6, %%mm1 \n\t"\ | 1518 "pand %%mm6, %%mm1 \n\t"\ |
| 1515 "psubb %%mm1, %%mm0 \n\t"\ | 1519 "psubb %%mm1, %%mm0 \n\t"\ |
| 1516 "psllq $8, %%mm1 \n\t"\ | 1520 "psllq $8, %%mm1 \n\t"\ |
| 1517 "paddb %%mm1, %%mm0 \n\t"\ | 1521 "paddb %%mm1, %%mm0 \n\t"\ |
| 1518 "movd %%mm0, (%0) \n\t"\ | 1522 "movd %%mm0, " #dst " \n\t"\ |
| 1519 "psrlq $32, %%mm0 \n\t"\ | 1523 "psrlq $32, %%mm0 \n\t"\ |
| 1520 "movd %%mm0, 4(%0) \n\t" | 1524 "movd %%mm0, 4" #dst " \n\t" |
| 1521 #endif | 1525 #endif |
| 1522 HDF(0) | 1526 HDF(0,(%0)) |
| 1523 "addl %1, %0 \n\t" | 1527 HDF(8,(%%ecx)) |
| 1524 HDF(8) | 1528 HDF(16,(%%ecx, %1)) |
| 1525 "addl %1, %0 \n\t" | 1529 HDF(24,(%%ecx, %1, 2)) |
| 1526 HDF(16) | 1530 HDF(32,(%0, %1, 4)) |
| 1527 "addl %1, %0 \n\t" | 1531 HDF(40,(%%ebx)) |
| 1528 HDF(24) | 1532 HDF(48,(%%ebx, %1)) |
| 1529 "addl %1, %0 \n\t" | 1533 HDF(56,(%%ebx, %1, 2)) |
| 1530 HDF(32) | |
| 1531 "addl %1, %0 \n\t" | |
| 1532 HDF(40) | |
| 1533 "addl %1, %0 \n\t" | |
| 1534 HDF(48) | |
| 1535 "addl %1, %0 \n\t" | |
| 1536 HDF(56) | |
| 1537 "popl %0 \n\t" | |
| 1538 : | 1534 : |
| 1539 : "r" (dst), "r" (stride), "r" (QP) | 1535 : "r" (dst), "r" (stride), "r" (QP) |
| 1540 : "%eax" | 1536 : "%eax", "%ebx", "%ecx" |
| 1541 ); | 1537 ); |
| 1542 #else | 1538 #else |
| 1543 uint8_t *src= tempBlock; | 1539 uint8_t *src= tempBlock; |
| 1544 | 1540 |
| 1545 int y; | 1541 int y; |
| 1595 */ | 1591 */ |
| 1596 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | 1592 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) |
| 1597 { | 1593 { |
| 1598 //return; | 1594 //return; |
| 1599 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1595 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1600 asm volatile( //"movv %0 %1 %2\n\t" | 1596 asm volatile( |
| 1601 "pushl %0\n\t" | 1597 "leal (%0, %1), %%ecx \n\t" |
| | 1598 "leal (%%ecx, %1, 4), %%ebx \n\t" |
| | 1599 // 0 1 2 3 4 5 6 7 8 9 |
| | 1600 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1602 "pxor %%mm7, %%mm7 \n\t" | 1601 "pxor %%mm7, %%mm7 \n\t" |
| 1603 "leal tempBlock, %%eax \n\t" | 1602 "leal tempBlock, %%eax \n\t" |
| 1604 /* | 1603 /* |
| 1605 #define HLP1 "movq (%0), %%mm0 \n\t"\ | 1604 #define HLP1 "movq (%0), %%mm0 \n\t"\ |
| 1606 "movq %%mm0, %%mm1 \n\t"\ | 1605 "movq %%mm0, %%mm1 \n\t"\ |
| 1712 "psrlq $32, %%mm0 \n\t"\ | 1711 "psrlq $32, %%mm0 \n\t"\ |
| 1713 "movd %%mm0, 4(%0) \n\t" | 1712 "movd %%mm0, 4(%0) \n\t" |
| 1714 #endif | 1713 #endif |
| 1715 | 1714 |
| 1716 /* uses the 7-Tap Filter: 1112111 */ | 1715 /* uses the 7-Tap Filter: 1112111 */ |
| 1717 #define NEW_HLP(i)\ | 1716 #define NEW_HLP(src, dst)\ |
| 1718 "movq " #i "(%%eax), %%mm0 \n\t"\ | 1717 "movq " #src "(%%eax), %%mm1 \n\t"\ |
| 1719 "movq %%mm0, %%mm1 \n\t"\ | 1718 "movq " #src "(%%eax), %%mm2 \n\t"\ |
| 1720 "movq %%mm0, %%mm2 \n\t"\ | |
| 1721 "movd -4(%0), %%mm3 \n\t" /*0001000*/\ | |
| 1722 "movd 8(%0), %%mm4 \n\t" /*0001000*/\ | |
| 1723 "psllq $8, %%mm1 \n\t"\ | 1719 "psllq $8, %%mm1 \n\t"\ |
| 1724 "psrlq $8, %%mm2 \n\t"\ | 1720 "psrlq $8, %%mm2 \n\t"\ |
| | 1721 "movd -4" #dst ", %%mm3 \n\t" /*0001000*/\ |
| | 1722 "movd 8" #dst ", %%mm4 \n\t" /*0001000*/\ |
| 1725 "psrlq $24, %%mm3 \n\t"\ | 1723 "psrlq $24, %%mm3 \n\t"\ |
| 1726 "psllq $56, %%mm4 \n\t"\ | 1724 "psllq $56, %%mm4 \n\t"\ |
| 1727 "por %%mm3, %%mm1 \n\t"\ | 1725 "por %%mm3, %%mm1 \n\t"\ |
| 1728 "por %%mm4, %%mm2 \n\t"\ | 1726 "por %%mm4, %%mm2 \n\t"\ |
| 1729 "movq %%mm1, %%mm5 \n\t"\ | 1727 "movq %%mm1, %%mm5 \n\t"\ |
| 1730 PAVGB(%%mm2, %%mm1)\ | 1728 PAVGB(%%mm2, %%mm1)\ |
| | 1729 "movq " #src "(%%eax), %%mm0 \n\t"\ |
| 1731 PAVGB(%%mm1, %%mm0)\ | 1730 PAVGB(%%mm1, %%mm0)\ |
| 1732 "psllq $8, %%mm5 \n\t"\ | 1731 "psllq $8, %%mm5 \n\t"\ |
| 1733 "psrlq $8, %%mm2 \n\t"\ | 1732 "psrlq $8, %%mm2 \n\t"\ |
| 1734 "por %%mm3, %%mm5 \n\t"\ | 1733 "por %%mm3, %%mm5 \n\t"\ |
| 1735 "por %%mm4, %%mm2 \n\t"\ | 1734 "por %%mm4, %%mm2 \n\t"\ |
| 1740 "por %%mm3, %%mm1 \n\t"\ | 1739 "por %%mm3, %%mm1 \n\t"\ |
| 1741 "por %%mm4, %%mm2 \n\t"\ | 1740 "por %%mm4, %%mm2 \n\t"\ |
| 1742 PAVGB(%%mm2, %%mm1)\ | 1741 PAVGB(%%mm2, %%mm1)\ |
| 1743 PAVGB(%%mm1, %%mm5)\ | 1742 PAVGB(%%mm1, %%mm5)\ |
| 1744 PAVGB(%%mm5, %%mm0)\ | 1743 PAVGB(%%mm5, %%mm0)\ |
| 1745 "movd %%mm0, (%0) \n\t"\ | 1744 "movd %%mm0, " #dst " \n\t"\ |
| 1746 "psrlq $32, %%mm0 \n\t"\ | 1745 "psrlq $32, %%mm0 \n\t"\ |
| 1747 "movd %%mm0, 4(%0) \n\t" | 1746 "movd %%mm0, 4" #dst " \n\t" |
| 1748 | 1747 |
| 1749 /* uses the 9-Tap Filter: 112242211 */ | 1748 /* uses the 9-Tap Filter: 112242211 */ |
| 1750 #define NEW_HLP2(i)\ | 1749 #define NEW_HLP2(i)\ |
| 1751 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ | 1750 "movq " #i "(%%eax), %%mm0 \n\t" /*0001000*/\ |
| 1752 "movq %%mm0, %%mm1 \n\t" /*0001000*/\ | 1751 "movq %%mm0, %%mm1 \n\t" /*0001000*/\ |
| 1784 PAVGB(%%mm5, %%mm0) /*112242211*/\ | 1783 PAVGB(%%mm5, %%mm0) /*112242211*/\ |
| 1785 "movd %%mm0, (%0) \n\t"\ | 1784 "movd %%mm0, (%0) \n\t"\ |
| 1786 "psrlq $32, %%mm0 \n\t"\ | 1785 "psrlq $32, %%mm0 \n\t"\ |
| 1787 "movd %%mm0, 4(%0) \n\t" | 1786 "movd %%mm0, 4(%0) \n\t" |
| 1788 | 1787 |
| 1789 #define HLP(i) NEW_HLP(i) | 1788 #define HLP(src, dst) NEW_HLP(src, dst) |
| 1790 | 1789 |
| 1791 HLP(0) | 1790 HLP(0, (%0)) |
| 1792 "addl %1, %0 \n\t" | 1791 HLP(8, (%%ecx)) |
| 1793 HLP(8) | 1792 HLP(16, (%%ecx, %1)) |
| 1794 "addl %1, %0 \n\t" | 1793 HLP(24, (%%ecx, %1, 2)) |
| 1795 HLP(16) | 1794 HLP(32, (%0, %1, 4)) |
| 1796 "addl %1, %0 \n\t" | 1795 HLP(40, (%%ebx)) |
| 1797 HLP(24) | 1796 HLP(48, (%%ebx, %1)) |
| 1798 "addl %1, %0 \n\t" | 1797 HLP(56, (%%ebx, %1, 2)) |
| 1799 HLP(32) | 1798 |
| 1800 "addl %1, %0 \n\t" | |
| 1801 HLP(40) | |
| 1802 "addl %1, %0 \n\t" | |
| 1803 HLP(48) | |
| 1804 "addl %1, %0 \n\t" | |
| 1805 HLP(56) | |
| 1806 | |
| 1807 "popl %0\n\t" | |
| 1808 : | 1799 : |
| 1809 : "r" (dst), "r" (stride) | 1800 : "r" (dst), "r" (stride) |
| 1810 : "%eax", "%ebx" | 1801 : "%eax", "%ebx", "%ecx" |
| 1811 ); | 1802 ); |
| 1812 | 1803 |
| 1813 #else | 1804 #else |
| 1814 uint8_t *temp= tempBlock; | 1805 uint8_t *temp= tempBlock; |
| 1815 int y; | 1806 int y; |
| 2741 // finish 1 block before the next otherwise we might have a problem | 2732 // finish 1 block before the next otherwise we might have a problem |
| 2742 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | 2733 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
| 2743 for(x=0; x<width; x+=BLOCK_SIZE) | 2734 for(x=0; x<width; x+=BLOCK_SIZE) |
| 2744 { | 2735 { |
| 2745 const int stride= dstStride; | 2736 const int stride= dstStride; |
| 2746 int QP= isColor ? | 2737 int QP; |
| 2747 QPs[(y>>3)*QPStride + (x>>3)]: | 2738 if(isColor) |
| 2748 QPs[(y>>4)*QPStride + (x>>4)]; | 2739 { |
| 2749 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; | 2740 QP=QPs[(y>>3)*QPStride + (x>>3)]; |
| | 2741 } |
| | 2742 else |
| | 2743 { |
| | 2744 QP= QPs[(y>>4)*QPStride + (x>>4)]; |
| | 2745 if(mode & LEVEL_FIX) QP= (QP* (packedYScale &0xFFFF))>>8; |
| | 2746 yHistogram[ srcBlock[srcStride*5] ]++; |
| | 2747 } |
| 2750 #ifdef HAVE_MMX | 2748 #ifdef HAVE_MMX |
| 2751 asm volatile( | 2749 asm volatile( |
| 2752 "movd %0, %%mm7 \n\t" | 2750 "movd %0, %%mm7 \n\t" |
| 2753 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2751 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 2754 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 2752 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
| 2773 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 2771 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
| 2774 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 2772 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 2775 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 2773 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 2776 */ | 2774 */ |
| 2777 #endif | 2775 #endif |
| 2778 | |
| 2779 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++; | |
| 2780 | 2776 |
| 2781 #ifdef PP_FUNNY_STRIDE | 2777 #ifdef PP_FUNNY_STRIDE |
| 2782 //can we mess with a 8x16 block, if not use a temp buffer, yes again | 2778 //can we mess with a 8x16 block, if not use a temp buffer, yes again |
| 2783 if(x+7 >= width) | 2779 if(x+7 >= width) |
| 2784 { | 2780 { |
