Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 128:e5266b8e79be libavcodec
much better horizontal filters (transpose & use the vertical ones) :)
bugfix
bugs?
| author | michael |
|---|---|
| date | Wed, 24 Oct 2001 16:39:40 +0000 |
| parents | 55f57883bbf5 |
| children | be35346e27c1 |
comparison
equal
deleted
inserted
replaced
| 127:2fe8f116576c | 128:e5266b8e79be |
|---|---|
| 21 isVertDC Ec Ec | 21 isVertDC Ec Ec |
| 22 isVertMinMaxOk Ec Ec | 22 isVertMinMaxOk Ec Ec |
| 23 doVertLowPass E e e | 23 doVertLowPass E e e |
| 24 doVertDefFilter Ec Ec Ec | 24 doVertDefFilter Ec Ec Ec |
| 25 isHorizDC Ec Ec | 25 isHorizDC Ec Ec |
| 26 isHorizMinMaxOk a | 26 isHorizMinMaxOk a E |
| 27 doHorizLowPass E a a | 27 doHorizLowPass E e e |
| 28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E E E |
| 29 deRing | 29 deRing |
| 30 Vertical RKAlgo1 E a a | 30 Vertical RKAlgo1 E a a |
| 31 Vertical X1 a E E | 31 Vertical X1 a E E |
| 32 Horizontal X1 a E E | 32 Horizontal X1 a E E |
| 33 LinIpolDeinterlace e E E* | 33 LinIpolDeinterlace e E E* |
| 58 (the if/else stuff per block is slowing things down) | 58 (the if/else stuff per block is slowing things down) |
| 59 compare the quality & speed of all filters | 59 compare the quality & speed of all filters |
| 60 split this huge file | 60 split this huge file |
| 61 fix warnings (unused vars, ...) | 61 fix warnings (unused vars, ...) |
| 62 noise reduction filters | 62 noise reduction filters |
| 63 write an exact implementation of the horizontal deblocking filter | |
| 64 ... | 63 ... |
| 65 | 64 |
| 66 Notes: | 65 Notes: |
| 67 | 66 |
| 68 */ | 67 */ |
| 126 static uint64_t temp2=0; | 125 static uint64_t temp2=0; |
| 127 static uint64_t temp3=0; | 126 static uint64_t temp3=0; |
| 128 static uint64_t temp4=0; | 127 static uint64_t temp4=0; |
| 129 static uint64_t temp5=0; | 128 static uint64_t temp5=0; |
| 130 static uint64_t pQPb=0; | 129 static uint64_t pQPb=0; |
| 131 static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data | 130 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code |
| 132 | 131 |
| 133 int hFlatnessThreshold= 56 - 16; | 132 int hFlatnessThreshold= 56 - 16; |
| 134 int vFlatnessThreshold= 56 - 16; | 133 int vFlatnessThreshold= 56 - 16; |
| 135 | 134 |
| 136 //amount of "black" you are willing to lose to get a brightness corrected picture | 135 //amount of "black" you are willing to lose to get a brightness corrected picture |
| 275 "psrlq $32, %%mm0 \n\t" | 274 "psrlq $32, %%mm0 \n\t" |
| 276 "paddb %%mm1, %%mm0 \n\t" | 275 "paddb %%mm1, %%mm0 \n\t" |
| 277 "movd %%mm0, %0 \n\t" | 276 "movd %%mm0, %0 \n\t" |
| 278 : "=r" (numEq) | 277 : "=r" (numEq) |
| 279 : "r" (src), "r" (stride) | 278 : "r" (src), "r" (stride) |
| 279 : "%eax", "%ebx" | |
| 280 ); | 280 ); |
| 281 | 281 |
| 282 numEq= (256 - numEq) &0xFF; | 282 numEq= (256 - numEq) &0xFF; |
| 283 | 283 |
| 284 #else | 284 #else |
| 848 (D<<24) | (C<<16) | (B<<8) | (A); | 848 (D<<24) | (C<<16) | (B<<8) | (A); |
| 849 //lut[i] = (v<<32) | (v<<24); | 849 //lut[i] = (v<<32) | (v<<24); |
| 850 } | 850 } |
| 851 } | 851 } |
| 852 | 852 |
| 853 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 853 #if 0 |
| 854 asm volatile( | 854 asm volatile( |
| 855 "pxor %%mm7, %%mm7 \n\t" // 0 | 855 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 856 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 856 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 857 "leal (%0, %1), %%eax \n\t" | 857 "leal (%0, %1), %%eax \n\t" |
| 858 "leal (%%eax, %1, 4), %%ebx \n\t" | 858 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 1293 #endif | 1293 #endif |
| 1294 } | 1294 } |
| 1295 | 1295 |
| 1296 //FIXME? |255-0| = 1 | 1296 //FIXME? |255-0| = 1 |
| 1297 /** | 1297 /** |
| 1298 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. | 1298 * Check if the given 8x8 Block is mostly "flat" |
| 1299 */ | 1299 */ |
| 1300 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) | 1300 static inline int isHorizDC(uint8_t src[], int stride) |
| 1301 { | 1301 { |
| 1302 // src++; | 1302 // src++; |
| 1303 int numEq= 0; | 1303 int numEq= 0; |
| 1304 #ifdef HAVE_MMX | 1304 #if 0 |
| 1305 asm volatile ( | 1305 asm volatile ( |
| 1306 // "int $3 \n\t" | 1306 // "int $3 \n\t" |
| 1307 "leal (%1, %2), %%ecx \n\t" | 1307 "leal (%1, %2), %%ecx \n\t" |
| 1308 "leal (%%ecx, %2, 4), %%ebx \n\t" | 1308 "leal (%%ecx, %2, 4), %%ebx \n\t" |
| 1309 // 0 1 2 3 4 5 6 7 8 9 | 1309 // 0 1 2 3 4 5 6 7 8 9 |
| 1384 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; | 1384 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; |
| 1385 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; | 1385 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; |
| 1386 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; | 1386 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; |
| 1387 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; | 1387 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; |
| 1388 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; | 1388 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; |
| 1389 tempBlock[0 + y*TEMP_STRIDE] = src[0]; | |
| 1390 tempBlock[1 + y*TEMP_STRIDE] = src[1]; | |
| 1391 tempBlock[2 + y*TEMP_STRIDE] = src[2]; | |
| 1392 tempBlock[3 + y*TEMP_STRIDE] = src[3]; | |
| 1393 tempBlock[4 + y*TEMP_STRIDE] = src[4]; | |
| 1394 tempBlock[5 + y*TEMP_STRIDE] = src[5]; | |
| 1395 tempBlock[6 + y*TEMP_STRIDE] = src[6]; | |
| 1396 tempBlock[7 + y*TEMP_STRIDE] = src[7]; | |
| 1397 src+= stride; | 1389 src+= stride; |
| 1398 } | 1390 } |
| 1399 #endif | 1391 #endif |
| 1400 /* if(abs(numEq - asmEq) > 0) | 1392 /* if(abs(numEq - asmEq) > 0) |
| 1401 { | 1393 { |
| 1414 return numEq > hFlatnessThreshold; | 1406 return numEq > hFlatnessThreshold; |
| 1415 } | 1407 } |
| 1416 | 1408 |
| 1417 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) | 1409 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) |
| 1418 { | 1410 { |
| 1419 #ifdef MMX_FIXME | |
| 1420 FIXME | |
| 1421 int isOk; | |
| 1422 asm volatile( | |
| 1423 // "int $3 \n\t" | |
| 1424 "movq (%1, %2), %%mm0 \n\t" | |
| 1425 "movq (%1, %2, 8), %%mm1 \n\t" | |
| 1426 "movq %%mm0, %%mm2 \n\t" | |
| 1427 "psubusb %%mm1, %%mm0 \n\t" | |
| 1428 "psubusb %%mm2, %%mm1 \n\t" | |
| 1429 "por %%mm1, %%mm0 \n\t" // ABS Diff | |
| 1430 | |
| 1431 "movq pQPb, %%mm7 \n\t" // QP,..., QP | |
| 1432 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
| 1433 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | |
| 1434 "pcmpeqd b00, %%mm0 \n\t" | |
| 1435 "psrlq $16, %%mm0 \n\t" | |
| 1436 "pcmpeqd bFF, %%mm0 \n\t" | |
| 1437 // "movd %%mm0, (%1, %2, 4)\n\t" | |
| 1438 "movd %%mm0, %0 \n\t" | |
| 1439 : "=r" (isOk) | |
| 1440 : "r" (src), "r" (stride) | |
| 1441 ); | |
| 1442 return isOk; | |
| 1443 #else | |
| 1444 if(abs(src[0] - src[7]) > 2*QP) return 0; | 1411 if(abs(src[0] - src[7]) > 2*QP) return 0; |
| 1445 | 1412 |
| 1446 return 1; | 1413 return 1; |
| 1447 #endif | |
| 1448 } | 1414 } |
| 1449 | 1415 |
| 1450 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) | 1416 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP) |
| 1451 { | 1417 { |
| 1452 #ifdef HAVE_MMX | 1418 #if 0 |
| 1453 asm volatile( | 1419 asm volatile( |
| 1454 "leal (%0, %1), %%ecx \n\t" | 1420 "leal (%0, %1), %%ecx \n\t" |
| 1455 "leal (%%ecx, %1, 4), %%ebx \n\t" | 1421 "leal (%%ecx, %1, 4), %%ebx \n\t" |
| 1456 // 0 1 2 3 4 5 6 7 8 9 | 1422 // 0 1 2 3 4 5 6 7 8 9 |
| 1457 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1423 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1534 : | 1500 : |
| 1535 : "r" (dst), "r" (stride), "r" (QP) | 1501 : "r" (dst), "r" (stride), "r" (QP) |
| 1536 : "%eax", "%ebx", "%ecx" | 1502 : "%eax", "%ebx", "%ecx" |
| 1537 ); | 1503 ); |
| 1538 #else | 1504 #else |
| 1539 uint8_t *src= tempBlock; | |
| 1540 | |
| 1541 int y; | 1505 int y; |
| 1542 for(y=0; y<BLOCK_SIZE; y++) | 1506 for(y=0; y<BLOCK_SIZE; y++) |
| 1543 { | 1507 { |
| 1544 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]); | 1508 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]); |
| 1545 | |
| 1546 dst[0] = src[0]; | |
| 1547 dst[1] = src[1]; | |
| 1548 dst[2] = src[2]; | |
| 1549 dst[3] = src[3]; | |
| 1550 dst[4] = src[4]; | |
| 1551 dst[5] = src[5]; | |
| 1552 dst[6] = src[6]; | |
| 1553 dst[7] = src[7]; | |
| 1554 | 1509 |
| 1555 if(ABS(middleEnergy) < 8*QP) | 1510 if(ABS(middleEnergy) < 8*QP) |
| 1556 { | 1511 { |
| 1557 const int q=(src[3] - src[4])/2; | 1512 const int q=(dst[3] - dst[4])/2; |
| 1558 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]); | 1513 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]); |
| 1559 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]); | 1514 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]); |
| 1560 | 1515 |
| 1561 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); | 1516 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); |
| 1562 d= MAX(d, 0); | 1517 d= MAX(d, 0); |
| 1563 | 1518 |
| 1564 d= (5*d + 32) >> 6; | 1519 d= (5*d + 32) >> 6; |
| 1577 | 1532 |
| 1578 dst[3]-= d; | 1533 dst[3]-= d; |
| 1579 dst[4]+= d; | 1534 dst[4]+= d; |
| 1580 } | 1535 } |
| 1581 dst+= stride; | 1536 dst+= stride; |
| 1582 src+= TEMP_STRIDE; | |
| 1583 } | 1537 } |
| 1584 #endif | 1538 #endif |
| 1585 } | 1539 } |
| 1586 | 1540 |
| 1587 /** | 1541 /** |
| 1588 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) | 1542 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) |
| 1589 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) | 1543 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) |
| 1590 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) | 1544 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) |
| 1591 */ | 1545 */ |
| 1592 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) | 1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP) |
| 1593 { | 1547 { |
| 1594 //return; | 1548 |
| 1595 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1549 #if 0 |
| 1596 asm volatile( | 1550 asm volatile( |
| 1597 "leal (%0, %1), %%ecx \n\t" | 1551 "leal (%0, %1), %%ecx \n\t" |
| 1598 "leal (%%ecx, %1, 4), %%ebx \n\t" | 1552 "leal (%%ecx, %1, 4), %%ebx \n\t" |
| 1599 // 0 1 2 3 4 5 6 7 8 9 | 1553 // 0 1 2 3 4 5 6 7 8 9 |
| 1600 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1554 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1800 : "r" (dst), "r" (stride) | 1754 : "r" (dst), "r" (stride) |
| 1801 : "%eax", "%ebx", "%ecx" | 1755 : "%eax", "%ebx", "%ecx" |
| 1802 ); | 1756 ); |
| 1803 | 1757 |
| 1804 #else | 1758 #else |
| 1805 uint8_t *temp= tempBlock; | |
| 1806 int y; | 1759 int y; |
| 1807 for(y=0; y<BLOCK_SIZE; y++) | 1760 for(y=0; y<BLOCK_SIZE; y++) |
| 1808 { | 1761 { |
| 1809 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; | 1762 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; |
| 1810 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; | 1763 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; |
| 1811 | 1764 |
| 1812 int sums[9]; | 1765 int sums[9]; |
| 1813 sums[0] = first + temp[0]; | 1766 sums[0] = first + dst[0]; |
| 1814 sums[1] = temp[0] + temp[1]; | 1767 sums[1] = dst[0] + dst[1]; |
| 1815 sums[2] = temp[1] + temp[2]; | 1768 sums[2] = dst[1] + dst[2]; |
| 1816 sums[3] = temp[2] + temp[3]; | 1769 sums[3] = dst[2] + dst[3]; |
| 1817 sums[4] = temp[3] + temp[4]; | 1770 sums[4] = dst[3] + dst[4]; |
| 1818 sums[5] = temp[4] + temp[5]; | 1771 sums[5] = dst[4] + dst[5]; |
| 1819 sums[6] = temp[5] + temp[6]; | 1772 sums[6] = dst[5] + dst[6]; |
| 1820 sums[7] = temp[6] + temp[7]; | 1773 sums[7] = dst[6] + dst[7]; |
| 1821 sums[8] = temp[7] + last; | 1774 sums[8] = dst[7] + last; |
| 1822 | 1775 |
| 1823 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; | 1776 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; |
| 1824 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; | 1777 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; |
| 1825 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4; | 1778 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4; |
| 1826 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4; | 1779 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4; |
| 1828 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4; | 1781 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4; |
| 1829 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4; | 1782 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4; |
| 1830 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; | 1783 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; |
| 1831 | 1784 |
| 1832 dst+= stride; | 1785 dst+= stride; |
| 1833 temp+= TEMP_STRIDE; | |
| 1834 } | 1786 } |
| 1835 #endif | 1787 #endif |
| 1836 } | 1788 } |
| 1837 | |
| 1838 | 1789 |
| 1839 static inline void dering(uint8_t src[], int stride, int QP) | 1790 static inline void dering(uint8_t src[], int stride, int QP) |
| 1840 { | 1791 { |
| 1841 //FIXME | 1792 //FIXME |
| 1842 | 1793 |
| 2183 src++; | 2134 src++; |
| 2184 } | 2135 } |
| 2185 #endif | 2136 #endif |
| 2186 } | 2137 } |
| 2187 | 2138 |
| 2139 /** | |
| 2140 * transposes and shifts the given 8x8 Block into dst1 and dst2 | |
| 2141 */ | |
| 2142 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | |
| 2143 { | |
| 2144 asm( | |
| 2145 "leal (%0, %1), %%eax \n\t" | |
| 2146 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 2147 // 0 1 2 3 4 5 6 7 8 9 | |
| 2148 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 2149 "movq (%0), %%mm0 \n\t" // 12345678 | |
| 2150 "movq (%%eax), %%mm1 \n\t" // abcdefgh | |
| 2151 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
| 2152 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
| 2153 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
| 2154 | |
| 2155 "movq (%%eax, %1), %%mm1 \n\t" | |
| 2156 "movq (%%eax, %1, 2), %%mm3 \n\t" | |
| 2157 "movq %%mm1, %%mm4 \n\t" | |
| 2158 "punpcklbw %%mm3, %%mm1 \n\t" | |
| 2159 "punpckhbw %%mm3, %%mm4 \n\t" | |
| 2160 | |
| 2161 "movq %%mm0, %%mm3 \n\t" | |
| 2162 "punpcklwd %%mm1, %%mm0 \n\t" | |
| 2163 "punpckhwd %%mm1, %%mm3 \n\t" | |
| 2164 "movq %%mm2, %%mm1 \n\t" | |
| 2165 "punpcklwd %%mm4, %%mm2 \n\t" | |
| 2166 "punpckhwd %%mm4, %%mm1 \n\t" | |
| 2167 | |
| 2168 "movd %%mm0, 128(%2) \n\t" | |
| 2169 "psrlq $32, %%mm0 \n\t" | |
| 2170 "movd %%mm0, 144(%2) \n\t" | |
| 2171 "movd %%mm3, 160(%2) \n\t" | |
| 2172 "psrlq $32, %%mm3 \n\t" | |
| 2173 "movd %%mm3, 176(%2) \n\t" | |
| 2174 "movd %%mm3, 48(%3) \n\t" | |
| 2175 "movd %%mm2, 192(%2) \n\t" | |
| 2176 "movd %%mm2, 64(%3) \n\t" | |
| 2177 "psrlq $32, %%mm2 \n\t" | |
| 2178 "movd %%mm2, 80(%3) \n\t" | |
| 2179 "movd %%mm1, 96(%3) \n\t" | |
| 2180 "psrlq $32, %%mm1 \n\t" | |
| 2181 "movd %%mm1, 112(%3) \n\t" | |
| 2182 | |
| 2183 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 | |
| 2184 "movq (%%ebx), %%mm1 \n\t" // abcdefgh | |
| 2185 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
| 2186 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
| 2187 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
| 2188 | |
| 2189 "movq (%%ebx, %1), %%mm1 \n\t" | |
| 2190 "movq (%%ebx, %1, 2), %%mm3 \n\t" | |
| 2191 "movq %%mm1, %%mm4 \n\t" | |
| 2192 "punpcklbw %%mm3, %%mm1 \n\t" | |
| 2193 "punpckhbw %%mm3, %%mm4 \n\t" | |
| 2194 | |
| 2195 "movq %%mm0, %%mm3 \n\t" | |
| 2196 "punpcklwd %%mm1, %%mm0 \n\t" | |
| 2197 "punpckhwd %%mm1, %%mm3 \n\t" | |
| 2198 "movq %%mm2, %%mm1 \n\t" | |
| 2199 "punpcklwd %%mm4, %%mm2 \n\t" | |
| 2200 "punpckhwd %%mm4, %%mm1 \n\t" | |
| 2201 | |
| 2202 "movd %%mm0, 132(%2) \n\t" | |
| 2203 "psrlq $32, %%mm0 \n\t" | |
| 2204 "movd %%mm0, 148(%2) \n\t" | |
| 2205 "movd %%mm3, 164(%2) \n\t" | |
| 2206 "psrlq $32, %%mm3 \n\t" | |
| 2207 "movd %%mm3, 180(%2) \n\t" | |
| 2208 "movd %%mm3, 52(%3) \n\t" | |
| 2209 "movd %%mm2, 196(%2) \n\t" | |
| 2210 "movd %%mm2, 68(%3) \n\t" | |
| 2211 "psrlq $32, %%mm2 \n\t" | |
| 2212 "movd %%mm2, 84(%3) \n\t" | |
| 2213 "movd %%mm1, 100(%3) \n\t" | |
| 2214 "psrlq $32, %%mm1 \n\t" | |
| 2215 "movd %%mm1, 116(%3) \n\t" | |
| 2216 | |
| 2217 | |
| 2218 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2) | |
| 2219 : "%eax", "%ebx" | |
| 2220 ); | |
| 2221 } | |
| 2222 | |
| 2223 /** | |
| 2224 * transposes the given 8x8 block | |
| 2225 */ | |
| 2226 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src) | |
| 2227 { | |
| 2228 asm( | |
| 2229 "leal (%0, %1), %%eax \n\t" | |
| 2230 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 2231 // 0 1 2 3 4 5 6 7 8 9 | |
| 2232 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 2233 "movq (%2), %%mm0 \n\t" // 12345678 | |
| 2234 "movq 16(%2), %%mm1 \n\t" // abcdefgh | |
| 2235 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
| 2236 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
| 2237 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
| 2238 | |
| 2239 "movq 32(%2), %%mm1 \n\t" | |
| 2240 "movq 48(%2), %%mm3 \n\t" | |
| 2241 "movq %%mm1, %%mm4 \n\t" | |
| 2242 "punpcklbw %%mm3, %%mm1 \n\t" | |
| 2243 "punpckhbw %%mm3, %%mm4 \n\t" | |
| 2244 | |
| 2245 "movq %%mm0, %%mm3 \n\t" | |
| 2246 "punpcklwd %%mm1, %%mm0 \n\t" | |
| 2247 "punpckhwd %%mm1, %%mm3 \n\t" | |
| 2248 "movq %%mm2, %%mm1 \n\t" | |
| 2249 "punpcklwd %%mm4, %%mm2 \n\t" | |
| 2250 "punpckhwd %%mm4, %%mm1 \n\t" | |
| 2251 | |
| 2252 "movd %%mm0, (%0) \n\t" | |
| 2253 "psrlq $32, %%mm0 \n\t" | |
| 2254 "movd %%mm0, (%%eax) \n\t" | |
| 2255 "movd %%mm3, (%%eax, %1) \n\t" | |
| 2256 "psrlq $32, %%mm3 \n\t" | |
| 2257 "movd %%mm3, (%%eax, %1, 2) \n\t" | |
| 2258 "movd %%mm2, (%0, %1, 4) \n\t" | |
| 2259 "psrlq $32, %%mm2 \n\t" | |
| 2260 "movd %%mm2, (%%ebx) \n\t" | |
| 2261 "movd %%mm1, (%%ebx, %1) \n\t" | |
| 2262 "psrlq $32, %%mm1 \n\t" | |
| 2263 "movd %%mm1, (%%ebx, %1, 2) \n\t" | |
| 2264 | |
| 2265 | |
| 2266 "movq 64(%2), %%mm0 \n\t" // 12345678 | |
| 2267 "movq 80(%2), %%mm1 \n\t" // abcdefgh | |
| 2268 "movq %%mm0, %%mm2 \n\t" // 12345678 | |
| 2269 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d | |
| 2270 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h | |
| 2271 | |
| 2272 "movq 96(%2), %%mm1 \n\t" | |
| 2273 "movq 112(%2), %%mm3 \n\t" | |
| 2274 "movq %%mm1, %%mm4 \n\t" | |
| 2275 "punpcklbw %%mm3, %%mm1 \n\t" | |
| 2276 "punpckhbw %%mm3, %%mm4 \n\t" | |
| 2277 | |
| 2278 "movq %%mm0, %%mm3 \n\t" | |
| 2279 "punpcklwd %%mm1, %%mm0 \n\t" | |
| 2280 "punpckhwd %%mm1, %%mm3 \n\t" | |
| 2281 "movq %%mm2, %%mm1 \n\t" | |
| 2282 "punpcklwd %%mm4, %%mm2 \n\t" | |
| 2283 "punpckhwd %%mm4, %%mm1 \n\t" | |
| 2284 | |
| 2285 "movd %%mm0, 4(%0) \n\t" | |
| 2286 "psrlq $32, %%mm0 \n\t" | |
| 2287 "movd %%mm0, 4(%%eax) \n\t" | |
| 2288 "movd %%mm3, 4(%%eax, %1) \n\t" | |
| 2289 "psrlq $32, %%mm3 \n\t" | |
| 2290 "movd %%mm3, 4(%%eax, %1, 2) \n\t" | |
| 2291 "movd %%mm2, 4(%0, %1, 4) \n\t" | |
| 2292 "psrlq $32, %%mm2 \n\t" | |
| 2293 "movd %%mm2, 4(%%ebx) \n\t" | |
| 2294 "movd %%mm1, 4(%%ebx, %1) \n\t" | |
| 2295 "psrlq $32, %%mm1 \n\t" | |
| 2296 "movd %%mm1, 4(%%ebx, %1, 2) \n\t" | |
| 2297 | |
| 2298 :: "r" (dst), "r" (dstStride), "r" (src) | |
| 2299 : "%eax", "%ebx" | |
| 2300 ); | |
| 2301 } | |
| 2302 | |
| 2303 | |
| 2188 #ifdef HAVE_ODIVX_POSTPROCESS | 2304 #ifdef HAVE_ODIVX_POSTPROCESS |
| 2189 #include "../opendivx/postprocess.h" | 2305 #include "../opendivx/postprocess.h" |
| 2190 int use_old_pp=0; | 2306 int use_old_pp=0; |
| 2191 #endif | 2307 #endif |
| 2192 | 2308 |
| 2708 uint8_t *dstBlock= &(dst[y*dstStride]); | 2824 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 2709 #ifdef ARCH_X86 | 2825 #ifdef ARCH_X86 |
| 2710 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; | 2826 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; |
| 2711 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); | 2827 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); |
| 2712 int QPFrac= QPDelta; | 2828 int QPFrac= QPDelta; |
| 2829 uint8_t *tempBlock1= tempBlocks; | |
| 2830 uint8_t *tempBlock2= tempBlocks + 8; | |
| 2713 #endif | 2831 #endif |
| 2714 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards, if not | 2832 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards, if not |
| 2715 then use a temporary buffer */ | 2833 then use a temporary buffer */ |
| 2716 if(y+15 >= height) | 2834 if(y+15 >= height) |
| 2717 { | 2835 { |
| 2740 // finish 1 block before the next otherwise we might have a problem | 2858 // finish 1 block before the next otherwise we might have a problem |
| 2741 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | 2859 // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
| 2742 for(x=0; x<width; x+=BLOCK_SIZE) | 2860 for(x=0; x<width; x+=BLOCK_SIZE) |
| 2743 { | 2861 { |
| 2744 const int stride= dstStride; | 2862 const int stride= dstStride; |
| 2863 uint8_t *tmpXchg; | |
| 2745 #ifdef ARCH_X86 | 2864 #ifdef ARCH_X86 |
| 2746 int QP= *QPptr; | 2865 int QP= *QPptr; |
| 2747 asm volatile( | 2866 asm volatile( |
| 2748 "addl %2, %1 \n\t" | 2867 "addl %2, %1 \n\t" |
| 2749 "sbbl %%eax, %%eax \n\t" | 2868 "sbbl %%eax, %%eax \n\t" |
| 2880 T1= rdtsc(); | 2999 T1= rdtsc(); |
| 2881 vertTime+= T1-T0; | 3000 vertTime+= T1-T0; |
| 2882 T0=T1; | 3001 T0=T1; |
| 2883 #endif | 3002 #endif |
| 2884 } | 3003 } |
| 2885 | 3004 #ifdef HAVE_MMX |
| 3005 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride); | |
| 3006 #endif | |
| 2886 /* check if we have a previous block to deblock it with dstBlock */ | 3007 /* check if we have a previous block to deblock it with dstBlock */ |
| 2887 if(x - 8 >= 0) | 3008 if(x - 8 >= 0) |
| 2888 { | 3009 { |
| 2889 #ifdef MORE_TIMING | 3010 #ifdef MORE_TIMING |
| 2890 T0= rdtsc(); | 3011 T0= rdtsc(); |
| 2891 #endif | 3012 #endif |
| 3013 #ifdef HAVE_MMX | |
| 3014 if(mode & H_RK1_FILTER) | |
| 3015 vertRK1Filter(tempBlock1, 16, QP); | |
| 3016 else if(mode & H_X1_FILTER) | |
| 3017 vertX1Filter(tempBlock1, 16, QP); | |
| 3018 else if(mode & H_DEBLOCK) | |
| 3019 { | |
| 3020 if( isVertDC(tempBlock1, 16)) | |
| 3021 { | |
| 3022 if(isVertMinMaxOk(tempBlock1, 16, QP)) | |
| 3023 doVertLowPass(tempBlock1, 16, QP); | |
| 3024 } | |
| 3025 else | |
| 3026 doVertDefFilter(tempBlock1, 16, QP); | |
| 3027 } | |
| 3028 | |
| 3029 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16); | |
| 3030 | |
| 3031 #else | |
| 2892 if(mode & H_X1_FILTER) | 3032 if(mode & H_X1_FILTER) |
| 2893 horizX1Filter(dstBlock-4, stride, QP); | 3033 horizX1Filter(dstBlock-4, stride, QP); |
| 2894 else if(mode & H_DEBLOCK) | 3034 else if(mode & H_DEBLOCK) |
| 2895 { | 3035 { |
| 2896 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) | 3036 if( isHorizDC(dstBlock-4, stride)) |
| 2897 { | 3037 { |
| 2898 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) | 3038 if(isHorizMinMaxOk(dstBlock-4, stride, QP)) |
| 2899 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); | 3039 doHorizLowPass(dstBlock-4, stride, QP); |
| 2900 } | 3040 } |
| 2901 else | 3041 else |
| 2902 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); | 3042 doHorizDefFilter(dstBlock-4, stride, QP); |
| 2903 } | 3043 } |
| 3044 #endif | |
| 2904 #ifdef MORE_TIMING | 3045 #ifdef MORE_TIMING |
| 2905 T1= rdtsc(); | 3046 T1= rdtsc(); |
| 2906 horizTime+= T1-T0; | 3047 horizTime+= T1-T0; |
| 2907 T0=T1; | 3048 T0=T1; |
| 2908 #endif | 3049 #endif |
| 2927 } | 3068 } |
| 2928 #endif | 3069 #endif |
| 2929 | 3070 |
| 2930 dstBlock+=8; | 3071 dstBlock+=8; |
| 2931 srcBlock+=8; | 3072 srcBlock+=8; |
| 3073 | |
| 3074 tmpXchg= tempBlock1; | |
| 3075 tempBlock1= tempBlock2; | |
| 3076 tempBlock2 = tmpXchg; | |
| 2932 } | 3077 } |
| 2933 | 3078 |
| 2934 /* did we use a tmp buffer */ | 3079 /* did we use a tmp buffer */ |
| 2935 if(y+15 >= height) | 3080 if(y+15 >= height) |
| 2936 { | 3081 { |
