comparison libpostproc/postprocess_template.c @ 128:e5266b8e79be libavcodec

much better horizontal filters (transpose & use the vertical ones) :) bugfix bugs?
author michael
date Wed, 24 Oct 2001 16:39:40 +0000
parents 55f57883bbf5
children be35346e27c1
comparison
equal deleted inserted replaced
127:2fe8f116576c 128:e5266b8e79be
21 isVertDC Ec Ec 21 isVertDC Ec Ec
22 isVertMinMaxOk Ec Ec 22 isVertMinMaxOk Ec Ec
23 doVertLowPass E e e 23 doVertLowPass E e e
24 doVertDefFilter Ec Ec Ec 24 doVertDefFilter Ec Ec Ec
25 isHorizDC Ec Ec 25 isHorizDC Ec Ec
26 isHorizMinMaxOk a 26 isHorizMinMaxOk a E
27 doHorizLowPass E a a 27 doHorizLowPass E e e
28 doHorizDefFilter E ac ac 28 doHorizDefFilter E E E
29 deRing 29 deRing
30 Vertical RKAlgo1 E a a 30 Vertical RKAlgo1 E a a
31 Vertical X1 a E E 31 Vertical X1 a E E
32 Horizontal X1 a E E 32 Horizontal X1 a E E
33 LinIpolDeinterlace e E E* 33 LinIpolDeinterlace e E E*
58 (the if/else stuff per block is slowing things down) 58 (the if/else stuff per block is slowing things down)
59 compare the quality & speed of all filters 59 compare the quality & speed of all filters
60 split this huge file 60 split this huge file
61 fix warnings (unused vars, ...) 61 fix warnings (unused vars, ...)
62 noise reduction filters 62 noise reduction filters
63 write an exact implementation of the horizontal deblocking filter
64 ... 63 ...
65 64
66 Notes: 65 Notes:
67 66
68 */ 67 */
126 static uint64_t temp2=0; 125 static uint64_t temp2=0;
127 static uint64_t temp3=0; 126 static uint64_t temp3=0;
128 static uint64_t temp4=0; 127 static uint64_t temp4=0;
129 static uint64_t temp5=0; 128 static uint64_t temp5=0;
130 static uint64_t pQPb=0; 129 static uint64_t pQPb=0;
131 static uint8_t tempBlock[16*16]; //used so the horizontal code gets aligned data 130 static uint8_t tempBlocks[8*16*2]; //used for the horizontal code
132 131
133 int hFlatnessThreshold= 56 - 16; 132 int hFlatnessThreshold= 56 - 16;
134 int vFlatnessThreshold= 56 - 16; 133 int vFlatnessThreshold= 56 - 16;
135 134
136 //amount of "black" you are willing to lose to get a brightness corrected picture 135 //amount of "black" you are willing to lose to get a brightness corrected picture
275 "psrlq $32, %%mm0 \n\t" 274 "psrlq $32, %%mm0 \n\t"
276 "paddb %%mm1, %%mm0 \n\t" 275 "paddb %%mm1, %%mm0 \n\t"
277 "movd %%mm0, %0 \n\t" 276 "movd %%mm0, %0 \n\t"
278 : "=r" (numEq) 277 : "=r" (numEq)
279 : "r" (src), "r" (stride) 278 : "r" (src), "r" (stride)
279 : "%eax", "%ebx"
280 ); 280 );
281 281
282 numEq= (256 - numEq) &0xFF; 282 numEq= (256 - numEq) &0xFF;
283 283
284 #else 284 #else
848 (D<<24) | (C<<16) | (B<<8) | (A); 848 (D<<24) | (C<<16) | (B<<8) | (A);
849 //lut[i] = (v<<32) | (v<<24); 849 //lut[i] = (v<<32) | (v<<24);
850 } 850 }
851 } 851 }
852 852
853 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 853 #if 0
854 asm volatile( 854 asm volatile(
855 "pxor %%mm7, %%mm7 \n\t" // 0 855 "pxor %%mm7, %%mm7 \n\t" // 0
856 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE 856 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE
857 "leal (%0, %1), %%eax \n\t" 857 "leal (%0, %1), %%eax \n\t"
858 "leal (%%eax, %1, 4), %%ebx \n\t" 858 "leal (%%eax, %1, 4), %%ebx \n\t"
1293 #endif 1293 #endif
1294 } 1294 }
1295 1295
1296 //FIXME? |255-0| = 1 1296 //FIXME? |255-0| = 1
1297 /** 1297 /**
1298 * Check if the given 8x8 Block is mostly "flat" and copy the unaligned data into tempBlock. 1298 * Check if the given 8x8 Block is mostly "flat"
1299 */ 1299 */
1300 static inline int isHorizDCAndCopy2Temp(uint8_t src[], int stride) 1300 static inline int isHorizDC(uint8_t src[], int stride)
1301 { 1301 {
1302 // src++; 1302 // src++;
1303 int numEq= 0; 1303 int numEq= 0;
1304 #ifdef HAVE_MMX 1304 #if 0
1305 asm volatile ( 1305 asm volatile (
1306 // "int $3 \n\t" 1306 // "int $3 \n\t"
1307 "leal (%1, %2), %%ecx \n\t" 1307 "leal (%1, %2), %%ecx \n\t"
1308 "leal (%%ecx, %2, 4), %%ebx \n\t" 1308 "leal (%%ecx, %2, 4), %%ebx \n\t"
1309 // 0 1 2 3 4 5 6 7 8 9 1309 // 0 1 2 3 4 5 6 7 8 9
1384 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++; 1384 if(((src[2] - src[3] + 1) & 0xFFFF) < 3) numEq++;
1385 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++; 1385 if(((src[3] - src[4] + 1) & 0xFFFF) < 3) numEq++;
1386 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++; 1386 if(((src[4] - src[5] + 1) & 0xFFFF) < 3) numEq++;
1387 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++; 1387 if(((src[5] - src[6] + 1) & 0xFFFF) < 3) numEq++;
1388 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++; 1388 if(((src[6] - src[7] + 1) & 0xFFFF) < 3) numEq++;
1389 tempBlock[0 + y*TEMP_STRIDE] = src[0];
1390 tempBlock[1 + y*TEMP_STRIDE] = src[1];
1391 tempBlock[2 + y*TEMP_STRIDE] = src[2];
1392 tempBlock[3 + y*TEMP_STRIDE] = src[3];
1393 tempBlock[4 + y*TEMP_STRIDE] = src[4];
1394 tempBlock[5 + y*TEMP_STRIDE] = src[5];
1395 tempBlock[6 + y*TEMP_STRIDE] = src[6];
1396 tempBlock[7 + y*TEMP_STRIDE] = src[7];
1397 src+= stride; 1389 src+= stride;
1398 } 1390 }
1399 #endif 1391 #endif
1400 /* if(abs(numEq - asmEq) > 0) 1392 /* if(abs(numEq - asmEq) > 0)
1401 { 1393 {
1414 return numEq > hFlatnessThreshold; 1406 return numEq > hFlatnessThreshold;
1415 } 1407 }
1416 1408
1417 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP) 1409 static inline int isHorizMinMaxOk(uint8_t src[], int stride, int QP)
1418 { 1410 {
1419 #ifdef MMX_FIXME
1420 FIXME
1421 int isOk;
1422 asm volatile(
1423 // "int $3 \n\t"
1424 "movq (%1, %2), %%mm0 \n\t"
1425 "movq (%1, %2, 8), %%mm1 \n\t"
1426 "movq %%mm0, %%mm2 \n\t"
1427 "psubusb %%mm1, %%mm0 \n\t"
1428 "psubusb %%mm2, %%mm1 \n\t"
1429 "por %%mm1, %%mm0 \n\t" // ABS Diff
1430
1431 "movq pQPb, %%mm7 \n\t" // QP,..., QP
1432 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
1433 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0
1434 "pcmpeqd b00, %%mm0 \n\t"
1435 "psrlq $16, %%mm0 \n\t"
1436 "pcmpeqd bFF, %%mm0 \n\t"
1437 // "movd %%mm0, (%1, %2, 4)\n\t"
1438 "movd %%mm0, %0 \n\t"
1439 : "=r" (isOk)
1440 : "r" (src), "r" (stride)
1441 );
1442 return isOk;
1443 #else
1444 if(abs(src[0] - src[7]) > 2*QP) return 0; 1411 if(abs(src[0] - src[7]) > 2*QP) return 0;
1445 1412
1446 return 1; 1413 return 1;
1447 #endif
1448 } 1414 }
1449 1415
1450 static inline void doHorizDefFilterAndCopyBack(uint8_t dst[], int stride, int QP) 1416 static inline void doHorizDefFilter(uint8_t dst[], int stride, int QP)
1451 { 1417 {
1452 #ifdef HAVE_MMX 1418 #if 0
1453 asm volatile( 1419 asm volatile(
1454 "leal (%0, %1), %%ecx \n\t" 1420 "leal (%0, %1), %%ecx \n\t"
1455 "leal (%%ecx, %1, 4), %%ebx \n\t" 1421 "leal (%%ecx, %1, 4), %%ebx \n\t"
1456 // 0 1 2 3 4 5 6 7 8 9 1422 // 0 1 2 3 4 5 6 7 8 9
1457 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1423 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1534 : 1500 :
1535 : "r" (dst), "r" (stride), "r" (QP) 1501 : "r" (dst), "r" (stride), "r" (QP)
1536 : "%eax", "%ebx", "%ecx" 1502 : "%eax", "%ebx", "%ecx"
1537 ); 1503 );
1538 #else 1504 #else
1539 uint8_t *src= tempBlock;
1540
1541 int y; 1505 int y;
1542 for(y=0; y<BLOCK_SIZE; y++) 1506 for(y=0; y<BLOCK_SIZE; y++)
1543 { 1507 {
1544 const int middleEnergy= 5*(src[4] - src[5]) + 2*(src[2] - src[5]); 1508 const int middleEnergy= 5*(dst[4] - dst[5]) + 2*(dst[2] - dst[5]);
1545
1546 dst[0] = src[0];
1547 dst[1] = src[1];
1548 dst[2] = src[2];
1549 dst[3] = src[3];
1550 dst[4] = src[4];
1551 dst[5] = src[5];
1552 dst[6] = src[6];
1553 dst[7] = src[7];
1554 1509
1555 if(ABS(middleEnergy) < 8*QP) 1510 if(ABS(middleEnergy) < 8*QP)
1556 { 1511 {
1557 const int q=(src[3] - src[4])/2; 1512 const int q=(dst[3] - dst[4])/2;
1558 const int leftEnergy= 5*(src[2] - src[1]) + 2*(src[0] - src[3]); 1513 const int leftEnergy= 5*(dst[2] - dst[1]) + 2*(dst[0] - dst[3]);
1559 const int rightEnergy= 5*(src[6] - src[5]) + 2*(src[4] - src[7]); 1514 const int rightEnergy= 5*(dst[6] - dst[5]) + 2*(dst[4] - dst[7]);
1560 1515
1561 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) ); 1516 int d= ABS(middleEnergy) - MIN( ABS(leftEnergy), ABS(rightEnergy) );
1562 d= MAX(d, 0); 1517 d= MAX(d, 0);
1563 1518
1564 d= (5*d + 32) >> 6; 1519 d= (5*d + 32) >> 6;
1577 1532
1578 dst[3]-= d; 1533 dst[3]-= d;
1579 dst[4]+= d; 1534 dst[4]+= d;
1580 } 1535 }
1581 dst+= stride; 1536 dst+= stride;
1582 src+= TEMP_STRIDE;
1583 } 1537 }
1584 #endif 1538 #endif
1585 } 1539 }
1586 1540
1587 /** 1541 /**
1588 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block) 1542 * Do a horizontal low pass filter on the 10x8 block (dst points to middle 8x8 Block)
1589 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version) 1543 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 (C version)
1590 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version) 1544 * using the 7-Tap Filter (2,2,2,4,2,2,2)/16 (MMX2/3DNOW version)
1591 */ 1545 */
1592 static inline void doHorizLowPassAndCopyBack(uint8_t dst[], int stride, int QP) 1546 static inline void doHorizLowPass(uint8_t dst[], int stride, int QP)
1593 { 1547 {
1594 //return; 1548
1595 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) 1549 #if 0
1596 asm volatile( 1550 asm volatile(
1597 "leal (%0, %1), %%ecx \n\t" 1551 "leal (%0, %1), %%ecx \n\t"
1598 "leal (%%ecx, %1, 4), %%ebx \n\t" 1552 "leal (%%ecx, %1, 4), %%ebx \n\t"
1599 // 0 1 2 3 4 5 6 7 8 9 1553 // 0 1 2 3 4 5 6 7 8 9
1600 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 1554 // %0 ecx ecx+%1 ecx+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
1800 : "r" (dst), "r" (stride) 1754 : "r" (dst), "r" (stride)
1801 : "%eax", "%ebx", "%ecx" 1755 : "%eax", "%ebx", "%ecx"
1802 ); 1756 );
1803 1757
1804 #else 1758 #else
1805 uint8_t *temp= tempBlock;
1806 int y; 1759 int y;
1807 for(y=0; y<BLOCK_SIZE; y++) 1760 for(y=0; y<BLOCK_SIZE; y++)
1808 { 1761 {
1809 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0]; 1762 const int first= ABS(dst[-1] - dst[0]) < QP ? dst[-1] : dst[0];
1810 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7]; 1763 const int last= ABS(dst[8] - dst[7]) < QP ? dst[8] : dst[7];
1811 1764
1812 int sums[9]; 1765 int sums[9];
1813 sums[0] = first + temp[0]; 1766 sums[0] = first + dst[0];
1814 sums[1] = temp[0] + temp[1]; 1767 sums[1] = dst[0] + dst[1];
1815 sums[2] = temp[1] + temp[2]; 1768 sums[2] = dst[1] + dst[2];
1816 sums[3] = temp[2] + temp[3]; 1769 sums[3] = dst[2] + dst[3];
1817 sums[4] = temp[3] + temp[4]; 1770 sums[4] = dst[3] + dst[4];
1818 sums[5] = temp[4] + temp[5]; 1771 sums[5] = dst[4] + dst[5];
1819 sums[6] = temp[5] + temp[6]; 1772 sums[6] = dst[5] + dst[6];
1820 sums[7] = temp[6] + temp[7]; 1773 sums[7] = dst[6] + dst[7];
1821 sums[8] = temp[7] + last; 1774 sums[8] = dst[7] + last;
1822 1775
1823 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4; 1776 dst[0]= ((sums[0]<<2) + ((first + sums[2])<<1) + sums[4] + 8)>>4;
1824 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4; 1777 dst[1]= ((dst[1]<<2) + (first + sums[0] + sums[3]<<1) + sums[5] + 8)>>4;
1825 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4; 1778 dst[2]= ((dst[2]<<2) + (first + sums[1] + sums[4]<<1) + sums[6] + 8)>>4;
1826 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4; 1779 dst[3]= ((dst[3]<<2) + (sums[2] + sums[5]<<1) + sums[0] + sums[7] + 8)>>4;
1828 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4; 1781 dst[5]= ((dst[5]<<2) + (last + sums[7] + sums[4]<<1) + sums[2] + 8)>>4;
1829 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4; 1782 dst[6]= ((last + dst[6]<<2) + (dst[7] + sums[5]<<1) + sums[3] + 8)>>4;
1830 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4; 1783 dst[7]= ((sums[8]<<2) + (last + sums[6]<<1) + sums[4] + 8)>>4;
1831 1784
1832 dst+= stride; 1785 dst+= stride;
1833 temp+= TEMP_STRIDE;
1834 } 1786 }
1835 #endif 1787 #endif
1836 } 1788 }
1837
1838 1789
1839 static inline void dering(uint8_t src[], int stride, int QP) 1790 static inline void dering(uint8_t src[], int stride, int QP)
1840 { 1791 {
1841 //FIXME 1792 //FIXME
1842 1793
2183 src++; 2134 src++;
2184 } 2135 }
2185 #endif 2136 #endif
2186 } 2137 }
2187 2138
2139 /**
2140 * Transposes the 8x8 byte block at src (rows srcStride bytes apart) and stores
 * the resulting columns, shifted, into dst1 and dst2 as rows spaced 16 bytes apart:
 *   - source columns 0..4 are written to dst1 at byte offsets 128, 144, 160, 176, 192
 *   - source columns 3..7 are written to dst2 at byte offsets 48, 64, 80, 96, 112
 * Columns 3 and 4 are therefore stored to both destinations.
 * Each destination row receives the bytes taken from source rows 0-3 first
 * (first half of the asm) and the bytes from source rows 4-7 at +4 (second half).
 *
 * Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2.
 * eax is set to src + srcStride and ebx to eax + 4*srcStride for row addressing.
 *
 * NOTE(review): the asm stores through %2/%3 and uses mm0-mm4, but the clobber
 * list names only eax and ebx (no "memory", no MMX registers) - confirm this is
 * safe with the compilers in use.
2141 */
2142 static inline void transpose1(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
2143 {
2144 asm(
2145 "leal (%0, %1), %%eax \n\t"
2146 "leal (%%eax, %1, 4), %%ebx \n\t"
2147 // 0 1 2 3 4 5 6 7 8 9
2148 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2149 "movq (%0), %%mm0 \n\t" // 12345678
2150 "movq (%%eax), %%mm1 \n\t" // abcdefgh
2151 "movq %%mm0, %%mm2 \n\t" // 12345678
2152 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2153 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2154
2155 "movq (%%eax, %1), %%mm1 \n\t"
2156 "movq (%%eax, %1, 2), %%mm3 \n\t"
2157 "movq %%mm1, %%mm4 \n\t"
2158 "punpcklbw %%mm3, %%mm1 \n\t"
2159 "punpckhbw %%mm3, %%mm4 \n\t"
2160
2161 "movq %%mm0, %%mm3 \n\t"
2162 "punpcklwd %%mm1, %%mm0 \n\t"
2163 "punpckhwd %%mm1, %%mm3 \n\t"
2164 "movq %%mm2, %%mm1 \n\t"
2165 "punpcklwd %%mm4, %%mm2 \n\t"
2166 "punpckhwd %%mm4, %%mm1 \n\t"
2167
2168 "movd %%mm0, 128(%2) \n\t"
2169 "psrlq $32, %%mm0 \n\t"
2170 "movd %%mm0, 144(%2) \n\t"
2171 "movd %%mm3, 160(%2) \n\t"
2172 "psrlq $32, %%mm3 \n\t"
2173 "movd %%mm3, 176(%2) \n\t"
2174 "movd %%mm3, 48(%3) \n\t"
2175 "movd %%mm2, 192(%2) \n\t"
2176 "movd %%mm2, 64(%3) \n\t"
2177 "psrlq $32, %%mm2 \n\t"
2178 "movd %%mm2, 80(%3) \n\t"
2179 "movd %%mm1, 96(%3) \n\t"
2180 "psrlq $32, %%mm1 \n\t"
2181 "movd %%mm1, 112(%3) \n\t"
2182
2183 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2184 "movq (%%ebx), %%mm1 \n\t" // abcdefgh
2185 "movq %%mm0, %%mm2 \n\t" // 12345678
2186 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2187 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2188
2189 "movq (%%ebx, %1), %%mm1 \n\t"
2190 "movq (%%ebx, %1, 2), %%mm3 \n\t"
2191 "movq %%mm1, %%mm4 \n\t"
2192 "punpcklbw %%mm3, %%mm1 \n\t"
2193 "punpckhbw %%mm3, %%mm4 \n\t"
2194
2195 "movq %%mm0, %%mm3 \n\t"
2196 "punpcklwd %%mm1, %%mm0 \n\t"
2197 "punpckhwd %%mm1, %%mm3 \n\t"
2198 "movq %%mm2, %%mm1 \n\t"
2199 "punpcklwd %%mm4, %%mm2 \n\t"
2200 "punpckhwd %%mm4, %%mm1 \n\t"
2201
2202 "movd %%mm0, 132(%2) \n\t"
2203 "psrlq $32, %%mm0 \n\t"
2204 "movd %%mm0, 148(%2) \n\t"
2205 "movd %%mm3, 164(%2) \n\t"
2206 "psrlq $32, %%mm3 \n\t"
2207 "movd %%mm3, 180(%2) \n\t"
2208 "movd %%mm3, 52(%3) \n\t"
2209 "movd %%mm2, 196(%2) \n\t"
2210 "movd %%mm2, 68(%3) \n\t"
2211 "psrlq $32, %%mm2 \n\t"
2212 "movd %%mm2, 84(%3) \n\t"
2213 "movd %%mm1, 100(%3) \n\t"
2214 "psrlq $32, %%mm1 \n\t"
2215 "movd %%mm1, 116(%3) \n\t"
2216
2217
2218 :: "r" (src), "r" (srcStride), "r" (dst1), "r" (dst2)
2219 : "%eax", "%ebx"
2220 );
2221 }
2222
2223 /**
2224 * Transposes an 8x8 byte block read from src with a fixed 16-byte row pitch
 * (rows at byte offsets 0, 16, ..., 112) and writes it to dst, whose rows are
 * dstStride bytes apart.
 * Destination row r receives byte r of source rows 0-3 in its bytes 0..3
 * (first half of the asm) and byte r of source rows 4-7 in its bytes 4..7
 * (second half, stores at +4).
 *
 * Operands: %0 = dst, %1 = dstStride, %2 = src.
 * eax is set to dst + dstStride and ebx to eax + 4*dstStride for row addressing.
 *
 * NOTE(review): the asm stores through %0 and uses mm0-mm4, but the clobber
 * list names only eax and ebx (no "memory", no MMX registers) - confirm this is
 * safe with the compilers in use.
2225 */
2226 static inline void transpose2(uint8_t *dst, int dstStride, uint8_t *src)
2227 {
2228 asm(
2229 "leal (%0, %1), %%eax \n\t"
2230 "leal (%%eax, %1, 4), %%ebx \n\t"
2231 // 0 1 2 3 4 5 6 7 8 9
2232 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1
2233 "movq (%2), %%mm0 \n\t" // 12345678
2234 "movq 16(%2), %%mm1 \n\t" // abcdefgh
2235 "movq %%mm0, %%mm2 \n\t" // 12345678
2236 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2237 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2238
2239 "movq 32(%2), %%mm1 \n\t"
2240 "movq 48(%2), %%mm3 \n\t"
2241 "movq %%mm1, %%mm4 \n\t"
2242 "punpcklbw %%mm3, %%mm1 \n\t"
2243 "punpckhbw %%mm3, %%mm4 \n\t"
2244
2245 "movq %%mm0, %%mm3 \n\t"
2246 "punpcklwd %%mm1, %%mm0 \n\t"
2247 "punpckhwd %%mm1, %%mm3 \n\t"
2248 "movq %%mm2, %%mm1 \n\t"
2249 "punpcklwd %%mm4, %%mm2 \n\t"
2250 "punpckhwd %%mm4, %%mm1 \n\t"
2251
2252 "movd %%mm0, (%0) \n\t"
2253 "psrlq $32, %%mm0 \n\t"
2254 "movd %%mm0, (%%eax) \n\t"
2255 "movd %%mm3, (%%eax, %1) \n\t"
2256 "psrlq $32, %%mm3 \n\t"
2257 "movd %%mm3, (%%eax, %1, 2) \n\t"
2258 "movd %%mm2, (%0, %1, 4) \n\t"
2259 "psrlq $32, %%mm2 \n\t"
2260 "movd %%mm2, (%%ebx) \n\t"
2261 "movd %%mm1, (%%ebx, %1) \n\t"
2262 "psrlq $32, %%mm1 \n\t"
2263 "movd %%mm1, (%%ebx, %1, 2) \n\t"
2264
2265
2266 "movq 64(%2), %%mm0 \n\t" // 12345678
2267 "movq 80(%2), %%mm1 \n\t" // abcdefgh
2268 "movq %%mm0, %%mm2 \n\t" // 12345678
2269 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2270 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2271
2272 "movq 96(%2), %%mm1 \n\t"
2273 "movq 112(%2), %%mm3 \n\t"
2274 "movq %%mm1, %%mm4 \n\t"
2275 "punpcklbw %%mm3, %%mm1 \n\t"
2276 "punpckhbw %%mm3, %%mm4 \n\t"
2277
2278 "movq %%mm0, %%mm3 \n\t"
2279 "punpcklwd %%mm1, %%mm0 \n\t"
2280 "punpckhwd %%mm1, %%mm3 \n\t"
2281 "movq %%mm2, %%mm1 \n\t"
2282 "punpcklwd %%mm4, %%mm2 \n\t"
2283 "punpckhwd %%mm4, %%mm1 \n\t"
2284
2285 "movd %%mm0, 4(%0) \n\t"
2286 "psrlq $32, %%mm0 \n\t"
2287 "movd %%mm0, 4(%%eax) \n\t"
2288 "movd %%mm3, 4(%%eax, %1) \n\t"
2289 "psrlq $32, %%mm3 \n\t"
2290 "movd %%mm3, 4(%%eax, %1, 2) \n\t"
2291 "movd %%mm2, 4(%0, %1, 4) \n\t"
2292 "psrlq $32, %%mm2 \n\t"
2293 "movd %%mm2, 4(%%ebx) \n\t"
2294 "movd %%mm1, 4(%%ebx, %1) \n\t"
2295 "psrlq $32, %%mm1 \n\t"
2296 "movd %%mm1, 4(%%ebx, %1, 2) \n\t"
2297
2298 :: "r" (dst), "r" (dstStride), "r" (src)
2299 : "%eax", "%ebx"
2300 );
2301 }
2302
2303
2188 #ifdef HAVE_ODIVX_POSTPROCESS 2304 #ifdef HAVE_ODIVX_POSTPROCESS
2189 #include "../opendivx/postprocess.h" 2305 #include "../opendivx/postprocess.h"
2190 int use_old_pp=0; 2306 int use_old_pp=0;
2191 #endif 2307 #endif
2192 2308
2708 uint8_t *dstBlock= &(dst[y*dstStride]); 2824 uint8_t *dstBlock= &(dst[y*dstStride]);
2709 #ifdef ARCH_X86 2825 #ifdef ARCH_X86
2710 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride]; 2826 int *QPptr= isColor ? &QPs[(y>>3)*QPStride] :&QPs[(y>>4)*QPStride];
2711 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4); 2827 int QPDelta= isColor ? 1<<(32-3) : 1<<(32-4);
2712 int QPFrac= QPDelta; 2828 int QPFrac= QPDelta;
2829 uint8_t *tempBlock1= tempBlocks;
2830 uint8_t *tempBlock2= tempBlocks + 8;
2713 #endif 2831 #endif
2714 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not 2832 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not
2715 then use a temporary buffer */ 2833 then use a temporary buffer */
2716 if(y+15 >= height) 2834 if(y+15 >= height)
2717 { 2835 {
2740 // finish 1 block before the next otherwise we might have a problem 2858 // finish 1 block before the next otherwise we might have a problem
2741 // with the L1 Cache of the P4 ... or only a few blocks at a time or something 2859 // with the L1 Cache of the P4 ... or only a few blocks at a time or something
2742 for(x=0; x<width; x+=BLOCK_SIZE) 2860 for(x=0; x<width; x+=BLOCK_SIZE)
2743 { 2861 {
2744 const int stride= dstStride; 2862 const int stride= dstStride;
2863 uint8_t *tmpXchg;
2745 #ifdef ARCH_X86 2864 #ifdef ARCH_X86
2746 int QP= *QPptr; 2865 int QP= *QPptr;
2747 asm volatile( 2866 asm volatile(
2748 "addl %2, %1 \n\t" 2867 "addl %2, %1 \n\t"
2749 "sbbl %%eax, %%eax \n\t" 2868 "sbbl %%eax, %%eax \n\t"
2880 T1= rdtsc(); 2999 T1= rdtsc();
2881 vertTime+= T1-T0; 3000 vertTime+= T1-T0;
2882 T0=T1; 3001 T0=T1;
2883 #endif 3002 #endif
2884 } 3003 }
2885 3004 #ifdef HAVE_MMX
3005 transpose1(tempBlock1, tempBlock2, dstBlock, dstStride);
3006 #endif
2886 /* check if we have a previous block to deblock it with dstBlock */ 3007 /* check if we have a previous block to deblock it with dstBlock */
2887 if(x - 8 >= 0) 3008 if(x - 8 >= 0)
2888 { 3009 {
2889 #ifdef MORE_TIMING 3010 #ifdef MORE_TIMING
2890 T0= rdtsc(); 3011 T0= rdtsc();
2891 #endif 3012 #endif
3013 #ifdef HAVE_MMX
3014 if(mode & H_RK1_FILTER)
3015 vertRK1Filter(tempBlock1, 16, QP);
3016 else if(mode & H_X1_FILTER)
3017 vertX1Filter(tempBlock1, 16, QP);
3018 else if(mode & H_DEBLOCK)
3019 {
3020 if( isVertDC(tempBlock1, 16))
3021 {
3022 if(isVertMinMaxOk(tempBlock1, 16, QP))
3023 doVertLowPass(tempBlock1, 16, QP);
3024 }
3025 else
3026 doVertDefFilter(tempBlock1, 16, QP);
3027 }
3028
3029 transpose2(dstBlock-4, dstStride, tempBlock1 + 4*16);
3030
3031 #else
2892 if(mode & H_X1_FILTER) 3032 if(mode & H_X1_FILTER)
2893 horizX1Filter(dstBlock-4, stride, QP); 3033 horizX1Filter(dstBlock-4, stride, QP);
2894 else if(mode & H_DEBLOCK) 3034 else if(mode & H_DEBLOCK)
2895 { 3035 {
2896 if( isHorizDCAndCopy2Temp(dstBlock-4, stride)) 3036 if( isHorizDC(dstBlock-4, stride))
2897 { 3037 {
2898 if(isHorizMinMaxOk(tempBlock, TEMP_STRIDE, QP)) 3038 if(isHorizMinMaxOk(dstBlock-4, stride, QP))
2899 doHorizLowPassAndCopyBack(dstBlock-4, stride, QP); 3039 doHorizLowPass(dstBlock-4, stride, QP);
2900 } 3040 }
2901 else 3041 else
2902 doHorizDefFilterAndCopyBack(dstBlock-4, stride, QP); 3042 doHorizDefFilter(dstBlock-4, stride, QP);
2903 } 3043 }
3044 #endif
2904 #ifdef MORE_TIMING 3045 #ifdef MORE_TIMING
2905 T1= rdtsc(); 3046 T1= rdtsc();
2906 horizTime+= T1-T0; 3047 horizTime+= T1-T0;
2907 T0=T1; 3048 T0=T1;
2908 #endif 3049 #endif
2927 } 3068 }
2928 #endif 3069 #endif
2929 3070
2930 dstBlock+=8; 3071 dstBlock+=8;
2931 srcBlock+=8; 3072 srcBlock+=8;
3073
3074 tmpXchg= tempBlock1;
3075 tempBlock1= tempBlock2;
3076 tempBlock2 = tmpXchg;
2932 } 3077 }
2933 3078
2934 /* did we use a tmp buffer */ 3079 /* did we use a tmp buffer */
2935 if(y+15 >= height) 3080 if(y+15 >= height)
2936 { 3081 {