Mercurial > libavcodec.hg
comparison libpostproc/postprocess.c @ 120:b0b89f5d0288 libavcodec
and another +2% speedup
| author | michael |
|---|---|
| date | Tue, 23 Oct 2001 12:05:34 +0000 |
| parents | b2f0e40866b1 |
| children | 3ecf2a90c65e |
comparison (legend for the side-by-side diff below: equal, deleted, inserted, replaced)
| 119:b2f0e40866b1 | 120:b0b89f5d0288 |
|---|---|
| 334 // "movd %%mm0, (%1, %2, 4)\n\t" | 334 // "movd %%mm0, (%1, %2, 4)\n\t" |
| 335 "movd %%mm0, %0 \n\t" | 335 "movd %%mm0, %0 \n\t" |
| 336 : "=r" (isOk) | 336 : "=r" (isOk) |
| 337 : "r" (src), "r" (stride) | 337 : "r" (src), "r" (stride) |
| 338 ); | 338 ); |
| 339 return isOk ? 1 : 0; | 339 return isOk; |
| 340 #else | 340 #else |
| 341 | 341 |
| 342 int isOk2= 1; | 342 int isOk2= 1; |
| 343 int x; | 343 int x; |
| 344 src+= stride*3; | 344 src+= stride*3; |
| 1301 // src++; | 1301 // src++; |
| 1302 int numEq= 0; | 1302 int numEq= 0; |
| 1303 #ifdef HAVE_MMX | 1303 #ifdef HAVE_MMX |
| 1304 asm volatile ( | 1304 asm volatile ( |
| 1305 // "int $3 \n\t" | 1305 // "int $3 \n\t" |
| 1306 "pushl %1\n\t" | 1306 "leal (%1, %2), %%ecx \n\t" |
| 1307 "leal (%%ecx, %2, 4), %%ebx \n\t" | |
| 1308 // 0 1 2 3 4 5 6 7 8 9 | |
| 1309 // %1 ecx ecx+%2 ecx+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | |
| 1307 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | 1310 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F |
| 1308 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | 1311 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D |
| 1312 "pxor %%mm0, %%mm0 \n\t" | |
| 1313 "movl %1, %%eax \n\t" | |
| 1314 "andl $0x1F, %%eax \n\t" | |
| 1315 "cmpl $24, %%eax \n\t" | |
| 1309 "leal tempBlock, %%eax \n\t" | 1316 "leal tempBlock, %%eax \n\t" |
| 1310 "pxor %%mm0, %%mm0 \n\t" | 1317 "jb 1f \n\t" |
| 1311 | 1318 |
| 1312 #define HDC_CHECK_AND_CPY(i) \ | 1319 #define HDC_CHECK_AND_CPY(src, dst) \ |
| 1313 "movq -4(%1), %%mm2 \n\t"\ | 1320 "movd " #src ", %%mm2 \n\t"\ |
| 1314 "psrlq $32, %%mm2 \n\t"\ | 1321 "punpckldq 4" #src ", %%mm2 \n\t" /* (%1) */\ |
| 1315 "punpckldq 4(%1), %%mm2 \n\t" /* (%1) */\ | |
| 1316 "movq %%mm2, %%mm1 \n\t"\ | 1322 "movq %%mm2, %%mm1 \n\t"\ |
| 1317 "psrlq $8, %%mm2 \n\t"\ | 1323 "psrlq $8, %%mm2 \n\t"\ |
| 1318 "psubb %%mm1, %%mm2 \n\t"\ | 1324 "psubb %%mm1, %%mm2 \n\t"\ |
| 1319 "paddb %%mm7, %%mm2 \n\t"\ | 1325 "paddb %%mm7, %%mm2 \n\t"\ |
| 1320 "pcmpgtb %%mm6, %%mm2 \n\t"\ | 1326 "pcmpgtb %%mm6, %%mm2 \n\t"\ |
| 1321 "paddb %%mm2, %%mm0 \n\t"\ | 1327 "paddb %%mm2, %%mm0 \n\t"\ |
| 1322 "movq %%mm1," #i "(%%eax) \n\t" | 1328 "movq %%mm1," #dst "(%%eax) \n\t" |
| 1323 | 1329 |
| 1324 HDC_CHECK_AND_CPY(0) | 1330 HDC_CHECK_AND_CPY((%1),0) |
| 1325 "addl %2, %1 \n\t" | 1331 HDC_CHECK_AND_CPY((%%ecx),8) |
| 1326 HDC_CHECK_AND_CPY(8) | 1332 HDC_CHECK_AND_CPY((%%ecx, %2),16) |
| 1327 "addl %2, %1 \n\t" | 1333 HDC_CHECK_AND_CPY((%%ecx, %2, 2),24) |
| 1328 HDC_CHECK_AND_CPY(16) | 1334 HDC_CHECK_AND_CPY((%1, %2, 4),32) |
| 1329 "addl %2, %1 \n\t" | 1335 HDC_CHECK_AND_CPY((%%ebx),40) |
| 1330 HDC_CHECK_AND_CPY(24) | 1336 HDC_CHECK_AND_CPY((%%ebx, %2),48) |
| 1331 "addl %2, %1 \n\t" | 1337 HDC_CHECK_AND_CPY((%%ebx, %2, 2),56) |
| 1332 HDC_CHECK_AND_CPY(32) | 1338 "jmp 2f \n\t" |
| 1333 "addl %2, %1 \n\t" | 1339 "1: \n\t" |
| 1334 HDC_CHECK_AND_CPY(40) | 1340 // src does not cross a 32 byte cache line so dont waste time with alignment |
| 1335 "addl %2, %1 \n\t" | 1341 #define HDC_CHECK_AND_CPY2(src, dst) \ |
| 1336 HDC_CHECK_AND_CPY(48) | 1342 "movq " #src ", %%mm2 \n\t"\ |
| 1337 "addl %2, %1 \n\t" | 1343 "movq " #src ", %%mm1 \n\t"\ |
| 1338 HDC_CHECK_AND_CPY(56) | 1344 "psrlq $8, %%mm2 \n\t"\ |
| 1339 | 1345 "psubb %%mm1, %%mm2 \n\t"\ |
| 1346 "paddb %%mm7, %%mm2 \n\t"\ | |
| 1347 "pcmpgtb %%mm6, %%mm2 \n\t"\ | |
| 1348 "paddb %%mm2, %%mm0 \n\t"\ | |
| 1349 "movq %%mm1," #dst "(%%eax) \n\t" | |
| 1350 | |
| 1351 HDC_CHECK_AND_CPY2((%1),0) | |
| 1352 HDC_CHECK_AND_CPY2((%%ecx),8) | |
| 1353 HDC_CHECK_AND_CPY2((%%ecx, %2),16) | |
| 1354 HDC_CHECK_AND_CPY2((%%ecx, %2, 2),24) | |
| 1355 HDC_CHECK_AND_CPY2((%1, %2, 4),32) | |
| 1356 HDC_CHECK_AND_CPY2((%%ebx),40) | |
| 1357 HDC_CHECK_AND_CPY2((%%ebx, %2),48) | |
| 1358 HDC_CHECK_AND_CPY2((%%ebx, %2, 2),56) | |
| 1359 "2: \n\t" | |
| 1340 "psllq $8, %%mm0 \n\t" // remove dummy value | 1360 "psllq $8, %%mm0 \n\t" // remove dummy value |
| 1341 "movq %%mm0, %%mm1 \n\t" | 1361 "movq %%mm0, %%mm1 \n\t" |
| 1342 "psrlw $8, %%mm0 \n\t" | 1362 "psrlw $8, %%mm0 \n\t" |
| 1343 "paddb %%mm1, %%mm0 \n\t" | 1363 "paddb %%mm1, %%mm0 \n\t" |
| 1344 "movq %%mm0, %%mm1 \n\t" | 1364 "movq %%mm0, %%mm1 \n\t" |
| 1345 "psrlq $16, %%mm0 \n\t" | 1365 "psrlq $16, %%mm0 \n\t" |
| 1346 "paddb %%mm1, %%mm0 \n\t" | 1366 "paddb %%mm1, %%mm0 \n\t" |
| 1347 "movq %%mm0, %%mm1 \n\t" | 1367 "movq %%mm0, %%mm1 \n\t" |
| 1348 "psrlq $32, %%mm0 \n\t" | 1368 "psrlq $32, %%mm0 \n\t" |
| 1349 "paddb %%mm1, %%mm0 \n\t" | 1369 "paddb %%mm1, %%mm0 \n\t" |
| 1350 "popl %1\n\t" | |
| 1351 "movd %%mm0, %0 \n\t" | 1370 "movd %%mm0, %0 \n\t" |
| 1352 : "=r" (numEq) | 1371 : "=r" (numEq) |
| 1353 : "r" (src), "r" (stride) | 1372 : "r" (src), "r" (stride) |
| 1354 : "%eax" | 1373 : "%eax", "%ebx", "%ecx" |
| 1355 ); | 1374 ); |
| 1356 // printf("%d\n", numEq); | 1375 // printf("%d\n", numEq); |
| 1357 numEq= (256 - (numEq & 0xFF)) &0xFF; | 1376 numEq= (256 - numEq) &0xFF; |
| 1358 #else | 1377 #else |
| 1359 int y; | 1378 int y; |
| 1360 for(y=0; y<BLOCK_SIZE; y++) | 1379 for(y=0; y<BLOCK_SIZE; y++) |
| 1361 { | 1380 { |
| 1362 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; | 1381 if(((src[0] - src[1] + 1) & 0xFFFF) < 3) numEq++; |
