Mercurial changeset comparison: libpostproc/postprocess.c @ 111:8e4c5a16c9fc (libavcodec repository)
fixed the height%8!=0 bug
simplified a few things
removed last row variants of the deinterlace filters, they are not needed anymore
added cubic interpolating deinterlacer
| author | michael |
|---|---|
| date | Wed, 17 Oct 2001 20:42:07 +0000 |
| parents | dfa9fde4b72d |
| children | a2c063b6ecf9 |
| 110:e00e5d93457c (old) | 111:8e4c5a16c9fc (new) |
|---|---|
| 28 doHorizDefFilter E ac ac | 28 doHorizDefFilter E ac ac |
| 29 deRing | 29 deRing |
| 30 Vertical RKAlgo1 E a a | 30 Vertical RKAlgo1 E a a |
| 31 Vertical X1 a E E | 31 Vertical X1 a E E |
| 32 Horizontal X1 a E E | 32 Horizontal X1 a E E |
| 33 LinIpolDeinterlace a E E* | 33 LinIpolDeinterlace e E E* |
| 34 LinBlendDeinterlace a E E* | 34 CubicIpolDeinterlace a e e* |
| | 35 LinBlendDeinterlace e E E* |
| 35 MedianDeinterlace Ec Ec | 36 MedianDeinterlace Ec Ec |
| 36 | 37 |
| 37 | 38 |
| 38 * i dont have a 3dnow CPU -> its untested | 39 * i dont have a 3dnow CPU -> its untested |
| 39 E = Exact implementation | 40 E = Exact implementation |
| 40 e = allmost exact implementation | 41 e = allmost exact implementation (slightly different rounding,...) |
| 41 a = alternative / approximate impl | 42 a = alternative / approximate impl |
| 42 c = checked against the other implementations (-vo md5) | 43 c = checked against the other implementations (-vo md5) |
| 43 */ | 44 */ |
| 44 | 45 |
| 45 /* | 46 /* |
| 60 fix warnings (unused vars, ...) | 61 fix warnings (unused vars, ...) |
| 61 noise reduction filters | 62 noise reduction filters |
| 62 ... | 63 ... |
| 63 | 64 |
| 64 Notes: | 65 Notes: |
| 65 | |
| 66 | 66 |
| 67 */ | 67 */ |
| 68 | 68 |
| 69 //Changelog: use the CVS log | 69 //Changelog: use the CVS log |
| 70 | 70 |
| 176 } | 176 } |
| 177 #endif | 177 #endif |
| 178 | 178 |
| 179 //FIXME? |255-0| = 1 (shouldnt be a problem ...) | 179 //FIXME? |255-0| = 1 (shouldnt be a problem ...) |
| 180 /** | 180 /** |
| 181 * Check if the middle 8x8 Block in the given 8x10 block is flat | 181 * Check if the middle 8x8 Block in the given 8x16 block is flat |
| 182 */ | 182 */ |
| 183 static inline int isVertDC(uint8_t src[], int stride){ | 183 static inline int isVertDC(uint8_t src[], int stride){ |
| 184 int numEq= 0; | 184 int numEq= 0; |
| 185 int y; | 185 int y; |
| 186 src+= stride; // src points to begin of the 8x8 Block | 186 src+= stride*4; // src points to begin of the 8x8 Block |
| 187 #ifdef HAVE_MMX | 187 #ifdef HAVE_MMX |
| 188 asm volatile( | 188 asm volatile( |
| 189 "pushl %1\n\t" | 189 "pushl %1\n\t" |
| 190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F | 190 "movq b7E, %%mm7 \n\t" // mm7 = 0x7F |
| 191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D | 191 "movq b7C, %%mm6 \n\t" // mm6 = 0x7D |
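
The scalar fallback for isVertDC is elided from this hunk; as orientation, a hedged C sketch of the flatness test the MMX code implements (the equality tolerance and the numEq cutoff are illustrative assumptions, not the committed values):

```c
#include <stdint.h>
#include <stdlib.h>

/* Sketch: count vertically adjacent pixel pairs in the middle 8x8 block
   that are (nearly) equal; the block counts as flat (DC) when almost
   all pairs are.  Tolerance and cutoff are illustrative assumptions. */
static int isVertDC_sketch(uint8_t *src, int stride)
{
    int numEq = 0, x, y;
    src += stride * 4;                  /* step down to the middle 8x8 block */
    for (y = 0; y < 7; y++) {           /* 7 pairs of neighbouring rows */
        for (x = 0; x < 8; x++)
            if (abs(src[x] - src[x + stride]) <= 1)
                numEq++;
        src += stride;
    }
    return numEq > 7 * 8 - 4;           /* "almost all pairs flat" (assumed) */
}
```
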
| 293 | 293 |
| 294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) | 294 static inline int isVertMinMaxOk(uint8_t src[], int stride, int QP) |
| 295 { | 295 { |
| 296 #ifdef HAVE_MMX | 296 #ifdef HAVE_MMX |
| 297 int isOk; | 297 int isOk; |
| | 298 src+= stride*3; |
| 298 asm volatile( | 299 asm volatile( |
| 299 // "int $3 \n\t" | 300 // "int $3 \n\t" |
| 300 "movq (%1, %2), %%mm0 \n\t" | 301 "movq (%1, %2), %%mm0 \n\t" |
| 301 "movq (%1, %2, 8), %%mm1 \n\t" | 302 "movq (%1, %2, 8), %%mm1 \n\t" |
| 302 "movq %%mm0, %%mm2 \n\t" | 303 "movq %%mm0, %%mm2 \n\t" |
| 318 return isOk ? 1 : 0; | 319 return isOk ? 1 : 0; |
| 319 #else | 320 #else |
| 320 | 321 |
| 321 int isOk2= 1; | 322 int isOk2= 1; |
| 322 int x; | 323 int x; |
| | 324 src+= stride*3; |
| 323 for(x=0; x<BLOCK_SIZE; x++) | 325 for(x=0; x<BLOCK_SIZE; x++) |
| 324 { | 326 { |
| 325 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; | 327 if(abs((int)src[x + stride] - (int)src[x + (stride<<3)]) > 2*QP) isOk2=0; |
| 326 } | 328 } |
| 327 /* if(isOk && !isOk2 || !isOk && isOk2) | 329 /* if(isOk && !isOk2 || !isOk && isOk2) |
| 341 #endif | 343 #endif |
| 342 | 344 |
| 343 } | 345 } |
| 344 | 346 |
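
The C path of isVertMinMaxOk is shown almost completely above; pulled together into a self-contained form it reads:

```c
#include <stdint.h>
#include <stdlib.h>

#define BLOCK_SIZE 8

/* The block passes the min/max test when the vertical swing between
   line 1 and line 8 of the (offset) block stays within 2*QP. */
static int isVertMinMaxOk_c(uint8_t *src, int stride, int QP)
{
    int x;
    src += stride * 3;                  /* new offset from this changeset */
    for (x = 0; x < BLOCK_SIZE; x++)
        if (abs(src[x + stride] - src[x + (stride << 3)]) > 2 * QP)
            return 0;
    return 1;
}
```
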
| 345 /** | 347 /** |
| 346 * Do a vertical low pass filter on the 8x10 block (only write to the 8x8 block in the middle) | 348 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 347 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 349 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
| 348 */ | 350 */ |
| 349 static inline void doVertLowPass(uint8_t *src, int stride, int QP) | 351 static inline void doVertLowPass(uint8_t *src, int stride, int QP) |
| 350 { | 352 { |
| 351 // QP= 64; | |
| 352 | |
| 353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 353 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 354 //#ifdef HAVE_MMX2 | 354 src+= stride*3; |
| 355 asm volatile( //"movv %0 %1 %2\n\t" | 355 asm volatile( //"movv %0 %1 %2\n\t" |
| 356 "pushl %0 \n\t" | 356 "pushl %0 \n\t" |
| 357 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 357 "movq pQPb, %%mm0 \n\t" // QP,..., QP |
| 358 // "movq bFF , %%mm0 \n\t" // QP,..., QP | |
| 359 | 358 |
| 360 "movq (%0), %%mm6 \n\t" | 359 "movq (%0), %%mm6 \n\t" |
| 361 "movq (%0, %1), %%mm5 \n\t" | 360 "movq (%0, %1), %%mm5 \n\t" |
| 362 "movq %%mm5, %%mm1 \n\t" | 361 "movq %%mm5, %%mm1 \n\t" |
| 363 "movq %%mm6, %%mm2 \n\t" | 362 "movq %%mm6, %%mm2 \n\t" |
| 393 // 1 2 3 4 5 6 7 8 | 392 // 1 2 3 4 5 6 7 8 |
| 394 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 | 393 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ebx eax+4%1 |
| 395 // 6 4 2 2 1 1 | 394 // 6 4 2 2 1 1 |
| 396 // 6 4 4 2 | 395 // 6 4 4 2 |
| 397 // 6 8 2 | 396 // 6 8 2 |
| 398 /* | 397 |
| 399 "movq %%mm6, %%mm2 \n\t" //1 | |
| 400 "movq %%mm6, %%mm3 \n\t" //1 | |
| 401 "paddusb b02, %%mm3 \n\t" | |
| 402 "psrlw $2, %%mm3 \n\t" //1 /4 | |
| 403 "pand b3F, %%mm3 \n\t" | |
| 404 "psubb %%mm3, %%mm2 \n\t" | |
| 405 "movq (%0, %1), %%mm0 \n\t" // 1 | |
| 406 "movq %%mm0, %%mm1 \n\t" // 1 | |
| 407 "paddusb b02, %%mm0 \n\t" | |
| 408 "psrlw $2, %%mm0 \n\t" // 1 /4 | |
| 409 "pand b3F, %%mm0 \n\t" | |
| 410 "paddusb %%mm2, %%mm0 \n\t" //3 1 /4 | |
| 411 */ | |
| 412 "movq (%0, %1), %%mm0 \n\t" // 1 | 398 "movq (%0, %1), %%mm0 \n\t" // 1 |
| 413 "movq %%mm0, %%mm1 \n\t" // 1 | 399 "movq %%mm0, %%mm1 \n\t" // 1 |
| 414 PAVGB(%%mm6, %%mm0) //1 1 /2 | 400 PAVGB(%%mm6, %%mm0) //1 1 /2 |
| 415 PAVGB(%%mm6, %%mm0) //3 1 /4 | 401 PAVGB(%%mm6, %%mm0) //3 1 /4 |
| 416 | 402 |
| 468 PAVGB(%%mm4, %%mm5) // 11 /2 | 454 PAVGB(%%mm4, %%mm5) // 11 /2 |
| 469 PAVGB(%%mm5, %%mm0) // 11 11 /4 | 455 PAVGB(%%mm5, %%mm0) // 11 11 /4 |
| 470 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 | 456 "movq (%%eax, %1, 2), %%mm6 \n\t" // 1 |
| 471 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 | 457 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
| 472 PAVGB(%%mm0, %%mm1) // 11224222 /16 | 458 PAVGB(%%mm0, %%mm1) // 11224222 /16 |
| 473 // "pxor %%mm1, %%mm1 \n\t" | |
| 474 "movq %%mm1, (%%eax, %1, 2) \n\t" // X | 459 "movq %%mm1, (%%eax, %1, 2) \n\t" // X |
| 475 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 | 460 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
| 476 PAVGB((%%ebx), %%mm2) // 112 4 /8 | 461 PAVGB((%%ebx), %%mm2) // 112 4 /8 |
| 477 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 | 462 "movq (%%eax, %1, 4), %%mm0 \n\t" // 1 |
| 478 PAVGB(%%mm0, %%mm6) // 1 1 /2 | 463 PAVGB(%%mm0, %%mm6) // 1 1 /2 |
| 479 PAVGB(%%mm7, %%mm6) // 1 12 /4 | 464 PAVGB(%%mm7, %%mm6) // 1 12 /4 |
| 480 PAVGB(%%mm2, %%mm6) // 1122424 /4 | 465 PAVGB(%%mm2, %%mm6) // 1122424 /4 |
| 481 // "pxor %%mm6, %%mm6 \n\t" | |
| 482 "movq %%mm6, (%%ebx) \n\t" // X | 466 "movq %%mm6, (%%ebx) \n\t" // X |
| 483 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 | 467 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
| 484 PAVGB(%%mm7, %%mm5) // 11 2 /4 | 468 PAVGB(%%mm7, %%mm5) // 11 2 /4 |
| 485 PAVGB(%%mm7, %%mm5) // 11 6 /8 | 469 PAVGB(%%mm7, %%mm5) // 11 6 /8 |
| 486 | 470 |
| 487 PAVGB(%%mm3, %%mm0) // 112 /4 | 471 PAVGB(%%mm3, %%mm0) // 112 /4 |
| 488 PAVGB(%%mm0, %%mm5) // 112246 /16 | 472 PAVGB(%%mm0, %%mm5) // 112246 /16 |
| 489 // "pxor %%mm5, %%mm5 \n\t" | |
| 490 // "movq pQPb, %%mm5 \n\t" | |
| 491 "movq %%mm5, (%%eax, %1, 4) \n\t" // X | 473 "movq %%mm5, (%%eax, %1, 4) \n\t" // X |
| 492 "popl %0\n\t" | 474 "popl %0\n\t" |
| 493 | 475 |
| 494 : | 476 : |
| 495 : "r" (src), "r" (stride) | 477 : "r" (src), "r" (stride) |
| 504 const int l6= stride + l5; | 486 const int l6= stride + l5; |
| 505 const int l7= stride + l6; | 487 const int l7= stride + l6; |
| 506 const int l8= stride + l7; | 488 const int l8= stride + l7; |
| 507 const int l9= stride + l8; | 489 const int l9= stride + l8; |
| 508 int x; | 490 int x; |
| | 491 src+= stride*3; |
| 509 for(x=0; x<BLOCK_SIZE; x++) | 492 for(x=0; x<BLOCK_SIZE; x++) |
| 510 { | 493 { |
| 511 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; | 494 const int first= ABS(src[0] - src[l1]) < QP ? src[0] : src[l1]; |
| 512 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; | 495 const int last= ABS(src[l8] - src[l9]) < QP ? src[l9] : src[l8]; |
| 513 | 496 |
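
Most of the scalar 9-tap path is elided between these hunks. A direct-convolution sketch of the (1,1,2,2,4,2,2,1,1)/16 kernel, using the clamped first/last padding visible above (the +8 rounding term is an assumption):

```c
#include <stdint.h>
#include <stdlib.h>

static void doVertLowPass_sketch(uint8_t *src, int stride, int QP)
{
    int x, y;
    src += stride * 3;                 /* src[stride] is now block line 1 */
    for (x = 0; x < 8; x++) {
        int p[16], i;
        const int first = abs(src[x] - src[x + stride]) < QP
                          ? src[x] : src[x + stride];
        const int last  = abs(src[x + 8*stride] - src[x + 9*stride]) < QP
                          ? src[x + 9*stride] : src[x + 8*stride];

        /* padded column: 4 copies of first, block lines 1..8, 4 of last */
        for (i = 0; i < 4; i++)   p[i] = first;
        for (i = 1; i <= 8; i++)  p[i + 3] = src[x + i*stride];
        for (i = 12; i < 16; i++) p[i] = last;

        for (y = 0; y < 8; y++)        /* p[y+4] is the centre tap */
            src[x + (y + 1)*stride] =
                (p[y] + p[y+1] + 2*p[y+2] + 2*p[y+3] + 4*p[y+4]
                 + 2*p[y+5] + 2*p[y+6] + p[y+7] + p[y+8] + 8) >> 4;
    }
}
```
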
| 549 1 12 12 23 | 532 1 12 12 23 |
| 550 */ | 533 */ |
| 551 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) | 534 static inline void vertRK1Filter(uint8_t *src, int stride, int QP) |
| 552 { | 535 { |
| 553 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 536 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| | 537 src+= stride*3; |
| 554 // FIXME rounding | 538 // FIXME rounding |
| 555 asm volatile( | 539 asm volatile( |
| 556 "pxor %%mm7, %%mm7 \n\t" // 0 | 540 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 557 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 541 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 558 "leal (%0, %1), %%eax \n\t" | 542 "leal (%0, %1), %%eax \n\t" |
| 620 const int l6= stride + l5; | 604 const int l6= stride + l5; |
| 621 const int l7= stride + l6; | 605 const int l7= stride + l6; |
| 622 const int l8= stride + l7; | 606 const int l8= stride + l7; |
| 623 const int l9= stride + l8; | 607 const int l9= stride + l8; |
| 624 int x; | 608 int x; |
| | 609 src+= stride*3; |
| 625 for(x=0; x<BLOCK_SIZE; x++) | 610 for(x=0; x<BLOCK_SIZE; x++) |
| 626 { | 611 { |
| 627 if(ABS(src[l4]-src[l5]) < QP + QP/4) | 612 if(ABS(src[l4]-src[l5]) < QP + QP/4) |
| 628 { | 613 { |
| 629 int v = (src[l5] - src[l4]); | 614 int v = (src[l5] - src[l4]); |
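
Only the guard and the border step v survive in this hunk. As an illustration of the elided redistribution, a hedged sketch; the 1/2 and 1/8 weights below are assumptions, not necessarily the committed ones:

```c
#include <stdint.h>
#include <stdlib.h>

/* Soft-deblock sketch: when the step between l4 and l5 is small enough,
   move the border lines toward each other by half the step v and give
   the outer neighbours an eighth, flattening the block edge. */
static void vertRK1_sketch(uint8_t *src, int stride, int QP)
{
    const int l3 = 3*stride, l4 = 4*stride, l5 = 5*stride, l6 = 6*stride;
    int x;
    src += stride * 3;                  /* new offset from this changeset */
    for (x = 0; x < 8; x++, src++) {
        if (abs(src[l4] - src[l5]) < QP + QP / 4) {
            int v = src[l5] - src[l4];
            src[l3] += v / 8;           /* weights are assumptions */
            src[l4] += v / 2;
            src[l5] -= v / 2;
            src[l6] -= v / 8;
        }
    }
}
```
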
| 648 * MMX2 version does correct clipping C version doesnt | 633 * MMX2 version does correct clipping C version doesnt |
| 649 */ | 634 */ |
| 650 static inline void vertX1Filter(uint8_t *src, int stride, int QP) | 635 static inline void vertX1Filter(uint8_t *src, int stride, int QP) |
| 651 { | 636 { |
| 652 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 637 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| | 638 src+= stride*3; |
| | 639 |
| 653 asm volatile( | 640 asm volatile( |
| 654 "pxor %%mm7, %%mm7 \n\t" // 0 | 641 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 655 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 642 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 656 "leal (%0, %1), %%eax \n\t" | 643 "leal (%0, %1), %%eax \n\t" |
| 657 "leal (%%eax, %1, 4), %%ebx \n\t" | 644 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 742 const int l6= stride + l5; | 729 const int l6= stride + l5; |
| 743 const int l7= stride + l6; | 730 const int l7= stride + l6; |
| 744 const int l8= stride + l7; | 731 const int l8= stride + l7; |
| 745 const int l9= stride + l8; | 732 const int l9= stride + l8; |
| 746 int x; | 733 int x; |
| | 734 |
| | 735 src+= stride*3; |
| 747 for(x=0; x<BLOCK_SIZE; x++) | 736 for(x=0; x<BLOCK_SIZE; x++) |
| 748 { | 737 { |
| 749 int a= src[l3] - src[l4]; | 738 int a= src[l3] - src[l4]; |
| 750 int b= src[l4] - src[l5]; | 739 int b= src[l4] - src[l5]; |
| 751 int c= src[l5] - src[l6]; | 740 int c= src[l5] - src[l6]; |
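
The rest of the X1 scalar path is elided. A hedged sketch of the idea: b is the step across the block border, a and c the neighbouring gradients, and a correction bounded by d is spread over six lines (the distribution weights are assumptions; per the comment above, only the MMX2 version clips):

```c
#include <stdint.h>
#include <stdlib.h>

static void vertX1_sketch(uint8_t *src, int stride, int QP)
{
    const int l2 = 2*stride, l3 = 3*stride, l4 = 4*stride,
              l5 = 5*stride, l6 = 6*stride, l7 = 7*stride;
    int x;
    src += stride * 3;
    for (x = 0; x < 8; x++, src++) {
        int a = src[l3] - src[l4];
        int b = src[l4] - src[l5];            /* border step */
        int c = src[l5] - src[l6];
        int d = abs(b) - (abs(a) + abs(c)) / 2;
        if (d < 0) d = 0;
        if (d < QP) {
            int v = d * (b < 0 ? 1 : -1);     /* correction opposes the step */
            src[l2] += v / 8;                 /* assumed weight distribution */
            src[l3] += v / 4;
            src[l4] += 3 * v / 8;
            src[l5] -= 3 * v / 8;
            src[l6] -= v / 4;
            src[l7] -= v / 8;
        }
    }
}
```
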
| 1005 | 994 |
| 1006 | 995 |
| 1007 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) | 996 static inline void doVertDefFilter(uint8_t src[], int stride, int QP) |
| 1008 { | 997 { |
| 1009 #ifdef HAVE_MMX | 998 #ifdef HAVE_MMX |
| 1010 src+= stride; | 999 src+= stride*4; |
| 1011 //FIXME try pmul for *5 stuff | 1000 //FIXME try pmul for *5 stuff |
| 1012 // src[0]=0; | 1001 // src[0]=0; |
| 1013 asm volatile( | 1002 asm volatile( |
| 1014 "pxor %%mm7, %%mm7 \n\t" | 1003 "pxor %%mm7, %%mm7 \n\t" |
| 1015 "leal (%0, %1), %%eax \n\t" | 1004 "leal (%0, %1), %%eax \n\t" |
| 1152 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | 1141 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
| 1153 "pxor %%mm7, %%mm5 \n\t" | 1142 "pxor %%mm7, %%mm5 \n\t" |
| 1154 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | 1143 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
| 1155 // 100 opcodes | 1144 // 100 opcodes |
| 1156 "movd %2, %%mm2 \n\t" // QP | 1145 "movd %2, %%mm2 \n\t" // QP |
| 1157 //"pcmpeqb %%mm2, %%mm2\n\t" | |
| 1158 "punpcklwd %%mm2, %%mm2 \n\t" | 1146 "punpcklwd %%mm2, %%mm2 \n\t" |
| 1159 "punpcklwd %%mm2, %%mm2 \n\t" | 1147 "punpcklwd %%mm2, %%mm2 \n\t" |
| 1160 "psllw $3, %%mm2 \n\t" // 8QP | 1148 "psllw $3, %%mm2 \n\t" // 8QP |
| 1161 "movq %%mm2, %%mm3 \n\t" // 8QP | 1149 "movq %%mm2, %%mm3 \n\t" // 8QP |
| 1162 "pcmpgtw %%mm4, %%mm2 \n\t" | 1150 "pcmpgtw %%mm4, %%mm2 \n\t" |
| 1230 "movq (%%eax, %1, 2), %%mm0 \n\t" | 1218 "movq (%%eax, %1, 2), %%mm0 \n\t" |
| 1231 "paddb %%mm4, %%mm0 \n\t" | 1219 "paddb %%mm4, %%mm0 \n\t" |
| 1232 "movq %%mm0, (%%eax, %1, 2) \n\t" | 1220 "movq %%mm0, (%%eax, %1, 2) \n\t" |
| 1233 "movq (%0, %1, 4), %%mm0 \n\t" | 1221 "movq (%0, %1, 4), %%mm0 \n\t" |
| 1234 "psubb %%mm4, %%mm0 \n\t" | 1222 "psubb %%mm4, %%mm0 \n\t" |
| 1235 // "pxor %%mm0, %%mm0 \n\t" | |
| 1236 "movq %%mm0, (%0, %1, 4) \n\t" | 1223 "movq %%mm0, (%0, %1, 4) \n\t" |
| 1237 | 1224 |
| 1238 : | 1225 : |
| 1239 : "r" (src), "r" (stride), "r" (QP) | 1226 : "r" (src), "r" (stride), "r" (QP) |
| 1240 : "%eax", "%ebx" | 1227 : "%eax", "%ebx" |
| 1248 const int l6= stride + l5; | 1235 const int l6= stride + l5; |
| 1249 const int l7= stride + l6; | 1236 const int l7= stride + l6; |
| 1250 const int l8= stride + l7; | 1237 const int l8= stride + l7; |
| 1251 // const int l9= stride + l8; | 1238 // const int l9= stride + l8; |
| 1252 int x; | 1239 int x; |
| | 1240 src+= stride*3; |
| 1253 for(x=0; x<BLOCK_SIZE; x++) | 1241 for(x=0; x<BLOCK_SIZE; x++) |
| 1254 { | 1242 { |
| 1255 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); | 1243 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
| 1256 if(ABS(middleEnergy) < 8*QP) | 1244 if(ABS(middleEnergy) < 8*QP) |
| 1257 { | 1245 { |
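
The loop body after middleEnergy is elided. A hedged scalar sketch of the default filter: the middleEnergy expression and the 8*QP gate come from the visible lines, while the left/right energy comparison and the correction scaling are assumptions:

```c
#include <stdint.h>
#include <stdlib.h>

static void vertDefFilter_sketch(uint8_t *src, int stride, int QP)
{
    const int l1 = stride, l2 = l1 + stride, l3 = l2 + stride,
              l4 = l3 + stride, l5 = l4 + stride, l6 = l5 + stride,
              l7 = l6 + stride, l8 = l7 + stride;
    int x;
    src += stride * 3;
    for (x = 0; x < 8; x++, src++) {
        const int middleEnergy = 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
        if (abs(middleEnergy) < 8 * QP) {
            const int q = (src[l4] - src[l5]) / 2;
            const int leftEnergy  = 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
            const int rightEnergy = 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
            int d = abs(middleEnergy) - (abs(leftEnergy) < abs(rightEnergy)
                                         ? abs(leftEnergy) : abs(rightEnergy));
            if (d < 0) d = 0;
            d = (5 * d + 32) >> 6;            /* assumed scaling */
            if (middleEnergy > 0) d = -d;     /* correction opposes the step */
            /* clip so the correction cannot overshoot the border step q */
            if (q > 0) d = d < 0 ? 0 : (d > q ? q : d);
            else       d = d > 0 ? 0 : (d < q ? q : d);
            src[l4] -= d;
            src[l5] += d;
        }
    }
}
```
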
| 1879 #endif | 1867 #endif |
| 1880 } | 1868 } |
| 1881 | 1869 |
| 1882 /** | 1870 /** |
| 1883 * Deinterlaces the given block | 1871 * Deinterlaces the given block |
| 1884 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 1872 * will be called for every 8x8 block, and can read & write into an 8x16 block |
| 1885 */ | 1873 */ |
| 1886 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | 1874 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
| 1887 { | 1875 { |
| 1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1876 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1889 asm volatile( | 1877 asm volatile( |
| 1892 // 0 1 2 3 4 5 6 7 8 9 | 1880 // 0 1 2 3 4 5 6 7 8 9 |
| 1893 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1881 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1894 | 1882 |
| 1895 "movq (%0), %%mm0 \n\t" | 1883 "movq (%0), %%mm0 \n\t" |
| 1896 "movq (%%eax, %1), %%mm1 \n\t" | 1884 "movq (%%eax, %1), %%mm1 \n\t" |
| 1897 PAVGB(%%mm1, %%mm0)\ | 1885 PAVGB(%%mm1, %%mm0) |
| 1898 "movq %%mm0, (%%eax) \n\t" | 1886 "movq %%mm0, (%%eax) \n\t" |
| 1899 "movq (%0, %1, 4), %%mm0 \n\t" | 1887 "movq (%0, %1, 4), %%mm0 \n\t" |
| 1900 PAVGB(%%mm0, %%mm1)\ | 1888 PAVGB(%%mm0, %%mm1) |
| 1901 "movq %%mm1, (%%eax, %1, 2) \n\t" | 1889 "movq %%mm1, (%%eax, %1, 2) \n\t" |
| 1902 "movq (%%ebx, %1), %%mm1 \n\t" | 1890 "movq (%%ebx, %1), %%mm1 \n\t" |
| 1903 PAVGB(%%mm1, %%mm0)\ | 1891 PAVGB(%%mm1, %%mm0) |
| 1904 "movq %%mm0, (%%ebx) \n\t" | 1892 "movq %%mm0, (%%ebx) \n\t" |
| 1905 "movq (%0, %1, 8), %%mm0 \n\t" | 1893 "movq (%0, %1, 8), %%mm0 \n\t" |
| 1906 PAVGB(%%mm0, %%mm1)\ | 1894 PAVGB(%%mm0, %%mm1) |
| 1907 "movq %%mm1, (%%ebx, %1, 2) \n\t" | 1895 "movq %%mm1, (%%ebx, %1, 2) \n\t" |
| 1908 | 1896 |
| 1909 : : "r" (src), "r" (stride) | 1897 : : "r" (src), "r" (stride) |
| 1910 : "%eax", "%ebx" | 1898 : "%eax", "%ebx" |
| 1911 ); | 1899 ); |
| 1922 #endif | 1910 #endif |
| 1923 } | 1911 } |
| 1924 | 1912 |
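
The scalar fallback is elided here; with the LastRow variant gone, the linear interpolation over the full 8x16 window reduces to the following (matching the PAVGB sequence above, with line 7 reading the real line 8 instead of duplicating line 6):

```c
#include <stdint.h>

/* Rebuild each odd line of the 8x8 block as the average of the even
   lines above and below; the 8x16 read/write window allows line 7 to
   use line 8. */
static void deInterlaceLinear_c(uint8_t *src, int stride)
{
    int x;
    for (x = 0; x < 8; x++, src++) {
        src[stride]     = (src[0]          + src[stride * 2]) >> 1;
        src[stride * 3] = (src[stride * 2] + src[stride * 4]) >> 1;
        src[stride * 5] = (src[stride * 4] + src[stride * 6]) >> 1;
        src[stride * 7] = (src[stride * 6] + src[stride * 8]) >> 1;
    }
}
```
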
| 1925 /** | 1913 /** |
| 1926 * Deinterlaces the given block | 1914 * Deinterlaces the given block |
| 1927 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | 1915 * will be called for every 8x8 block, and can read & write into an 8x16 block |
| | 1916 * no cliping in C version |
| 1928 */ | 1917 */ |
| 1929 static inline void deInterlaceInterpolateLinearLastRow(uint8_t src[], int stride) | 1918 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) |
| 1930 { | 1919 { |
| 1931 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1920 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1932 asm volatile( | 1921 asm volatile( |
| 1933 "leal (%0, %1), %%eax \n\t" | 1922 "leal (%0, %1), %%eax \n\t" |
| 1934 "leal (%%eax, %1, 4), %%ebx \n\t" | 1923 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 1935 // 0 1 2 3 4 5 6 7 8 9 | 1924 "leal (%%ebx, %1, 4), %%ecx \n\t" |
| 1936 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1925 "addl %1, %%ecx \n\t" |
| 1937 | 1926 "pxor %%mm7, %%mm7 \n\t" |
| 1938 "movq (%0), %%mm0 \n\t" | 1927 // 0 1 2 3 4 5 6 7 8 9 10 |
| 1939 "movq (%%eax, %1), %%mm1 \n\t" | 1928 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 ecx |
| 1940 PAVGB(%%mm1, %%mm0)\ | 1929 |
| 1941 "movq %%mm0, (%%eax) \n\t" | 1930 #define DEINT_CUBIC(a,b,c,d,e)\ |
| 1942 "movq (%0, %1, 4), %%mm0 \n\t" | 1931 "movq " #a ", %%mm0 \n\t"\ |
| 1943 PAVGB(%%mm0, %%mm1)\ | 1932 "movq " #b ", %%mm1 \n\t"\ |
| 1944 "movq %%mm1, (%%eax, %1, 2) \n\t" | 1933 "movq " #d ", %%mm2 \n\t"\ |
| 1945 "movq (%%ebx, %1), %%mm1 \n\t" | 1934 "movq " #e ", %%mm3 \n\t"\ |
| 1946 PAVGB(%%mm1, %%mm0)\ | 1935 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ |
| 1947 "movq %%mm0, (%%ebx) \n\t" | 1936 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ |
| 1948 "movq %%mm1, (%%ebx, %1, 2) \n\t" | 1937 "movq %%mm0, %%mm2 \n\t"\ |
| 1949 | 1938 "punpcklbw %%mm7, %%mm0 \n\t"\ |
| | 1939 "punpckhbw %%mm7, %%mm2 \n\t"\ |
| | 1940 "movq %%mm1, %%mm3 \n\t"\ |
| | 1941 "punpcklbw %%mm7, %%mm1 \n\t"\ |
| | 1942 "punpckhbw %%mm7, %%mm3 \n\t"\ |
| | 1943 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ |
| | 1944 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ |
| | 1945 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ |
| | 1946 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ |
| | 1947 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ |
| | 1948 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ |
| | 1949 "packuswb %%mm3, %%mm1 \n\t"\ |
| | 1950 "movq %%mm1, " #c " \n\t" |
| | 1951 |
| | 1952 DEINT_CUBIC((%0), (%%eax, %1), (%%eax, %1, 2), (%0, %1, 4), (%%ebx, %1)) |
| | 1953 DEINT_CUBIC((%%eax, %1), (%0, %1, 4), (%%ebx), (%%ebx, %1), (%0, %1, 8)) |
| | 1954 DEINT_CUBIC((%0, %1, 4), (%%ebx, %1), (%%ebx, %1, 2), (%0, %1, 8), (%%ecx)) |
| | 1955 DEINT_CUBIC((%%ebx, %1), (%0, %1, 8), (%%ebx, %1, 4), (%%ecx), (%%ecx, %1, 2)) |
| 1950 | 1956 |
| 1951 : : "r" (src), "r" (stride) | 1957 : : "r" (src), "r" (stride) |
| 1952 : "%eax", "%ebx" | 1958 : "%eax", "%ebx", "ecx" |
| 1953 ); | 1959 ); |
| 1954 #else | 1960 #else |
| 1955 int x; | 1961 int x; |
| 1956 for(x=0; x<8; x++) | 1962 for(x=0; x<8; x++) |
| 1957 { | 1963 { |
| 1958 src[stride] = (src[0] + src[stride*2])>>1; | 1964 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
| 1959 src[stride*3] = (src[stride*2] + src[stride*4])>>1; | 1965 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
| 1960 src[stride*5] = (src[stride*4] + src[stride*6])>>1; | 1966 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
| 1961 src[stride*7] = src[stride*6]; | 1967 src[stride*9] = (-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4; |
| 1962 src++; | 1968 src++; |
| 1963 } | 1969 } |
| 1964 #endif | 1970 #endif |
| 1965 } | 1971 } |
| 1966 | 1972 |
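
The committed C fallback above deliberately skips clipping ('no cliping in C version'); a clipped variant of the same (-1, 9, 9, -1)/16 cubic kernel would look like this sketch:

```c
#include <stdint.h>

static inline uint8_t clip_u8(int v)
{
    return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

/* Same cubic interpolation as the C fallback above, but with the
   result clamped to the valid pixel range. */
static void deInterlaceCubic_clipped(uint8_t *src, int stride)
{
    int x, y;
    for (x = 0; x < 8; x++, src++)
        for (y = 3; y <= 9; y += 2)     /* output lines 3, 5, 7, 9 */
            src[stride * y] = clip_u8((  -src[stride * (y - 3)]
                                       + 9*src[stride * (y - 1)]
                                       + 9*src[stride * (y + 1)]
                                       -   src[stride * (y + 3)]) >> 4);
}
```
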
| 1967 /** | 1973 /** |
| 1968 * Deinterlaces the given block | 1974 * Deinterlaces the given block |
| 1969 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 1975 * will be called for every 8x8 block, and can read & write into an 8x16 block |
| 1970 * will shift the image up by 1 line (FIXME if this is a problem) | 1976 * will shift the image up by 1 line (FIXME if this is a problem) |
| 1971 */ | 1977 */ |
| 1972 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | 1978 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
| 1973 { | 1979 { |
| 1974 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1980 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 2034 #endif | 2040 #endif |
| 2035 } | 2041 } |
| 2036 | 2042 |
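
The body of deInterlaceBlendLinear is elided between these hunks; a scalar sketch consistent with the removed LastRow C code further down, except that the 8x16 window supplies real neighbours for the last two lines (that detail is an assumption):

```c
#include <stdint.h>

/* (1,2,1)/4 vertical blend; as the doc comment says, this shifts the
   image up by one line.  Lines 6 and 7 read lines 8 and 9 of the
   window instead of duplicating, which is assumed here. */
static void deInterlaceBlendLinear_sketch(uint8_t *src, int stride)
{
    int x, y;
    for (x = 0; x < 8; x++, src++)
        for (y = 0; y < 8; y++)
            src[stride * y] = (src[stride * y]
                               + 2 * src[stride * (y + 1)]
                               +     src[stride * (y + 2)]) >> 2;
}
```
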
| 2037 /** | 2043 /** |
| 2038 * Deinterlaces the given block | 2044 * Deinterlaces the given block |
| 2039 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |
| 2040 * will shift the image up by 1 line (FIXME if this is a problem) | |
| 2041 */ | |
| 2042 static inline void deInterlaceBlendLinearLastRow(uint8_t src[], int stride) | |
| 2043 { | |
| 2044 #if defined (HAVE_MMSX2) || defined (HAVE_3DNOW) | |
| 2045 asm volatile( | |
| 2046 "leal (%0, %1), %%eax \n\t" | |
| 2047 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 2048 // 0 1 2 3 4 5 6 7 8 9 | |
| 2049 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 2050 | |
| 2051 "movq (%0), %%mm0 \n\t" // L0 | |
| 2052 "movq (%%eax, %1), %%mm1 \n\t" // L2 | |
| 2053 PAVGB(%%mm1, %%mm0) // L0+L2 | |
| 2054 "movq (%%eax), %%mm2 \n\t" // L1 | |
| 2055 PAVGB(%%mm2, %%mm0) | |
| 2056 "movq %%mm0, (%0) \n\t" | |
| 2057 "movq (%%eax, %1, 2), %%mm0 \n\t" // L3 | |
| 2058 PAVGB(%%mm0, %%mm2) // L1+L3 | |
| 2059 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 | |
| 2060 "movq %%mm2, (%%eax) \n\t" | |
| 2061 "movq (%0, %1, 4), %%mm2 \n\t" // L4 | |
| 2062 PAVGB(%%mm2, %%mm1) // L2+L4 | |
| 2063 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 | |
| 2064 "movq %%mm1, (%%eax, %1) \n\t" | |
| 2065 "movq (%%ebx), %%mm1 \n\t" // L5 | |
| 2066 PAVGB(%%mm1, %%mm0) // L3+L5 | |
| 2067 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 | |
| 2068 "movq %%mm0, (%%eax, %1, 2) \n\t" | |
| 2069 "movq (%%ebx, %1), %%mm0 \n\t" // L6 | |
| 2070 PAVGB(%%mm0, %%mm2) // L4+L6 | |
| 2071 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 | |
| 2072 "movq %%mm2, (%0, %1, 4) \n\t" | |
| 2073 "movq (%%ebx, %1, 2), %%mm2 \n\t" // L7 | |
| 2074 PAVGB(%%mm2, %%mm1) // L5+L7 | |
| 2075 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 | |
| 2076 "movq %%mm1, (%%ebx) \n\t" | |
| 2077 PAVGB(%%mm2, %%mm0) // L7 + L8 | |
| 2078 "movq %%mm0, (%%ebx, %1) \n\t" | |
| 2079 "movq %%mm0, (%%ebx, %1, 2) \n\t" | |
| 2080 | |
| 2081 : : "r" (src), "r" (stride) | |
| 2082 : "%eax", "%ebx" | |
| 2083 ); | |
| 2084 #else | |
| 2085 int x; | |
| 2086 for(x=0; x<8; x++) | |
| 2087 { | |
| 2088 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |
| 2089 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |
| 2090 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |
| 2091 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |
| 2092 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |
| 2093 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |
| 2094 src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |
| 2095 src[stride*7] = src[stride*6]; | |
| 2096 src++; | |
| 2097 } | |
| 2098 #endif | |
| 2099 } | |
| 2100 | |
| 2101 /** | |
| 2102 * Deinterlaces the given block | |
| 2103 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 2045 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block |
| 2104 */ | 2046 */ |
| 2105 static inline void deInterlaceMedian(uint8_t src[], int stride) | 2047 static inline void deInterlaceMedian(uint8_t src[], int stride) |
| 2106 { | 2048 { |
| 2107 #ifdef HAVE_MMX | 2049 #ifdef HAVE_MMX |
| 2211 src++; | 2153 src++; |
| 2212 } | 2154 } |
| 2213 #endif | 2155 #endif |
| 2214 } | 2156 } |
| 2215 | 2157 |
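
The pmaxub/pminub ladders compute a per-pixel median of three vertically adjacent lines. In scalar form (the exact pairing of lines per 8x8 block is an assumption):

```c
#include <stdint.h>

/* median(a,b,c) = min(max(a,b), max(min(a,b), c)) -- the same identity
   the MMX2 pmaxub/pminub sequence evaluates. */
static inline uint8_t median3(uint8_t a, uint8_t b, uint8_t c)
{
    uint8_t mx = a > b ? a : b;
    uint8_t mn = a < b ? a : b;
    uint8_t t  = mn > c ? mn : c;       /* max(min(a,b), c) */
    return mx < t ? mx : t;             /* min(max(a,b), t) */
}

static void deInterlaceMedian_sketch(uint8_t *src, int stride)
{
    int x, y;
    for (x = 0; x < 8; x++, src++)
        for (y = 1; y < 8; y += 2)      /* odd lines 1, 3, 5, 7 (assumed) */
            src[stride * y] = median3(src[stride * (y - 1)],
                                      src[stride * y],
                                      src[stride * (y + 1)]);
}
```
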
| 2216 /** | |
| 2217 * Deinterlaces the given block | |
| 2218 * will be called for every 8x8 block, in the last row, and can read & write into an 8x8 block | |
| 2219 */ | |
| 2220 static inline void deInterlaceMedianLastRow(uint8_t src[], int stride) | |
| 2221 { | |
| 2222 #ifdef HAVE_MMX | |
| 2223 #ifdef HAVE_MMX2 | |
| 2224 asm volatile( | |
| 2225 "leal (%0, %1), %%eax \n\t" | |
| 2226 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 2227 // 0 1 2 3 4 5 6 7 8 9 | |
| 2228 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 2229 | |
| 2230 "movq (%0), %%mm0 \n\t" // | |
| 2231 "movq (%%eax, %1), %%mm2 \n\t" // | |
| 2232 "movq (%%eax), %%mm1 \n\t" // | |
| 2233 "movq %%mm0, %%mm3 \n\t" | |
| 2234 "pmaxub %%mm1, %%mm0 \n\t" // | |
| 2235 "pminub %%mm3, %%mm1 \n\t" // | |
| 2236 "pmaxub %%mm2, %%mm1 \n\t" // | |
| 2237 "pminub %%mm1, %%mm0 \n\t" | |
| 2238 "movq %%mm0, (%%eax) \n\t" | |
| 2239 | |
| 2240 "movq (%0, %1, 4), %%mm0 \n\t" // | |
| 2241 "movq (%%eax, %1, 2), %%mm1 \n\t" // | |
| 2242 "movq %%mm2, %%mm3 \n\t" | |
| 2243 "pmaxub %%mm1, %%mm2 \n\t" // | |
| 2244 "pminub %%mm3, %%mm1 \n\t" // | |
| 2245 "pmaxub %%mm0, %%mm1 \n\t" // | |
| 2246 "pminub %%mm1, %%mm2 \n\t" | |
| 2247 "movq %%mm2, (%%eax, %1, 2) \n\t" | |
| 2248 | |
| 2249 "movq (%%ebx), %%mm2 \n\t" // | |
| 2250 "movq (%%ebx, %1), %%mm1 \n\t" // | |
| 2251 "movq %%mm2, %%mm3 \n\t" | |
| 2252 "pmaxub %%mm0, %%mm2 \n\t" // | |
| 2253 "pminub %%mm3, %%mm0 \n\t" // | |
| 2254 "pmaxub %%mm1, %%mm0 \n\t" // | |
| 2255 "pminub %%mm0, %%mm2 \n\t" | |
| 2256 "movq %%mm2, (%%ebx) \n\t" | |
| 2257 | |
| 2258 "movq %%mm1, (%%ebx, %1, 2) \n\t" | |
| 2259 | |
| 2260 : : "r" (src), "r" (stride) | |
| 2261 : "%eax", "%ebx" | |
| 2262 ); | |
| 2263 #else //MMX & no MMX2 | |
| 2264 asm volatile( | |
| 2265 "leal (%0, %1), %%eax \n\t" | |
| 2266 "leal (%%eax, %1, 4), %%ebx \n\t" | |
| 2267 // 0 1 2 3 4 5 6 7 8 9 | |
| 2268 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | |
| 2269 "pxor %%mm7, %%mm7 \n\t" | |
| 2270 | |
| 2271 MEDIAN((%0), (%%eax), (%%eax, %1)) | |
| 2272 MEDIAN((%%eax, %1), (%%eax, %1, 2), (%0, %1, 4)) | |
| 2273 MEDIAN((%0, %1, 4), (%%ebx), (%%ebx, %1)) | |
| 2274 | |
| 2275 "movq (%%ebx, %1), %%mm0 \n\t" | |
| 2276 "movq %%mm0, (%%ebx, %1, 2) \n\t" | |
| 2277 | |
| 2278 : : "r" (src), "r" (stride) | |
| 2279 : "%eax", "%ebx" | |
| 2280 ); | |
| 2281 | |
| 2282 #endif //MMX | |
| 2283 #else | |
| 2284 //FIXME | |
| 2285 int x; | |
| 2286 for(x=0; x<8; x++) | |
| 2287 { | |
| 2288 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | |
| 2289 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | |
| 2290 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | |
| 2291 src[stride*3] = (src[stride*3] + 2*src[stride*4] + src[stride*5])>>2; | |
| 2292 src[stride*4] = (src[stride*4] + 2*src[stride*5] + src[stride*6])>>2; | |
| 2293 src[stride*5] = (src[stride*5] + 2*src[stride*6] + src[stride*7])>>2; | |
| 2294 src[stride*6] = (src[stride*6] + src[stride*7])>>1; | |
| 2295 src[stride*7] = src[stride*6]; | |
| 2296 src++; | |
| 2297 } | |
| 2298 #endif | |
| 2299 } | |
| 2300 | |
| 2301 #ifdef HAVE_ODIVX_POSTPROCESS | 2158 #ifdef HAVE_ODIVX_POSTPROCESS |
| 2302 #include "../opendivx/postprocess.h" | 2159 #include "../opendivx/postprocess.h" |
| 2303 int use_old_pp=0; | 2160 int use_old_pp=0; |
| 2304 #endif | 2161 #endif |
| 2305 | 2162 |
| 2535 /* we need 64bit here otherwise we'll going to have a problem | 2392 /* we need 64bit here otherwise we'll going to have a problem |
| 2536 after watching a black picture for 5 hours*/ | 2393 after watching a black picture for 5 hours*/ |
| 2537 static uint64_t *yHistogram= NULL; | 2394 static uint64_t *yHistogram= NULL; |
| 2538 int black=0, white=255; // blackest black and whitest white in the picture | 2395 int black=0, white=255; // blackest black and whitest white in the picture |
| 2539 | 2396 |
| | 2397 /* Temporary buffers for handling the last row(s) */ |
| | 2398 static uint8_t *tempDst= NULL; |
| | 2399 static uint8_t *tempSrc= NULL; |
| | 2400 |
| 2540 #ifdef TIMING | 2401 #ifdef TIMING |
| 2541 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; | 2402 long long T0, T1, memcpyTime=0, vertTime=0, horizTime=0, sumTime, diffTime=0; |
| 2542 sumTime= rdtsc(); | 2403 sumTime= rdtsc(); |
| 2543 #endif | 2404 #endif |
| | 2405 |
| | 2406 if(tempDst==NULL) |
| | 2407 { |
| | 2408 tempDst= (uint8_t*)memalign(8, 1024*24); |
| | 2409 tempSrc= (uint8_t*)memalign(8, 1024*24); |
| | 2410 } |
| 2544 | 2411 |
| 2545 if(!yHistogram) | 2412 if(!yHistogram) |
| 2546 { | 2413 { |
| 2547 int i; | 2414 int i; |
| 2548 yHistogram= (uint64_t*)malloc(8*256); | 2415 yHistogram= (uint64_t*)malloc(8*256); |
| 2567 // printf("%d ", yHistogram[i]); | 2434 // printf("%d ", yHistogram[i]); |
| 2568 } | 2435 } |
| 2569 // printf("\n\n"); | 2436 // printf("\n\n"); |
| 2570 | 2437 |
| 2571 /* we allways get a completly black picture first */ | 2438 /* we allways get a completly black picture first */ |
| 2572 | |
| 2573 maxClipped= (uint64_t)(sum * maxClippedThreshold); | 2439 maxClipped= (uint64_t)(sum * maxClippedThreshold); |
| 2574 | 2440 |
| 2575 clipped= sum; | 2441 clipped= sum; |
| 2576 for(black=255; black>0; black--) | 2442 for(black=255; black>0; black--) |
| 2577 { | 2443 { |
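
The bodies of the black/white search loops are elided in this hunk; presumably they subtract histogram bins until the clipped mass falls below maxClipped, along these lines (names reuse the visible ones; the loop body itself is an assumption):

```c
#include <stdint.h>

/* Walk the luminance histogram from one end, peeling bins off until
   the mass that would be clipped drops below maxClipped; the returned
   bin is the detected level (blackest black / whitest white). */
static int findLevel(const uint64_t hist[256], uint64_t sum,
                     double maxClippedThreshold)
{
    uint64_t maxClipped = (uint64_t)(sum * maxClippedThreshold);
    uint64_t clipped = sum;
    int level;
    for (level = 255; level > 0; level--) {
        if (clipped < maxClipped) break;   /* enough mass clipped away */
        clipped -= hist[level];
    }
    return level;
}
```
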
| 2602 { | 2468 { |
| 2603 packedYScale= 0x0100010001000100LL; | 2469 packedYScale= 0x0100010001000100LL; |
| 2604 packedYOffset= 0; | 2470 packedYOffset= 0; |
| 2605 } | 2471 } |
| 2606 | 2472 |
| | 2473 /* copy first row of 8x8 blocks */ |
| 2607 for(x=0; x<width; x+=BLOCK_SIZE) | 2474 for(x=0; x<width; x+=BLOCK_SIZE) |
| 2608 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); | 2475 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); |
| 2609 | 2476 |
| 2610 for(y=0; y<height-7; y+=BLOCK_SIZE) | 2477 for(y=0; y<height; y+=BLOCK_SIZE) |
| 2611 { | 2478 { |
| 2612 //1% speedup if these are here instead of the inner loop | 2479 //1% speedup if these are here instead of the inner loop |
| 2613 uint8_t *srcBlock= &(src[y*srcStride]); | 2480 uint8_t *srcBlock= &(src[y*srcStride]); |
| 2614 uint8_t *dstBlock= &(dst[y*dstStride]); | 2481 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 2615 uint8_t *vertSrcBlock= &(srcBlock[srcStride*3]); // Blocks are 10x8 -> *3 to start | 2482 |
| 2616 uint8_t *vertBlock= &(dstBlock[dstStride*3]); | 2483 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards, if not |
| | 2484 than use a temporary buffer */ |
| | 2485 if(y+15 >= height) |
| | 2486 { |
| | 2487 /* copy from line 5 to 12 of src, these will be copied with |
| | 2488 blockcopy to dst later */ |
| | 2489 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, |
| | 2490 srcStride*MAX(height-y-5, 0) ); |
| | 2491 |
| | 2492 /* duplicate last line to fill the void upto line 12 */ |
| | 2493 if(y+12 >= height) |
| | 2494 { |
| | 2495 int i; |
| | 2496 for(i=height-y; i<=12; i++) |
| | 2497 memcpy(tempSrc + srcStride*i, |
| | 2498 src + srcStride*(height-1), srcStride); |
| | 2499 } |
| | 2500 |
| | 2501 |
| | 2502 /* copy up to 5 lines of dst */ |
| | 2503 memcpy(tempDst, dstBlock, dstStride*MIN(height-y, 5) ); |
| | 2504 dstBlock= tempDst; |
| | 2505 srcBlock= tempSrc; |
| | 2506 } |
| 2617 | 2507 |
| 2618 // finish 1 block before the next otherwise we'll might have a problem | 2508 // finish 1 block before the next otherwise we'll might have a problem |
| 2619 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 2509 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
| 2620 for(x=0; x<width; x+=BLOCK_SIZE) | 2510 for(x=0; x<width; x+=BLOCK_SIZE) |
| 2621 { | 2511 { |
| 2623 int QP= isColor ? | 2513 int QP= isColor ? |
| 2624 QPs[(y>>3)*QPStride + (x>>3)]: | 2514 QPs[(y>>3)*QPStride + (x>>3)]: |
| 2625 QPs[(y>>4)*QPStride + (x>>4)]; | 2515 QPs[(y>>4)*QPStride + (x>>4)]; |
| 2626 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; | 2516 if(!isColor && (mode & LEVEL_FIX)) QP= (QP* (packedYScale &0xFFFF))>>8; |
| 2627 #ifdef HAVE_MMX | 2517 #ifdef HAVE_MMX |
| 2628 asm volatile( | 2518 asm volatile( |
| 2629 "movd %0, %%mm7 \n\t" | 2519 "movd %0, %%mm7 \n\t" |
| 2630 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2520 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 2631 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 2521 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
| 2632 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | 2522 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
| 2633 "movq %%mm7, pQPb \n\t" | 2523 "movq %%mm7, pQPb \n\t" |
| 2634 : : "r" (QP) | 2524 : : "r" (QP) |
| 2635 ); | 2525 ); |
| 2636 #endif | 2526 #endif |
| 2637 | 2527 |
| 2638 | |
| 2639 if(y + 12 < height) | |
| 2640 { | |
| 2641 #ifdef MORE_TIMING | 2528 #ifdef MORE_TIMING |
| 2642 T0= rdtsc(); | 2529 T0= rdtsc(); |
| 2643 #endif | 2530 #endif |
| 2644 | 2531 |
| 2645 #ifdef HAVE_MMX2 | 2532 #ifdef HAVE_MMX2 |
| 2646 prefetchnta(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 2533 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 2647 prefetchnta(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 2534 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
| 2648 prefetcht0(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 2535 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 2649 prefetcht0(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 2536 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 2650 #elif defined(HAVE_3DNOW) | 2537 #elif defined(HAVE_3DNOW) |
| 2651 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... | 2538 //FIXME check if this is faster on an 3dnow chip or if its faster without the prefetch or ... |
| 2652 /* prefetch(vertSrcBlock + (((x>>3)&3) + 2)*srcStride + 32); | 2539 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 2653 prefetch(vertSrcBlock + (((x>>3)&3) + 6)*srcStride + 32); | 2540 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
| 2654 prefetchw(vertBlock + (((x>>3)&3) + 2)*dstStride + 32); | 2541 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 2655 prefetchw(vertBlock + (((x>>3)&3) + 6)*dstStride + 32); | 2542 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 2656 */ | 2543 */ |
| 2657 #endif | 2544 #endif |
| 2658 if(!isColor) yHistogram[ srcBlock[0] ]++; | 2545 |
| 2659 | 2546 if(!isColor) yHistogram[ srcBlock[srcStride*5] ]++; |
| 2660 blockCopy(vertBlock + dstStride*2, dstStride, | 2547 |
| 2661 vertSrcBlock + srcStride*2, srcStride, 8, mode & LEVEL_FIX); | 2548 blockCopy(dstBlock + dstStride*5, dstStride, |
| 2662 | 2549 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); |
| 2663 if(mode & LINEAR_IPOL_DEINT_FILTER) | 2550 |
| 2664 deInterlaceInterpolateLinear(dstBlock, dstStride); | 2551 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 2665 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 2552 deInterlaceInterpolateLinear(dstBlock, dstStride); |
| 2666 deInterlaceBlendLinear(dstBlock, dstStride); | 2553 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 2667 else if(mode & MEDIAN_DEINT_FILTER) | 2554 deInterlaceBlendLinear(dstBlock, dstStride); |
| 2668 deInterlaceMedian(dstBlock, dstStride); | 2555 else if(mode & MEDIAN_DEINT_FILTER) |
| 2669 /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | 2556 deInterlaceMedian(dstBlock, dstStride); |
| 2670 deInterlaceInterpolateCubic(dstBlock, dstStride); | 2557 else if(mode & CUBIC_IPOL_DEINT_FILTER) |
| 2671 else if(mode & CUBIC_BLEND_DEINT_FILTER) | 2558 deInterlaceInterpolateCubic(dstBlock, dstStride); |
| 2672 deInterlaceBlendCubic(dstBlock, dstStride); | 2559 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
| | 2560 deInterlaceBlendCubic(dstBlock, dstStride); |
| 2673 */ | 2561 */ |
| 2674 | 2562 |
| | 2563 /* only deblock if we have 2 blocks */ |
| | 2564 if(y + 8 < height) |
| | 2565 { |
| 2675 #ifdef MORE_TIMING | 2566 #ifdef MORE_TIMING |
| 2676 T1= rdtsc(); | 2567 T1= rdtsc(); |
| 2677 memcpyTime+= T1-T0; | 2568 memcpyTime+= T1-T0; |
| 2678 T0=T1; | 2569 T0=T1; |
| 2679 #endif | 2570 #endif |
| 2680 if(mode & V_DEBLOCK) | 2571 if(mode & V_DEBLOCK) |
| 2681 { | 2572 { |
| 2682 if(mode & V_RK1_FILTER) | 2573 if(mode & V_RK1_FILTER) |
| 2683 vertRK1Filter(vertBlock, stride, QP); | 2574 vertRK1Filter(dstBlock, stride, QP); |
| 2684 else if(mode & V_X1_FILTER) | 2575 else if(mode & V_X1_FILTER) |
| 2685 vertX1Filter(vertBlock, stride, QP); | 2576 vertX1Filter(dstBlock, stride, QP); |
| 2686 else | 2577 else |
| 2687 { | 2578 { |
| 2688 if( isVertDC(vertBlock, stride)) | 2579 if( isVertDC(dstBlock, stride)) |
| 2689 { | 2580 { |
| 2690 if(isVertMinMaxOk(vertBlock, stride, QP)) | 2581 if(isVertMinMaxOk(dstBlock, stride, QP)) |
| 2691 doVertLowPass(vertBlock, stride, QP); | 2582 doVertLowPass(dstBlock, stride, QP); |
| 2692 } | 2583 } |
| 2693 else | 2584 else |
| 2694 doVertDefFilter(vertBlock, stride, QP); | 2585 doVertDefFilter(dstBlock, stride, QP); |
| 2695 } | 2586 } |
| 2696 } | 2587 } |
| 2697 #ifdef MORE_TIMING | 2588 #ifdef MORE_TIMING |
| 2698 T1= rdtsc(); | 2589 T1= rdtsc(); |
| 2699 vertTime+= T1-T0; | 2590 vertTime+= T1-T0; |
| 2700 T0=T1; | 2591 T0=T1; |
| 2701 #endif | 2592 #endif |
| 2702 } | 2593 } |
| 2703 else | 2594 |
| 2704 { | 2595 /* check if we have a previous block to deblock it with dstBlock */ |
| 2705 blockCopy(vertBlock + dstStride*1, dstStride, | |
| 2706 vertSrcBlock + srcStride*1, srcStride, 4, mode & LEVEL_FIX); | |
| 2707 | |
| 2708 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 2709 deInterlaceInterpolateLinearLastRow(dstBlock, dstStride); | |
| 2710 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
| 2711 deInterlaceBlendLinearLastRow(dstBlock, dstStride); | |
| 2712 else if(mode & MEDIAN_DEINT_FILTER) | |
| 2713 deInterlaceMedianLastRow(dstBlock, dstStride); | |
| 2714 /* else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
| 2715 deInterlaceInterpolateCubicLastRow(dstBlock, dstStride); | |
| 2716 else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
| 2717 deInterlaceBlendCubicLastRow(dstBlock, dstStride); | |
| 2718 */ | |
| 2719 } | |
| 2720 | |
| 2721 if(x - 8 >= 0 && x<width) | 2596 if(x - 8 >= 0 && x<width) |
| 2722 { | 2597 { |
| 2723 #ifdef MORE_TIMING | 2598 #ifdef MORE_TIMING |
| 2724 T0= rdtsc(); | 2599 T0= rdtsc(); |
| 2725 #endif | 2600 #endif |
| 2747 } | 2622 } |
| 2748 else if(y!=0) | 2623 else if(y!=0) |
| 2749 dering(dstBlock - stride*9 + width-9, stride, QP); | 2624 dering(dstBlock - stride*9 + width-9, stride, QP); |
| 2750 //FIXME dering filter will not be applied to last block (bottom right) | 2625 //FIXME dering filter will not be applied to last block (bottom right) |
| 2751 | 2626 |
| 2752 | |
| 2753 dstBlock+=8; | 2627 dstBlock+=8; |
| 2754 srcBlock+=8; | 2628 srcBlock+=8; |
| 2755 vertBlock+=8; | 2629 } |
| 2756 vertSrcBlock+=8; | 2630 |
| | 2631 /* did we use a tmp buffer */ |
| | 2632 if(y+15 > height) |
| | 2633 { |
| | 2634 uint8_t *dstBlock= &(dst[y*dstStride]); |
| | 2635 memcpy(dstBlock, tempDst, dstStride*(height-y) ); |
| 2757 } | 2636 } |
| 2758 } | 2637 } |
| 2759 #ifdef HAVE_3DNOW | 2638 #ifdef HAVE_3DNOW |
| 2760 asm volatile("femms"); | 2639 asm volatile("femms"); |
| 2761 #elif defined (HAVE_MMX) | 2640 #elif defined (HAVE_MMX) |
| 2770 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), | 2649 (int)(memcpyTime/1000), (int)(vertTime/1000), (int)(horizTime/1000), |
| 2771 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) | 2650 (int)(sumTime/1000), (int)((sumTime-memcpyTime-vertTime-horizTime)/1000) |
| 2772 , black, white); | 2651 , black, white); |
| 2773 #endif | 2652 #endif |
| 2774 } | 2653 } |
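
Taken together, the new tempSrc/tempDst logic replaces the removed LastRow filter variants with a single strategy: pad the bottom border so every filter can assume a full 8x16 window. A condensed sketch of that idea (the helper name and exact line counts are illustrative, not the committed code):

```c
#include <stdint.h>
#include <string.h>

#define PP_MIN(a,b) ((a) < (b) ? (a) : (b))

/* Pad the last block row by duplicating the picture's final line into a
   temporary buffer, run the normal 8x16-window filters there, then copy
   only the lines that really exist back into the destination. */
static void filterBottomEdge(uint8_t *dst, int dstStride, int y, int height,
                             uint8_t *tempDst /* at least 16*dstStride bytes */)
{
    int i;
    memcpy(tempDst, dst + y * dstStride,
           dstStride * PP_MIN(height - y, 16));
    for (i = height - y; i < 16; i++)           /* duplicate the last line */
        memcpy(tempDst + i * dstStride,
               dst + (height - 1) * dstStride, dstStride);

    /* ... run the deinterlace/deblock filters on tempDst here ... */

    memcpy(dst + y * dstStride, tempDst, dstStride * (height - y));
}
```
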
| 2775 | |
| 2776 | |