Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 210:c2b6d68a0671 libavcodec
mangle for win32 in postproc
| author | atmos4 |
|---|---|
| date | Sat, 19 Jan 2002 05:14:46 +0000 |
| parents | 3ccd74a91074 |
| children | f1074f0d4969 |
comparison
equal
deleted
inserted
replaced
| 209:c0d8ecae7ac5 | 210:c2b6d68a0671 |
|---|---|
| 58 asm volatile( | 58 asm volatile( |
| 59 "leal (%1, %2), %%eax \n\t" | 59 "leal (%1, %2), %%eax \n\t" |
| 60 "leal (%%eax, %2, 4), %%ebx \n\t" | 60 "leal (%%eax, %2, 4), %%ebx \n\t" |
| 61 // 0 1 2 3 4 5 6 7 8 9 | 61 // 0 1 2 3 4 5 6 7 8 9 |
| 62 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 | 62 // %1 eax eax+%2 eax+2%2 %1+4%2 ebx ebx+%2 ebx+2%2 %1+8%2 ebx+4%2 |
| 63 "movq mmxDCOffset, %%mm7 \n\t" // mm7 = 0x7F | 63 "movq "MANGLE(mmxDCOffset)", %%mm7 \n\t" // mm7 = 0x7F |
| 64 "movq mmxDCThreshold, %%mm6 \n\t" // mm6 = 0x7D | 64 "movq "MANGLE(mmxDCThreshold)", %%mm6 \n\t" // mm6 = 0x7D |
| 65 "movq (%1), %%mm0 \n\t" | 65 "movq (%1), %%mm0 \n\t" |
| 66 "movq (%%eax), %%mm1 \n\t" | 66 "movq (%%eax), %%mm1 \n\t" |
| 67 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | 67 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
| 68 "paddb %%mm7, %%mm0 \n\t" | 68 "paddb %%mm7, %%mm0 \n\t" |
| 69 "pcmpgtb %%mm6, %%mm0 \n\t" | 69 "pcmpgtb %%mm6, %%mm0 \n\t" |
| 169 "movq %%mm0, %%mm2 \n\t" | 169 "movq %%mm0, %%mm2 \n\t" |
| 170 "psubusb %%mm1, %%mm0 \n\t" | 170 "psubusb %%mm1, %%mm0 \n\t" |
| 171 "psubusb %%mm2, %%mm1 \n\t" | 171 "psubusb %%mm2, %%mm1 \n\t" |
| 172 "por %%mm1, %%mm0 \n\t" // ABS Diff | 172 "por %%mm1, %%mm0 \n\t" // ABS Diff |
| 173 | 173 |
| 174 "movq pQPb, %%mm7 \n\t" // QP,..., QP | 174 "movq "MANGLE(pQPb)", %%mm7 \n\t" // QP,..., QP |
| 175 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | 175 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
| 176 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 | 176 "psubusb %%mm7, %%mm0 \n\t" // Diff <= 2QP -> 0 |
| 177 "pcmpeqd b00, %%mm0 \n\t" | 177 "pcmpeqd "MANGLE(b00)", %%mm0 \n\t" |
| 178 "psrlq $16, %%mm0 \n\t" | 178 "psrlq $16, %%mm0 \n\t" |
| 179 "pcmpeqd bFF, %%mm0 \n\t" | 179 "pcmpeqd "MANGLE(bFF)", %%mm0 \n\t" |
| 180 // "movd %%mm0, (%1, %2, 4)\n\t" | 180 // "movd %%mm0, (%1, %2, 4)\n\t" |
| 181 "movd %%mm0, %0 \n\t" | 181 "movd %%mm0, %0 \n\t" |
| 182 : "=r" (isOk) | 182 : "=r" (isOk) |
| 183 : "r" (src), "r" (stride) | 183 : "r" (src), "r" (stride) |
| 184 ); | 184 ); |
| 217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) | 217 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, int QP) |
| 218 { | 218 { |
| 219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 219 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 220 src+= stride*3; | 220 src+= stride*3; |
| 221 asm volatile( //"movv %0 %1 %2\n\t" | 221 asm volatile( //"movv %0 %1 %2\n\t" |
| 222 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 222 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 223 | 223 |
| 224 "movq (%0), %%mm6 \n\t" | 224 "movq (%0), %%mm6 \n\t" |
| 225 "movq (%0, %1), %%mm5 \n\t" | 225 "movq (%0, %1), %%mm5 \n\t" |
| 226 "movq %%mm5, %%mm1 \n\t" | 226 "movq %%mm5, %%mm1 \n\t" |
| 227 "movq %%mm6, %%mm2 \n\t" | 227 "movq %%mm6, %%mm2 \n\t" |
| 228 "psubusb %%mm6, %%mm5 \n\t" | 228 "psubusb %%mm6, %%mm5 \n\t" |
| 229 "psubusb %%mm1, %%mm2 \n\t" | 229 "psubusb %%mm1, %%mm2 \n\t" |
| 230 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | 230 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
| 231 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | 231 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 232 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | 232 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF |
| 233 | 233 |
| 234 "pand %%mm2, %%mm6 \n\t" | 234 "pand %%mm2, %%mm6 \n\t" |
| 235 "pandn %%mm1, %%mm2 \n\t" | 235 "pandn %%mm1, %%mm2 \n\t" |
| 236 "por %%mm2, %%mm6 \n\t"// First Line to Filter | 236 "por %%mm2, %%mm6 \n\t"// First Line to Filter |
| 237 | 237 |
| 245 "movq %%mm7, %%mm2 \n\t" | 245 "movq %%mm7, %%mm2 \n\t" |
| 246 "psubusb %%mm7, %%mm5 \n\t" | 246 "psubusb %%mm7, %%mm5 \n\t" |
| 247 "psubusb %%mm1, %%mm2 \n\t" | 247 "psubusb %%mm1, %%mm2 \n\t" |
| 248 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | 248 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
| 249 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 | 249 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
| 250 "pcmpeqb b00, %%mm2 \n\t" // diff <= QP -> FF | 250 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // diff <= QP -> FF |
| 251 | 251 |
| 252 "pand %%mm2, %%mm7 \n\t" | 252 "pand %%mm2, %%mm7 \n\t" |
| 253 "pandn %%mm1, %%mm2 \n\t" | 253 "pandn %%mm1, %%mm2 \n\t" |
| 254 "por %%mm2, %%mm7 \n\t" // First Line to Filter | 254 "por %%mm2, %%mm7 \n\t" // First Line to Filter |
| 255 | 255 |
| 401 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 401 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 402 src+= stride*3; | 402 src+= stride*3; |
| 403 // FIXME rounding | 403 // FIXME rounding |
| 404 asm volatile( | 404 asm volatile( |
| 405 "pxor %%mm7, %%mm7 \n\t" // 0 | 405 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 406 "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 406 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 407 "leal (%0, %1), %%eax \n\t" | 407 "leal (%0, %1), %%eax \n\t" |
| 408 "leal (%%eax, %1, 4), %%ebx \n\t" | 408 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 409 // 0 1 2 3 4 5 6 7 8 9 | 409 // 0 1 2 3 4 5 6 7 8 9 |
| 410 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 410 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 411 "movq pQPb, %%mm0 \n\t" // QP,..., QP | 411 "movq "MANGLE(pQPb)", %%mm0 \n\t" // QP,..., QP |
| 412 "movq %%mm0, %%mm1 \n\t" // QP,..., QP | 412 "movq %%mm0, %%mm1 \n\t" // QP,..., QP |
| 413 "paddusb b02, %%mm0 \n\t" | 413 "paddusb "MANGLE(b02)", %%mm0 \n\t" |
| 414 "psrlw $2, %%mm0 \n\t" | 414 "psrlw $2, %%mm0 \n\t" |
| 415 "pand b3F, %%mm0 \n\t" // QP/4,..., QP/4 | 415 "pand "MANGLE(b3F)", %%mm0 \n\t" // QP/4,..., QP/4 |
| 416 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... | 416 "paddusb %%mm1, %%mm0 \n\t" // QP*1.25 ... |
| 417 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 | 417 "movq (%0, %1, 4), %%mm2 \n\t" // line 4 |
| 418 "movq (%%ebx), %%mm3 \n\t" // line 5 | 418 "movq (%%ebx), %%mm3 \n\t" // line 5 |
| 419 "movq %%mm2, %%mm4 \n\t" // line 4 | 419 "movq %%mm2, %%mm4 \n\t" // line 4 |
| 420 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 | 420 "pcmpeqb %%mm5, %%mm5 \n\t" // -1 |
| 439 // "psubb %%mm6, %%mm2 \n\t" | 439 // "psubb %%mm6, %%mm2 \n\t" |
| 440 "movq %%mm2, (%%ebx) \n\t" | 440 "movq %%mm2, (%%ebx) \n\t" |
| 441 | 441 |
| 442 "paddb %%mm6, %%mm5 \n\t" | 442 "paddb %%mm6, %%mm5 \n\t" |
| 443 "psrlw $2, %%mm5 \n\t" | 443 "psrlw $2, %%mm5 \n\t" |
| 444 "pand b3F, %%mm5 \n\t" | 444 "pand "MANGLE(b3F)", %%mm5 \n\t" |
| 445 "psubb b20, %%mm5 \n\t" // (l5-l4)/8 | 445 "psubb "MANGLE(b20)", %%mm5 \n\t" // (l5-l4)/8 |
| 446 | 446 |
| 447 "movq (%%eax, %1, 2), %%mm2 \n\t" | 447 "movq (%%eax, %1, 2), %%mm2 \n\t" |
| 448 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 | 448 "paddb %%mm6, %%mm2 \n\t" // line 3 + 0x80 |
| 449 "paddsb %%mm5, %%mm2 \n\t" | 449 "paddsb %%mm5, %%mm2 \n\t" |
| 450 "psubb %%mm6, %%mm2 \n\t" | 450 "psubb %%mm6, %%mm2 \n\t" |
| 501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 501 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 502 src+= stride*3; | 502 src+= stride*3; |
| 503 | 503 |
| 504 asm volatile( | 504 asm volatile( |
| 505 "pxor %%mm7, %%mm7 \n\t" // 0 | 505 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 506 // "movq b80, %%mm6 \n\t" // MIN_SIGNED_BYTE | 506 // "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 507 "leal (%0, %1), %%eax \n\t" | 507 "leal (%0, %1), %%eax \n\t" |
| 508 "leal (%%eax, %1, 4), %%ebx \n\t" | 508 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 509 // 0 1 2 3 4 5 6 7 8 9 | 509 // 0 1 2 3 4 5 6 7 8 9 |
| 510 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 510 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 | 511 "movq (%%eax, %1, 2), %%mm0 \n\t" // line 3 |
| 527 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 | 527 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
| 528 "psubusb %%mm1, %%mm5 \n\t" | 528 "psubusb %%mm1, %%mm5 \n\t" |
| 529 "por %%mm5, %%mm4 \n\t" // |l4 - l5| | 529 "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
| 530 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) | 530 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
| 531 "movq %%mm4, %%mm3 \n\t" // d | 531 "movq %%mm4, %%mm3 \n\t" // d |
| 532 "psubusb pQPb, %%mm4 \n\t" | 532 "psubusb "MANGLE(pQPb)", %%mm4 \n\t" |
| 533 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 | 533 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
| 534 "psubusb b01, %%mm3 \n\t" | 534 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 535 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 | 535 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
| 536 | 536 |
| 537 PAVGB(%%mm7, %%mm3) // d/2 | 537 PAVGB(%%mm7, %%mm3) // d/2 |
| 538 "movq %%mm3, %%mm1 \n\t" // d/2 | 538 "movq %%mm3, %%mm1 \n\t" // d/2 |
| 539 PAVGB(%%mm7, %%mm3) // d/4 | 539 PAVGB(%%mm7, %%mm3) // d/4 |
| 738 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 | 738 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 |
| 739 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 | 739 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
| 740 | 740 |
| 741 | 741 |
| 742 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 | 742 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
| 743 "movq pQPb, %%mm4 \n\t" // QP //FIXME QP+1 ? | 743 "movq "MANGLE(pQPb)", %%mm4 \n\t" // QP //FIXME QP+1 ? |
| 744 "paddusb b01, %%mm4 \n\t" | 744 "paddusb "MANGLE(b01)", %%mm4 \n\t" |
| 745 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP | 745 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
| 746 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 | 746 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 |
| 747 "pand %%mm4, %%mm3 \n\t" | 747 "pand %%mm4, %%mm3 \n\t" |
| 748 | 748 |
| 749 "movq %%mm3, %%mm1 \n\t" | 749 "movq %%mm3, %%mm1 \n\t" |
| 750 // "psubusb b01, %%mm3 \n\t" | 750 // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 751 PAVGB(%%mm7, %%mm3) | 751 PAVGB(%%mm7, %%mm3) |
| 752 PAVGB(%%mm7, %%mm3) | 752 PAVGB(%%mm7, %%mm3) |
| 753 "paddusb %%mm1, %%mm3 \n\t" | 753 "paddusb %%mm1, %%mm3 \n\t" |
| 754 // "paddusb b01, %%mm3 \n\t" | 754 // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
| 755 | 755 |
| 756 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 | 756 "movq (%%eax, %1, 2), %%mm6 \n\t" //l3 |
| 757 "movq (%0, %1, 4), %%mm5 \n\t" //l4 | 757 "movq (%0, %1, 4), %%mm5 \n\t" //l4 |
| 758 "movq (%0, %1, 4), %%mm4 \n\t" //l4 | 758 "movq (%0, %1, 4), %%mm4 \n\t" //l4 |
| 759 "psubusb %%mm6, %%mm5 \n\t" | 759 "psubusb %%mm6, %%mm5 \n\t" |
| 762 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) | 762 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) |
| 763 "pxor %%mm6, %%mm0 \n\t" | 763 "pxor %%mm6, %%mm0 \n\t" |
| 764 "pand %%mm0, %%mm3 \n\t" | 764 "pand %%mm0, %%mm3 \n\t" |
| 765 PMINUB(%%mm5, %%mm3, %%mm0) | 765 PMINUB(%%mm5, %%mm3, %%mm0) |
| 766 | 766 |
| 767 "psubusb b01, %%mm3 \n\t" | 767 "psubusb "MANGLE(b01)", %%mm3 \n\t" |
| 768 PAVGB(%%mm7, %%mm3) | 768 PAVGB(%%mm7, %%mm3) |
| 769 | 769 |
| 770 "movq (%%eax, %1, 2), %%mm0 \n\t" | 770 "movq (%%eax, %1, 2), %%mm0 \n\t" |
| 771 "movq (%0, %1, 4), %%mm2 \n\t" | 771 "movq (%0, %1, 4), %%mm2 \n\t" |
| 772 "pxor %%mm6, %%mm0 \n\t" | 772 "pxor %%mm6, %%mm0 \n\t" |
| 794 | 794 |
| 795 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 | 795 "movq (%%eax, %1, 4), %%mm2 \n\t" // l5 |
| 796 "movq (%%eax, %1), %%mm3 \n\t" // l2 | 796 "movq (%%eax, %1), %%mm3 \n\t" // l2 |
| 797 "pxor %%mm6, %%mm2 \n\t" // -l5-1 | 797 "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
| 798 "movq %%mm2, %%mm5 \n\t" // -l5-1 | 798 "movq %%mm2, %%mm5 \n\t" // -l5-1 |
| 799 "movq b80, %%mm4 \n\t" // 128 | 799 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
| 800 "leal (%%eax, %1, 4), %%ebx \n\t" | 800 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 801 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 | 801 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
| 802 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 | 802 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 |
| 803 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 | 803 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 |
| 804 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 | 804 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 |
| 806 | 806 |
| 807 "movq (%%eax), %%mm2 \n\t" // l1 | 807 "movq (%%eax), %%mm2 \n\t" // l1 |
| 808 "pxor %%mm6, %%mm2 \n\t" // -l1-1 | 808 "pxor %%mm6, %%mm2 \n\t" // -l1-1 |
| 809 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 | 809 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 |
| 810 PAVGB((%0), %%mm1) // (l0-l3+256)/2 | 810 PAVGB((%0), %%mm1) // (l0-l3+256)/2 |
| 811 "movq b80, %%mm3 \n\t" // 128 | 811 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
| 812 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 | 812 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
| 813 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 | 813 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 |
| 814 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 | 814 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 |
| 815 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 | 815 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
| 816 | 816 |
| 817 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 | 817 PAVGB((%%ebx, %1), %%mm5) // (l6-l5+256)/2 |
| 818 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 | 818 "movq (%%ebx, %1, 2), %%mm1 \n\t" // l7 |
| 819 "pxor %%mm6, %%mm1 \n\t" // -l7-1 | 819 "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
| 820 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 | 820 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 |
| 821 "movq b80, %%mm2 \n\t" // 128 | 821 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
| 822 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 | 822 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
| 823 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 | 823 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 |
| 824 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 | 824 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 |
| 825 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 | 825 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 |
| 826 | 826 |
| 827 "movq b00, %%mm1 \n\t" // 0 | 827 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
| 828 "movq b00, %%mm5 \n\t" // 0 | 828 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 |
| 829 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 | 829 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
| 830 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 | 830 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 |
| 831 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| | 831 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| |
| 832 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| | 832 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| |
| 833 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 | 833 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 |
| 834 | 834 |
| 835 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 | 835 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 |
| 836 | 836 |
| 837 "movq b00, %%mm7 \n\t" // 0 | 837 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
| 838 "movq pQPb, %%mm2 \n\t" // QP | 838 "movq "MANGLE(pQPb)", %%mm2 \n\t" // QP |
| 839 PAVGB(%%mm6, %%mm2) // 128 + QP/2 | 839 PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
| 840 "psubb %%mm6, %%mm2 \n\t" | 840 "psubb %%mm6, %%mm2 \n\t" |
| 841 | 841 |
| 842 "movq %%mm4, %%mm1 \n\t" | 842 "movq %%mm4, %%mm1 \n\t" |
| 843 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) | 843 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) |
| 846 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 | 846 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 |
| 847 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 | 847 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 |
| 848 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 | 848 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 |
| 849 | 849 |
| 850 "movq %%mm4, %%mm3 \n\t" // d | 850 "movq %%mm4, %%mm3 \n\t" // d |
| 851 "psubusb b01, %%mm4 \n\t" | 851 "psubusb "MANGLE(b01)", %%mm4 \n\t" |
| 852 PAVGB(%%mm7, %%mm4) // d/32 | 852 PAVGB(%%mm7, %%mm4) // d/32 |
| 853 PAVGB(%%mm7, %%mm4) // (d + 32)/64 | 853 PAVGB(%%mm7, %%mm4) // (d + 32)/64 |
| 854 "paddb %%mm3, %%mm4 \n\t" // 5d/64 | 854 "paddb %%mm3, %%mm4 \n\t" // 5d/64 |
| 855 "pand %%mm2, %%mm4 \n\t" | 855 "pand %%mm2, %%mm4 \n\t" |
| 856 | 856 |
| 857 "movq b80, %%mm5 \n\t" // 128 | 857 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
| 858 "psubb %%mm0, %%mm5 \n\t" // q | 858 "psubb %%mm0, %%mm5 \n\t" // q |
| 859 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding | 859 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding |
| 860 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) | 860 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) |
| 861 "pxor %%mm7, %%mm5 \n\t" | 861 "pxor %%mm7, %%mm5 \n\t" |
| 862 | 862 |
| 989 | 989 |
| 990 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | 990 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
| 991 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | 991 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
| 992 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 992 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 993 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 993 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 994 "movq %%mm0, temp0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 994 "movq %%mm0, "MANGLE(temp0)" \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 995 "movq %%mm1, temp1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 995 "movq %%mm1, "MANGLE(temp1)" \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 996 | 996 |
| 997 "movq (%0, %1, 4), %%mm0 \n\t" | 997 "movq (%0, %1, 4), %%mm0 \n\t" |
| 998 "movq %%mm0, %%mm1 \n\t" | 998 "movq %%mm0, %%mm1 \n\t" |
| 999 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | 999 "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
| 1000 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | 1000 "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
| 1001 | 1001 |
| 1002 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | 1002 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
| 1003 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | 1003 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
| 1004 "movq %%mm2, temp2 \n\t" // L3 - L4 | 1004 "movq %%mm2, "MANGLE(temp2)" \n\t" // L3 - L4 |
| 1005 "movq %%mm3, temp3 \n\t" // H3 - H4 | 1005 "movq %%mm3, "MANGLE(temp3)" \n\t" // H3 - H4 |
| 1006 "paddw %%mm4, %%mm4 \n\t" // 2L2 | 1006 "paddw %%mm4, %%mm4 \n\t" // 2L2 |
| 1007 "paddw %%mm5, %%mm5 \n\t" // 2H2 | 1007 "paddw %%mm5, %%mm5 \n\t" // 2H2 |
| 1008 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | 1008 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
| 1009 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | 1009 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
| 1010 | 1010 |
| 1047 "paddw %%mm2, %%mm2 \n\t" // 2L7 | 1047 "paddw %%mm2, %%mm2 \n\t" // 2L7 |
| 1048 "paddw %%mm3, %%mm3 \n\t" // 2H7 | 1048 "paddw %%mm3, %%mm3 \n\t" // 2H7 |
| 1049 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | 1049 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
| 1050 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | 1050 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
| 1051 | 1051 |
| 1052 "movq temp0, %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 1052 "movq "MANGLE(temp0)", %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 1053 "movq temp1, %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 1053 "movq "MANGLE(temp1)", %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 1054 | 1054 |
| 1055 #ifdef HAVE_MMX2 | 1055 #ifdef HAVE_MMX2 |
| 1056 "movq %%mm7, %%mm6 \n\t" // 0 | 1056 "movq %%mm7, %%mm6 \n\t" // 0 |
| 1057 "psubw %%mm0, %%mm6 \n\t" | 1057 "psubw %%mm0, %%mm6 \n\t" |
| 1058 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | 1058 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
| 1136 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 | 1136 //FIXME if *5/64 is supposed to be /13 then we should use 5041 instead of 5120 |
| 1137 "pmulhw %%mm2, %%mm4 \n\t" // hd/13 | 1137 "pmulhw %%mm2, %%mm4 \n\t" // hd/13 |
| 1138 "pmulhw %%mm2, %%mm5 \n\t" // ld/13 | 1138 "pmulhw %%mm2, %%mm5 \n\t" // ld/13 |
| 1139 */ | 1139 */ |
| 1140 | 1140 |
| 1141 "movq temp2, %%mm0 \n\t" // L3 - L4 | 1141 "movq "MANGLE(temp2)", %%mm0 \n\t" // L3 - L4 |
| 1142 "movq temp3, %%mm1 \n\t" // H3 - H4 | 1142 "movq "MANGLE(temp3)", %%mm1 \n\t" // H3 - H4 |
| 1143 | 1143 |
| 1144 "pxor %%mm2, %%mm2 \n\t" | 1144 "pxor %%mm2, %%mm2 \n\t" |
| 1145 "pxor %%mm3, %%mm3 \n\t" | 1145 "pxor %%mm3, %%mm3 \n\t" |
| 1146 | 1146 |
| 1147 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | 1147 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
| 1233 | 1233 |
| 1234 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) | 1234 static inline void RENAME(dering)(uint8_t src[], int stride, int QP) |
| 1235 { | 1235 { |
| 1236 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1236 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) |
| 1237 asm volatile( | 1237 asm volatile( |
| 1238 "movq pQPb, %%mm0 \n\t" | 1238 "movq "MANGLE(pQPb)", %%mm0 \n\t" |
| 1239 "paddusb %%mm0, %%mm0 \n\t" | 1239 "paddusb %%mm0, %%mm0 \n\t" |
| 1240 "movq %%mm0, pQPb2 \n\t" | 1240 "movq %%mm0, "MANGLE(pQPb2)" \n\t" |
| 1241 | 1241 |
| 1242 "leal (%0, %1), %%eax \n\t" | 1242 "leal (%0, %1), %%eax \n\t" |
| 1243 "leal (%%eax, %1, 4), %%ebx \n\t" | 1243 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 1244 // 0 1 2 3 4 5 6 7 8 9 | 1244 // 0 1 2 3 4 5 6 7 8 9 |
| 1245 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 1245 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 1317 "paddb %%mm4, %%mm6 \n\t" | 1317 "paddb %%mm4, %%mm6 \n\t" |
| 1318 #endif | 1318 #endif |
| 1319 "movq %%mm6, %%mm0 \n\t" // max | 1319 "movq %%mm6, %%mm0 \n\t" // max |
| 1320 "psubb %%mm7, %%mm6 \n\t" // max - min | 1320 "psubb %%mm7, %%mm6 \n\t" // max - min |
| 1321 "movd %%mm6, %%ecx \n\t" | 1321 "movd %%mm6, %%ecx \n\t" |
| 1322 "cmpb deringThreshold, %%cl \n\t" | 1322 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" |
| 1323 " jb 1f \n\t" | 1323 " jb 1f \n\t" |
| 1324 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 | 1324 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
| 1325 "punpcklbw %%mm7, %%mm7 \n\t" | 1325 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1326 "punpcklbw %%mm7, %%mm7 \n\t" | 1326 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1327 "punpcklbw %%mm7, %%mm7 \n\t" | 1327 "punpcklbw %%mm7, %%mm7 \n\t" |
| 1328 "movq %%mm7, temp0 \n\t" | 1328 "movq %%mm7, "MANGLE(temp0)" \n\t" |
| 1329 | 1329 |
| 1330 "movq (%0), %%mm0 \n\t" // L10 | 1330 "movq (%0), %%mm0 \n\t" // L10 |
| 1331 "movq %%mm0, %%mm1 \n\t" // L10 | 1331 "movq %%mm0, %%mm1 \n\t" // L10 |
| 1332 "movq %%mm0, %%mm2 \n\t" // L10 | 1332 "movq %%mm0, %%mm2 \n\t" // L10 |
| 1333 "psllq $8, %%mm1 \n\t" | 1333 "psllq $8, %%mm1 \n\t" |
| 1342 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 | 1342 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 |
| 1343 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 | 1343 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 |
| 1344 "psubusb %%mm7, %%mm0 \n\t" | 1344 "psubusb %%mm7, %%mm0 \n\t" |
| 1345 "psubusb %%mm7, %%mm2 \n\t" | 1345 "psubusb %%mm7, %%mm2 \n\t" |
| 1346 "psubusb %%mm7, %%mm3 \n\t" | 1346 "psubusb %%mm7, %%mm3 \n\t" |
| 1347 "pcmpeqb b00, %%mm0 \n\t" // L10 > a ? 0 : -1 | 1347 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
| 1348 "pcmpeqb b00, %%mm2 \n\t" // L20 > a ? 0 : -1 | 1348 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 |
| 1349 "pcmpeqb b00, %%mm3 \n\t" // L00 > a ? 0 : -1 | 1349 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 |
| 1350 "paddb %%mm2, %%mm0 \n\t" | 1350 "paddb %%mm2, %%mm0 \n\t" |
| 1351 "paddb %%mm3, %%mm0 \n\t" | 1351 "paddb %%mm3, %%mm0 \n\t" |
| 1352 | 1352 |
| 1353 "movq (%%eax), %%mm2 \n\t" // L11 | 1353 "movq (%%eax), %%mm2 \n\t" // L11 |
| 1354 "movq %%mm2, %%mm3 \n\t" // L11 | 1354 "movq %%mm2, %%mm3 \n\t" // L11 |
| 1365 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 | 1365 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 |
| 1366 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 | 1366 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 |
| 1367 "psubusb %%mm7, %%mm2 \n\t" | 1367 "psubusb %%mm7, %%mm2 \n\t" |
| 1368 "psubusb %%mm7, %%mm4 \n\t" | 1368 "psubusb %%mm7, %%mm4 \n\t" |
| 1369 "psubusb %%mm7, %%mm5 \n\t" | 1369 "psubusb %%mm7, %%mm5 \n\t" |
| 1370 "pcmpeqb b00, %%mm2 \n\t" // L11 > a ? 0 : -1 | 1370 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
| 1371 "pcmpeqb b00, %%mm4 \n\t" // L21 > a ? 0 : -1 | 1371 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 |
| 1372 "pcmpeqb b00, %%mm5 \n\t" // L01 > a ? 0 : -1 | 1372 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 |
| 1373 "paddb %%mm4, %%mm2 \n\t" | 1373 "paddb %%mm4, %%mm2 \n\t" |
| 1374 "paddb %%mm5, %%mm2 \n\t" | 1374 "paddb %%mm5, %%mm2 \n\t" |
| 1375 // 0, 2, 3, 1 | 1375 // 0, 2, 3, 1 |
| 1376 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ | 1376 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
| 1377 "movq " #src ", " #sx " \n\t" /* src[0] */\ | 1377 "movq " #src ", " #sx " \n\t" /* src[0] */\ |
| 1387 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ | 1387 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ |
| 1388 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ | 1388 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ |
| 1389 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ | 1389 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ |
| 1390 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ | 1390 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ |
| 1391 PAVGB(lx, pplx) \ | 1391 PAVGB(lx, pplx) \ |
| 1392 "movq " #lx ", temp1 \n\t"\ | 1392 "movq " #lx ", "MANGLE(temp1)" \n\t"\ |
| 1393 "movq temp0, " #lx " \n\t"\ | 1393 "movq "MANGLE(temp0)", " #lx " \n\t"\ |
| 1394 "psubusb " #lx ", " #t1 " \n\t"\ | 1394 "psubusb " #lx ", " #t1 " \n\t"\ |
| 1395 "psubusb " #lx ", " #t0 " \n\t"\ | 1395 "psubusb " #lx ", " #t0 " \n\t"\ |
| 1396 "psubusb " #lx ", " #sx " \n\t"\ | 1396 "psubusb " #lx ", " #sx " \n\t"\ |
| 1397 "movq b00, " #lx " \n\t"\ | 1397 "movq "MANGLE(b00)", " #lx " \n\t"\ |
| 1398 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ | 1398 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
| 1399 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ | 1399 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ |
| 1400 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ | 1400 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ |
| 1401 "paddb " #t1 ", " #t0 " \n\t"\ | 1401 "paddb " #t1 ", " #t0 " \n\t"\ |
| 1402 "paddb " #t0 ", " #sx " \n\t"\ | 1402 "paddb " #t0 ", " #sx " \n\t"\ |
| 1403 \ | 1403 \ |
| 1404 PAVGB(plx, pplx) /* filtered */\ | 1404 PAVGB(plx, pplx) /* filtered */\ |
| 1405 "movq " #dst ", " #t0 " \n\t" /* dst */\ | 1405 "movq " #dst ", " #t0 " \n\t" /* dst */\ |
| 1406 "movq " #t0 ", " #t1 " \n\t" /* dst */\ | 1406 "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
| 1407 "psubusb pQPb2, " #t0 " \n\t"\ | 1407 "psubusb "MANGLE(pQPb2)", " #t0 " \n\t"\ |
| 1408 "paddusb pQPb2, " #t1 " \n\t"\ | 1408 "paddusb "MANGLE(pQPb2)", " #t1 " \n\t"\ |
| 1409 PMAXUB(t0, pplx)\ | 1409 PMAXUB(t0, pplx)\ |
| 1410 PMINUB(t1, pplx, t0)\ | 1410 PMINUB(t1, pplx, t0)\ |
| 1411 "paddb " #sx ", " #ppsx " \n\t"\ | 1411 "paddb " #sx ", " #ppsx " \n\t"\ |
| 1412 "paddb " #psx ", " #ppsx " \n\t"\ | 1412 "paddb " #psx ", " #ppsx " \n\t"\ |
| 1413 "#paddb b02, " #ppsx " \n\t"\ | 1413 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
| 1414 "pand b08, " #ppsx " \n\t"\ | 1414 "pand "MANGLE(b08)", " #ppsx " \n\t"\ |
| 1415 "pcmpeqb " #lx ", " #ppsx " \n\t"\ | 1415 "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
| 1416 "pand " #ppsx ", " #pplx " \n\t"\ | 1416 "pand " #ppsx ", " #pplx " \n\t"\ |
| 1417 "pandn " #dst ", " #ppsx " \n\t"\ | 1417 "pandn " #dst ", " #ppsx " \n\t"\ |
| 1418 "por " #pplx ", " #ppsx " \n\t"\ | 1418 "por " #pplx ", " #ppsx " \n\t"\ |
| 1419 "movq " #ppsx ", " #dst " \n\t"\ | 1419 "movq " #ppsx ", " #dst " \n\t"\ |
| 1420 "movq temp1, " #lx " \n\t" | 1420 "movq "MANGLE(temp1)", " #lx " \n\t" |
| 1421 | 1421 |
| 1422 /* | 1422 /* |
| 1423 0000000 | 1423 0000000 |
| 1424 1111111 | 1424 1111111 |
| 1425 | 1425 |
| 2080 "paddw %%mm5, %%mm6 \n\t" | 2080 "paddw %%mm5, %%mm6 \n\t" |
| 2081 "paddw %%mm7, %%mm6 \n\t" | 2081 "paddw %%mm7, %%mm6 \n\t" |
| 2082 "paddw %%mm6, %%mm0 \n\t" | 2082 "paddw %%mm6, %%mm0 \n\t" |
| 2083 #elif defined (FAST_L2_DIFF) | 2083 #elif defined (FAST_L2_DIFF) |
| 2084 "pcmpeqb %%mm7, %%mm7 \n\t" | 2084 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 2085 "movq b80, %%mm6 \n\t" | 2085 "movq "MANGLE(b80)", %%mm6 \n\t" |
| 2086 "pxor %%mm0, %%mm0 \n\t" | 2086 "pxor %%mm0, %%mm0 \n\t" |
| 2087 #define L2_DIFF_CORE(a, b)\ | 2087 #define L2_DIFF_CORE(a, b)\ |
| 2088 "movq " #a ", %%mm5 \n\t"\ | 2088 "movq " #a ", %%mm5 \n\t"\ |
| 2089 "movq " #b ", %%mm2 \n\t"\ | 2089 "movq " #b ", %%mm2 \n\t"\ |
| 2090 "pxor %%mm7, %%mm2 \n\t"\ | 2090 "pxor %%mm7, %%mm2 \n\t"\ |
| 2150 "addl 1024(%%ebx), %%ecx \n\t" | 2150 "addl 1024(%%ebx), %%ecx \n\t" |
| 2151 "shrl $3, %%ecx \n\t" | 2151 "shrl $3, %%ecx \n\t" |
| 2152 "movl %%ecx, (%%ebx) \n\t" | 2152 "movl %%ecx, (%%ebx) \n\t" |
| 2153 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride | 2153 "leal (%%eax, %2, 2), %%ebx \n\t" // 5*stride |
| 2154 | 2154 |
| 2155 // "movl %3, %%ecx \n\t" | 2155 // "movl %3, %%ecx \n\t" |
| 2156 // "movl %%ecx, test \n\t" | 2156 // "movl %%ecx, test \n\t" |
| 2157 // "jmp 4f \n\t" | 2157 // "jmp 4f \n\t" |
| 2158 "cmpl 4+maxTmpNoise, %%ecx \n\t" | 2158 "cmpl 4+"MANGLE(maxTmpNoise)", %%ecx \n\t" |
| 2159 " jb 2f \n\t" | 2159 " jb 2f \n\t" |
| 2160 "cmpl 8+maxTmpNoise, %%ecx \n\t" | 2160 "cmpl 8+"MANGLE(maxTmpNoise)", %%ecx \n\t" |
| 2161 " jb 1f \n\t" | 2161 " jb 1f \n\t" |
| 2162 | 2162 |
| 2163 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2163 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
| 2164 "movq (%0), %%mm0 \n\t" // L0 | 2164 "movq (%0), %%mm0 \n\t" // L0 |
| 2165 "movq (%0, %2), %%mm1 \n\t" // L1 | 2165 "movq (%0, %2), %%mm1 \n\t" // L1 |
| 2214 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 | 2214 "movq %%mm6, (%0, %%eax, 2) \n\t" // L6 |
| 2215 "movq %%mm7, (%0, %%ecx) \n\t" // L7 | 2215 "movq %%mm7, (%0, %%ecx) \n\t" // L7 |
| 2216 "jmp 4f \n\t" | 2216 "jmp 4f \n\t" |
| 2217 | 2217 |
| 2218 "2: \n\t" | 2218 "2: \n\t" |
| 2219 "cmpl maxTmpNoise, %%ecx \n\t" | 2219 "cmpl "MANGLE(maxTmpNoise)", %%ecx \n\t" |
| 2220 " jb 3f \n\t" | 2220 " jb 3f \n\t" |
| 2221 | 2221 |
| 2222 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride | 2222 "leal (%%ebx, %2, 2), %%ecx \n\t" // 7*stride |
| 2223 "movq (%0), %%mm0 \n\t" // L0 | 2223 "movq (%0), %%mm0 \n\t" // L0 |
| 2224 "movq (%0, %2), %%mm1 \n\t" // L1 | 2224 "movq (%0, %2), %%mm1 \n\t" // L1 |
| 2459 { | 2459 { |
| 2460 #ifdef HAVE_MMX | 2460 #ifdef HAVE_MMX |
| 2461 asm volatile( | 2461 asm volatile( |
| 2462 "leal (%0,%2), %%eax \n\t" | 2462 "leal (%0,%2), %%eax \n\t" |
| 2463 "leal (%1,%3), %%ebx \n\t" | 2463 "leal (%1,%3), %%ebx \n\t" |
| 2464 "movq packedYOffset, %%mm2 \n\t" | 2464 "movq "MANGLE(packedYOffset)", %%mm2\n\t" |
| 2465 "movq packedYScale, %%mm3 \n\t" | 2465 "movq "MANGLE(packedYScale)", %%mm3\n\t" |
| 2466 "pxor %%mm4, %%mm4 \n\t" | 2466 "pxor %%mm4, %%mm4 \n\t" |
| 2467 #ifdef HAVE_MMX2 | 2467 #ifdef HAVE_MMX2 |
| 2468 #define SCALED_CPY(src1, src2, dst1, dst2) \ | 2468 #define SCALED_CPY(src1, src2, dst1, dst2) \ |
| 2469 "movq " #src1 ", %%mm0 \n\t"\ | 2469 "movq " #src1 ", %%mm0 \n\t"\ |
| 2470 "movq " #src1 ", %%mm5 \n\t"\ | 2470 "movq " #src1 ", %%mm5 \n\t"\ |
| 2882 asm volatile( | 2882 asm volatile( |
| 2883 "movd %0, %%mm7 \n\t" | 2883 "movd %0, %%mm7 \n\t" |
| 2884 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 2884 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 2885 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 2885 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
| 2886 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | 2886 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
| 2887 "movq %%mm7, pQPb \n\t" | 2887 "movq %%mm7, "MANGLE(pQPb)" \n\t" |
| 2888 : : "r" (QP) | 2888 : : "r" (QP) |
| 2889 ); | 2889 ); |
| 2890 #endif | 2890 #endif |
| 2891 | 2891 |
| 2892 #ifdef MORE_TIMING | 2892 #ifdef MORE_TIMING |
