comparison of postprocess_template.c @ 118:bdd1788fb53b (libpostproc)
Change the semantics of CONFIG_*, HAVE_* and ARCH_*.
They are now always defined to either 0 or 1.
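Concretely, the change replaces `#ifdef FOO` / `#elif defined (FOO)` tests with plain `#if FOO`, since configure now emits every CONFIG_*/HAVE_*/ARCH_* macro with an explicit value. A minimal sketch of the two conventions, using the PAVGB definition from this file (the HAVE_MMX2 / HAVE_3DNOW values below are hypothetical placeholders; the real ones come from config.h):

```c
/* Old convention: a disabled feature's macro is simply left undefined,
 * so code must test presence with #ifdef / defined(). */
#ifdef HAVE_MMX2
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif defined (HAVE_3DNOW)
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif

/* New convention: the macro is always defined, to either 0 or 1,
 * so a plain #if on its value is enough. */
#define HAVE_MMX2  0   /* hypothetical: MMX2 disabled  */
#define HAVE_3DNOW 1   /* hypothetical: 3DNow! enabled */
#if HAVE_MMX2
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_3DNOW
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
```

One practical benefit: with always-defined macros, conditions such as `#if HAVE_MMX2 || HAVE_3DNOW` read directly, and building with `-Wundef` flags any misspelled macro instead of letting it silently evaluate to 0.
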
| | |
|---|---|
| author | aurel |
| date | Tue, 13 Jan 2009 23:44:16 +0000 |
| parents | bf8f52662dc3 |
| children | 4a1602d552aa |
| 117:3a76063f4145 | 118:bdd1788fb53b |
|---|---|
| 29 | 29 |
| 30 #undef PAVGB | 30 #undef PAVGB |
| 31 #undef PMINUB | 31 #undef PMINUB |
| 32 #undef PMAXUB | 32 #undef PMAXUB |
| 33 | 33 |
| 34 #ifdef HAVE_MMX2 | 34 #if HAVE_MMX2 |
| 35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" | 35 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t" |
| 36 #elif defined (HAVE_3DNOW) | 36 #elif HAVE_3DNOW |
| 37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" | 37 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t" |
| 38 #endif | 38 #endif |
| 39 #define PAVGB(a,b) REAL_PAVGB(a,b) | 39 #define PAVGB(a,b) REAL_PAVGB(a,b) |
| 40 | 40 |
| 41 #ifdef HAVE_MMX2 | 41 #if HAVE_MMX2 |
| 42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" | 42 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t" |
| 43 #elif defined (HAVE_MMX) | 43 #elif HAVE_MMX |
| 44 #define PMINUB(b,a,t) \ | 44 #define PMINUB(b,a,t) \ |
| 45 "movq " #a ", " #t " \n\t"\ | 45 "movq " #a ", " #t " \n\t"\ |
| 46 "psubusb " #b ", " #t " \n\t"\ | 46 "psubusb " #b ", " #t " \n\t"\ |
| 47 "psubb " #t ", " #a " \n\t" | 47 "psubb " #t ", " #a " \n\t" |
| 48 #endif | 48 #endif |
| 49 | 49 |
| 50 #ifdef HAVE_MMX2 | 50 #if HAVE_MMX2 |
| 51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" | 51 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t" |
| 52 #elif defined (HAVE_MMX) | 52 #elif HAVE_MMX |
| 53 #define PMAXUB(a,b) \ | 53 #define PMAXUB(a,b) \ |
| 54 "psubusb " #a ", " #b " \n\t"\ | 54 "psubusb " #a ", " #b " \n\t"\ |
| 55 "paddb " #a ", " #b " \n\t" | 55 "paddb " #a ", " #b " \n\t" |
| 56 #endif | 56 #endif |
| 57 | 57 |
| 58 //FIXME? |255-0| = 1 (should not be a problem ...) | 58 //FIXME? |255-0| = 1 (should not be a problem ...) |
| 59 #ifdef HAVE_MMX | 59 #if HAVE_MMX |
| 60 /** | 60 /** |
| 61 * Check if the middle 8x8 Block in the given 8x16 block is flat | 61 * Check if the middle 8x8 Block in the given 8x16 block is flat |
| 62 */ | 62 */ |
| 63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ | 63 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ |
| 64 int numEq= 0, dcOk; | 64 int numEq= 0, dcOk; |
| 134 "pcmpgtb %%mm6, %%mm2 \n\t" | 134 "pcmpgtb %%mm6, %%mm2 \n\t" |
| 135 "paddb %%mm2, %%mm0 \n\t" | 135 "paddb %%mm2, %%mm0 \n\t" |
| 136 "psubusb %%mm3, %%mm4 \n\t" | 136 "psubusb %%mm3, %%mm4 \n\t" |
| 137 | 137 |
| 138 " \n\t" | 138 " \n\t" |
| 139 #ifdef HAVE_MMX2 | 139 #if HAVE_MMX2 |
| 140 "pxor %%mm7, %%mm7 \n\t" | 140 "pxor %%mm7, %%mm7 \n\t" |
| 141 "psadbw %%mm7, %%mm0 \n\t" | 141 "psadbw %%mm7, %%mm0 \n\t" |
| 142 #else | 142 #else |
| 143 "movq %%mm0, %%mm1 \n\t" | 143 "movq %%mm0, %%mm1 \n\t" |
| 144 "psrlw $8, %%mm0 \n\t" | 144 "psrlw $8, %%mm0 \n\t" |
| 174 | 174 |
| 175 /** | 175 /** |
| 176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) | 176 * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
| 177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 | 177 * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
| 178 */ | 178 */ |
| 179 #ifndef HAVE_ALTIVEC | 179 #if !HAVE_ALTIVEC |
| 180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) | 180 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
| 181 { | 181 { |
| 182 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 182 #if HAVE_MMX2 || HAVE_3DNOW |
| 183 src+= stride*3; | 183 src+= stride*3; |
| 184 __asm__ volatile( //"movv %0 %1 %2\n\t" | 184 __asm__ volatile( //"movv %0 %1 %2\n\t" |
| 185 "movq %2, %%mm0 \n\t" // QP,..., QP | 185 "movq %2, %%mm0 \n\t" // QP,..., QP |
| 186 "pxor %%mm4, %%mm4 \n\t" | 186 "pxor %%mm4, %%mm4 \n\t" |
| 187 | 187 |
| 304 | 304 |
| 305 : | 305 : |
| 306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) | 306 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) |
| 307 : "%"REG_a, "%"REG_c | 307 : "%"REG_a, "%"REG_c |
| 308 ); | 308 ); |
| 309 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 309 #else //HAVE_MMX2 || HAVE_3DNOW |
| 310 const int l1= stride; | 310 const int l1= stride; |
| 311 const int l2= stride + l1; | 311 const int l2= stride + l1; |
| 312 const int l3= stride + l2; | 312 const int l3= stride + l2; |
| 313 const int l4= stride + l3; | 313 const int l4= stride + l3; |
| 314 const int l5= stride + l4; | 314 const int l5= stride + l4; |
| 343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; | 343 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; |
| 344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; | 344 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; |
| 345 | 345 |
| 346 src++; | 346 src++; |
| 347 } | 347 } |
| 348 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 348 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 349 } | 349 } |
| 350 #endif //HAVE_ALTIVEC | 350 #endif //HAVE_ALTIVEC |
| 351 | 351 |
| 352 #if 0 | 352 #if 0 |
| 353 /** | 353 /** |
| 362 x/8 = 1 | 362 x/8 = 1 |
| 363 1 12 12 23 | 363 1 12 12 23 |
| 364 */ | 364 */ |
| 365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) | 365 static inline void RENAME(vertRK1Filter)(uint8_t *src, int stride, int QP) |
| 366 { | 366 { |
| 367 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 367 #if HAVE_MMX2 || HAVE_3DNOW |
| 368 src+= stride*3; | 368 src+= stride*3; |
| 369 // FIXME rounding | 369 // FIXME rounding |
| 370 __asm__ volatile( | 370 __asm__ volatile( |
| 371 "pxor %%mm7, %%mm7 \n\t" // 0 | 371 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE | 372 "movq "MANGLE(b80)", %%mm6 \n\t" // MIN_SIGNED_BYTE |
| 424 | 424 |
| 425 : | 425 : |
| 426 : "r" (src), "r" ((x86_reg)stride) | 426 : "r" (src), "r" ((x86_reg)stride) |
| 427 : "%"REG_a, "%"REG_c | 427 : "%"REG_a, "%"REG_c |
| 428 ); | 428 ); |
| 429 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 429 #else //HAVE_MMX2 || HAVE_3DNOW |
| 430 const int l1= stride; | 430 const int l1= stride; |
| 431 const int l2= stride + l1; | 431 const int l2= stride + l1; |
| 432 const int l3= stride + l2; | 432 const int l3= stride + l2; |
| 433 const int l4= stride + l3; | 433 const int l4= stride + l3; |
| 434 const int l5= stride + l4; | 434 const int l5= stride + l4; |
| 447 src[x+l5] -=v>>1; | 447 src[x+l5] -=v>>1; |
| 448 src[x+l6] -=v>>3; | 448 src[x+l6] -=v>>3; |
| 449 } | 449 } |
| 450 } | 450 } |
| 451 | 451 |
| 452 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 452 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 453 } | 453 } |
| 454 #endif //0 | 454 #endif //0 |
| 455 | 455 |
| 456 /** | 456 /** |
| 457 * Experimental Filter 1 | 457 * Experimental Filter 1 |
| 460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) | 460 * can only smooth blocks at the expected locations (it cannot smooth them if they did move) |
| 461 * MMX2 version does correct clipping C version does not | 461 * MMX2 version does correct clipping C version does not |
| 462 */ | 462 */ |
| 463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) | 463 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
| 464 { | 464 { |
| 465 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 465 #if HAVE_MMX2 || HAVE_3DNOW |
| 466 src+= stride*3; | 466 src+= stride*3; |
| 467 | 467 |
| 468 __asm__ volatile( | 468 __asm__ volatile( |
| 469 "pxor %%mm7, %%mm7 \n\t" // 0 | 469 "pxor %%mm7, %%mm7 \n\t" // 0 |
| 470 "lea (%0, %1), %%"REG_a" \n\t" | 470 "lea (%0, %1), %%"REG_a" \n\t" |
| 546 | 546 |
| 547 : | 547 : |
| 548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) | 548 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) |
| 549 : "%"REG_a, "%"REG_c | 549 : "%"REG_a, "%"REG_c |
| 550 ); | 550 ); |
| 551 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 551 #else //HAVE_MMX2 || HAVE_3DNOW |
| 552 | 552 |
| 553 const int l1= stride; | 553 const int l1= stride; |
| 554 const int l2= stride + l1; | 554 const int l2= stride + l1; |
| 555 const int l3= stride + l2; | 555 const int l3= stride + l2; |
| 556 const int l4= stride + l3; | 556 const int l4= stride + l3; |
| 580 src[l6] -=v>>2; | 580 src[l6] -=v>>2; |
| 581 src[l7] -=v>>3; | 581 src[l7] -=v>>3; |
| 582 } | 582 } |
| 583 src++; | 583 src++; |
| 584 } | 584 } |
| 585 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 585 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 586 } | 586 } |
| 587 | 587 |
| 588 #ifndef HAVE_ALTIVEC | 588 #if !HAVE_ALTIVEC |
| 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) | 589 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
| 590 { | 590 { |
| 591 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 591 #if HAVE_MMX2 || HAVE_3DNOW |
| 592 /* | 592 /* |
| 593 uint8_t tmp[16]; | 593 uint8_t tmp[16]; |
| 594 const int l1= stride; | 594 const int l1= stride; |
| 595 const int l2= stride + l1; | 595 const int l2= stride + l1; |
| 596 const int l3= stride + l2; | 596 const int l3= stride + l2; |
| 867 } | 867 } |
| 868 } | 868 } |
| 869 } | 869 } |
| 870 } | 870 } |
| 871 */ | 871 */ |
| 872 #elif defined (HAVE_MMX) | 872 #elif HAVE_MMX |
| 873 src+= stride*4; | 873 src+= stride*4; |
| 874 __asm__ volatile( | 874 __asm__ volatile( |
| 875 "pxor %%mm7, %%mm7 \n\t" | 875 "pxor %%mm7, %%mm7 \n\t" |
| 876 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars | 876 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars |
| 877 "and "ALIGN_MASK", %%"REG_c" \n\t" // align | 877 "and "ALIGN_MASK", %%"REG_c" \n\t" // align |
| 976 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | 976 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
| 977 | 977 |
| 978 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 978 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 979 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 979 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 980 | 980 |
| 981 #ifdef HAVE_MMX2 | 981 #if HAVE_MMX2 |
| 982 "movq %%mm7, %%mm6 \n\t" // 0 | 982 "movq %%mm7, %%mm6 \n\t" // 0 |
| 983 "psubw %%mm0, %%mm6 \n\t" | 983 "psubw %%mm0, %%mm6 \n\t" |
| 984 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | 984 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
| 985 "movq %%mm7, %%mm6 \n\t" // 0 | 985 "movq %%mm7, %%mm6 \n\t" // 0 |
| 986 "psubw %%mm1, %%mm6 \n\t" | 986 "psubw %%mm1, %%mm6 \n\t" |
| 1008 "pcmpgtw %%mm3, %%mm6 \n\t" | 1008 "pcmpgtw %%mm3, %%mm6 \n\t" |
| 1009 "pxor %%mm6, %%mm3 \n\t" | 1009 "pxor %%mm6, %%mm3 \n\t" |
| 1010 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | 1010 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 1011 #endif | 1011 #endif |
| 1012 | 1012 |
| 1013 #ifdef HAVE_MMX2 | 1013 #if HAVE_MMX2 |
| 1014 "pminsw %%mm2, %%mm0 \n\t" | 1014 "pminsw %%mm2, %%mm0 \n\t" |
| 1015 "pminsw %%mm3, %%mm1 \n\t" | 1015 "pminsw %%mm3, %%mm1 \n\t" |
| 1016 #else | 1016 #else |
| 1017 "movq %%mm0, %%mm6 \n\t" | 1017 "movq %%mm0, %%mm6 \n\t" |
| 1018 "psubusw %%mm2, %%mm6 \n\t" | 1018 "psubusw %%mm2, %%mm6 \n\t" |
| 1072 "pxor %%mm6, %%mm2 \n\t" | 1072 "pxor %%mm6, %%mm2 \n\t" |
| 1073 "pxor %%mm7, %%mm3 \n\t" | 1073 "pxor %%mm7, %%mm3 \n\t" |
| 1074 "pand %%mm2, %%mm4 \n\t" | 1074 "pand %%mm2, %%mm4 \n\t" |
| 1075 "pand %%mm3, %%mm5 \n\t" | 1075 "pand %%mm3, %%mm5 \n\t" |
| 1076 | 1076 |
| 1077 #ifdef HAVE_MMX2 | 1077 #if HAVE_MMX2 |
| 1078 "pminsw %%mm0, %%mm4 \n\t" | 1078 "pminsw %%mm0, %%mm4 \n\t" |
| 1079 "pminsw %%mm1, %%mm5 \n\t" | 1079 "pminsw %%mm1, %%mm5 \n\t" |
| 1080 #else | 1080 #else |
| 1081 "movq %%mm4, %%mm2 \n\t" | 1081 "movq %%mm4, %%mm2 \n\t" |
| 1082 "psubusw %%mm0, %%mm2 \n\t" | 1082 "psubusw %%mm0, %%mm2 \n\t" |
| 1099 | 1099 |
| 1100 : "+r" (src) | 1100 : "+r" (src) |
| 1101 : "r" ((x86_reg)stride), "m" (c->pQPb) | 1101 : "r" ((x86_reg)stride), "m" (c->pQPb) |
| 1102 : "%"REG_a, "%"REG_c | 1102 : "%"REG_a, "%"REG_c |
| 1103 ); | 1103 ); |
| 1104 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1104 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1105 const int l1= stride; | 1105 const int l1= stride; |
| 1106 const int l2= stride + l1; | 1106 const int l2= stride + l1; |
| 1107 const int l3= stride + l2; | 1107 const int l3= stride + l2; |
| 1108 const int l4= stride + l3; | 1108 const int l4= stride + l3; |
| 1109 const int l5= stride + l4; | 1109 const int l5= stride + l4; |
| 1137 src[l4]-= d; | 1137 src[l4]-= d; |
| 1138 src[l5]+= d; | 1138 src[l5]+= d; |
| 1139 } | 1139 } |
| 1140 src++; | 1140 src++; |
| 1141 } | 1141 } |
| 1142 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1142 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1143 } | 1143 } |
| 1144 #endif //HAVE_ALTIVEC | 1144 #endif //HAVE_ALTIVEC |
| 1145 | 1145 |
| 1146 #ifndef HAVE_ALTIVEC | 1146 #if !HAVE_ALTIVEC |
| 1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) | 1147 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
| 1148 { | 1148 { |
| 1149 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1149 #if HAVE_MMX2 || HAVE_3DNOW |
| 1150 __asm__ volatile( | 1150 __asm__ volatile( |
| 1151 "pxor %%mm6, %%mm6 \n\t" | 1151 "pxor %%mm6, %%mm6 \n\t" |
| 1152 "pcmpeqb %%mm7, %%mm7 \n\t" | 1152 "pcmpeqb %%mm7, %%mm7 \n\t" |
| 1153 "movq %2, %%mm0 \n\t" | 1153 "movq %2, %%mm0 \n\t" |
| 1154 "punpcklbw %%mm6, %%mm0 \n\t" | 1154 "punpcklbw %%mm6, %%mm0 \n\t" |
| 1162 | 1162 |
| 1163 // 0 1 2 3 4 5 6 7 8 9 | 1163 // 0 1 2 3 4 5 6 7 8 9 |
| 1164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | 1164 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
| 1165 | 1165 |
| 1166 #undef FIND_MIN_MAX | 1166 #undef FIND_MIN_MAX |
| 1167 #ifdef HAVE_MMX2 | 1167 #if HAVE_MMX2 |
| 1168 #define REAL_FIND_MIN_MAX(addr)\ | 1168 #define REAL_FIND_MIN_MAX(addr)\ |
| 1169 "movq " #addr ", %%mm0 \n\t"\ | 1169 "movq " #addr ", %%mm0 \n\t"\ |
| 1170 "pminub %%mm0, %%mm7 \n\t"\ | 1170 "pminub %%mm0, %%mm7 \n\t"\ |
| 1171 "pmaxub %%mm0, %%mm6 \n\t" | 1171 "pmaxub %%mm0, %%mm6 \n\t" |
| 1172 #else | 1172 #else |
| 1189 FIND_MIN_MAX((%%REGd, %1, 2)) | 1189 FIND_MIN_MAX((%%REGd, %1, 2)) |
| 1190 FIND_MIN_MAX((%0, %1, 8)) | 1190 FIND_MIN_MAX((%0, %1, 8)) |
| 1191 | 1191 |
| 1192 "movq %%mm7, %%mm4 \n\t" | 1192 "movq %%mm7, %%mm4 \n\t" |
| 1193 "psrlq $8, %%mm7 \n\t" | 1193 "psrlq $8, %%mm7 \n\t" |
| 1194 #ifdef HAVE_MMX2 | 1194 #if HAVE_MMX2 |
| 1195 "pminub %%mm4, %%mm7 \n\t" // min of pixels | 1195 "pminub %%mm4, %%mm7 \n\t" // min of pixels |
| 1196 "pshufw $0xF9, %%mm7, %%mm4 \n\t" | 1196 "pshufw $0xF9, %%mm7, %%mm4 \n\t" |
| 1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels | 1197 "pminub %%mm4, %%mm7 \n\t" // min of pixels |
| 1198 "pshufw $0xFE, %%mm7, %%mm4 \n\t" | 1198 "pshufw $0xFE, %%mm7, %%mm4 \n\t" |
| 1199 "pminub %%mm4, %%mm7 \n\t" | 1199 "pminub %%mm4, %%mm7 \n\t" |
| 1214 #endif | 1214 #endif |
| 1215 | 1215 |
| 1216 | 1216 |
| 1217 "movq %%mm6, %%mm4 \n\t" | 1217 "movq %%mm6, %%mm4 \n\t" |
| 1218 "psrlq $8, %%mm6 \n\t" | 1218 "psrlq $8, %%mm6 \n\t" |
| 1219 #ifdef HAVE_MMX2 | 1219 #if HAVE_MMX2 |
| 1220 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels | 1220 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
| 1221 "pshufw $0xF9, %%mm6, %%mm4 \n\t" | 1221 "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
| 1222 "pmaxub %%mm4, %%mm6 \n\t" | 1222 "pmaxub %%mm4, %%mm6 \n\t" |
| 1223 "pshufw $0xFE, %%mm6, %%mm4 \n\t" | 1223 "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
| 1224 "pmaxub %%mm4, %%mm6 \n\t" | 1224 "pmaxub %%mm4, %%mm6 \n\t" |
| 1368 | 1368 |
| 1369 "1: \n\t" | 1369 "1: \n\t" |
| 1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) | 1370 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) |
| 1371 : "%"REG_a, "%"REG_d, "%"REG_c | 1371 : "%"REG_a, "%"REG_d, "%"REG_c |
| 1372 ); | 1372 ); |
| 1373 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1373 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1374 int y; | 1374 int y; |
| 1375 int min=255; | 1375 int min=255; |
| 1376 int max=0; | 1376 int max=0; |
| 1377 int avg; | 1377 int avg; |
| 1378 uint8_t *p; | 1378 uint8_t *p; |
| 1485 } | 1485 } |
| 1486 } | 1486 } |
| 1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; | 1487 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; |
| 1488 } | 1488 } |
| 1489 #endif | 1489 #endif |
| 1490 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1490 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1491 } | 1491 } |
| 1492 #endif //HAVE_ALTIVEC | 1492 #endif //HAVE_ALTIVEC |
| 1493 | 1493 |
| 1494 /** | 1494 /** |
| 1495 * Deinterlaces the given block by linearly interpolating every second line. | 1495 * Deinterlaces the given block by linearly interpolating every second line. |
| 1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | 1497 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
| 1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1498 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1499 */ | 1499 */ |
| 1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) | 1500 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
| 1501 { | 1501 { |
| 1502 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1502 #if HAVE_MMX2 || HAVE_3DNOW |
| 1503 src+= 4*stride; | 1503 src+= 4*stride; |
| 1504 __asm__ volatile( | 1504 __asm__ volatile( |
| 1505 "lea (%0, %1), %%"REG_a" \n\t" | 1505 "lea (%0, %1), %%"REG_a" \n\t" |
| 1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" | 1506 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
| 1507 // 0 1 2 3 4 5 6 7 8 9 | 1507 // 0 1 2 3 4 5 6 7 8 9 |
| 1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1550 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1551 * this filter will read lines 3-15 and write 7-13 | 1551 * this filter will read lines 3-15 and write 7-13 |
| 1552 */ | 1552 */ |
| 1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) | 1553 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
| 1554 { | 1554 { |
| 1555 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1555 #if HAVE_MMX2 || HAVE_3DNOW |
| 1556 src+= stride*3; | 1556 src+= stride*3; |
| 1557 __asm__ volatile( | 1557 __asm__ volatile( |
| 1558 "lea (%0, %1), %%"REG_a" \n\t" | 1558 "lea (%0, %1), %%"REG_a" \n\t" |
| 1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1559 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
| 1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" | 1560 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" |
| 1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) | 1592 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) |
| 1593 | 1593 |
| 1594 : : "r" (src), "r" ((x86_reg)stride) | 1594 : : "r" (src), "r" ((x86_reg)stride) |
| 1595 : "%"REG_a, "%"REG_d, "%"REG_c | 1595 : "%"REG_a, "%"REG_d, "%"REG_c |
| 1596 ); | 1596 ); |
| 1597 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1597 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1598 int x; | 1598 int x; |
| 1599 src+= stride*3; | 1599 src+= stride*3; |
| 1600 for(x=0; x<8; x++){ | 1600 for(x=0; x<8; x++){ |
| 1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); | 1601 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
| 1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); | 1602 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); |
| 1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); | 1603 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); |
| 1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); | 1604 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); |
| 1605 src++; | 1605 src++; |
| 1606 } | 1606 } |
| 1607 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1607 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1608 } | 1608 } |
| 1609 | 1609 |
| 1610 /** | 1610 /** |
| 1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. | 1611 * Deinterlaces the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
| 1612 * will be called for every 8x8 block and can read & write from line 4-15 | 1612 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1614 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1615 * this filter will read lines 4-13 and write 5-11 | 1615 * this filter will read lines 4-13 and write 5-11 |
| 1616 */ | 1616 */ |
| 1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) | 1617 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) |
| 1618 { | 1618 { |
| 1619 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1619 #if HAVE_MMX2 || HAVE_3DNOW |
| 1620 src+= stride*4; | 1620 src+= stride*4; |
| 1621 __asm__ volatile( | 1621 __asm__ volatile( |
| 1622 "lea (%0, %1), %%"REG_a" \n\t" | 1622 "lea (%0, %1), %%"REG_a" \n\t" |
| 1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1623 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
| 1624 "pxor %%mm7, %%mm7 \n\t" | 1624 "pxor %%mm7, %%mm7 \n\t" |
| 1663 | 1663 |
| 1664 "movq %%mm0, (%2) \n\t" | 1664 "movq %%mm0, (%2) \n\t" |
| 1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) | 1665 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) |
| 1666 : "%"REG_a, "%"REG_d | 1666 : "%"REG_a, "%"REG_d |
| 1667 ); | 1667 ); |
| 1668 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1668 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1669 int x; | 1669 int x; |
| 1670 src+= stride*4; | 1670 src+= stride*4; |
| 1671 for(x=0; x<8; x++){ | 1671 for(x=0; x<8; x++){ |
| 1672 int t1= tmp[x]; | 1672 int t1= tmp[x]; |
| 1673 int t2= src[stride*1]; | 1673 int t2= src[stride*1]; |
| 1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); | 1681 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
| 1682 tmp[x]= t1; | 1682 tmp[x]= t1; |
| 1683 | 1683 |
| 1684 src++; | 1684 src++; |
| 1685 } | 1685 } |
| 1686 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1686 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1687 } | 1687 } |
| 1688 | 1688 |
| 1689 /** | 1689 /** |
| 1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. | 1690 * Deinterlaces the given block by filtering every line with a (-1 2 6 2 -1) filter. |
| 1691 * will be called for every 8x8 block and can read & write from line 4-15 | 1691 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1693 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1694 * this filter will read lines 4-13 and write 4-11 | 1694 * this filter will read lines 4-13 and write 4-11 |
| 1695 */ | 1695 */ |
| 1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) | 1696 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) |
| 1697 { | 1697 { |
| 1698 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1698 #if HAVE_MMX2 || HAVE_3DNOW |
| 1699 src+= stride*4; | 1699 src+= stride*4; |
| 1700 __asm__ volatile( | 1700 __asm__ volatile( |
| 1701 "lea (%0, %1), %%"REG_a" \n\t" | 1701 "lea (%0, %1), %%"REG_a" \n\t" |
| 1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1702 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
| 1703 "pxor %%mm7, %%mm7 \n\t" | 1703 "pxor %%mm7, %%mm7 \n\t" |
| 1753 "movq %%mm0, (%2) \n\t" | 1753 "movq %%mm0, (%2) \n\t" |
| 1754 "movq %%mm1, (%3) \n\t" | 1754 "movq %%mm1, (%3) \n\t" |
| 1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) | 1755 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) |
| 1756 : "%"REG_a, "%"REG_d | 1756 : "%"REG_a, "%"REG_d |
| 1757 ); | 1757 ); |
| 1758 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1758 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1759 int x; | 1759 int x; |
| 1760 src+= stride*4; | 1760 src+= stride*4; |
| 1761 for(x=0; x<8; x++){ | 1761 for(x=0; x<8; x++){ |
| 1762 int t1= tmp[x]; | 1762 int t1= tmp[x]; |
| 1763 int t2= tmp2[x]; | 1763 int t2= tmp2[x]; |
| 1782 tmp[x]= t3; | 1782 tmp[x]= t3; |
| 1783 tmp2[x]= t1; | 1783 tmp2[x]= t1; |
| 1784 | 1784 |
| 1785 src++; | 1785 src++; |
| 1786 } | 1786 } |
| 1787 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1787 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1788 } | 1788 } |
| 1789 | 1789 |
| 1790 /** | 1790 /** |
| 1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. | 1791 * Deinterlaces the given block by filtering all lines with a (1 2 1) filter. |
| 1792 * will be called for every 8x8 block and can read & write from line 4-15 | 1792 * will be called for every 8x8 block and can read & write from line 4-15 |
| 1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1794 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1795 * this filter will read lines 4-13 and write 4-11 | 1795 * this filter will read lines 4-13 and write 4-11 |
| 1796 */ | 1796 */ |
| 1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) | 1797 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
| 1798 { | 1798 { |
| 1799 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1799 #if HAVE_MMX2 || HAVE_3DNOW |
| 1800 src+= 4*stride; | 1800 src+= 4*stride; |
| 1801 __asm__ volatile( | 1801 __asm__ volatile( |
| 1802 "lea (%0, %1), %%"REG_a" \n\t" | 1802 "lea (%0, %1), %%"REG_a" \n\t" |
| 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
| 1804 // 0 1 2 3 4 5 6 7 8 9 | 1804 // 0 1 2 3 4 5 6 7 8 9 |
| 1841 "movq %%mm1, (%2) \n\t" | 1841 "movq %%mm1, (%2) \n\t" |
| 1842 | 1842 |
| 1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) | 1843 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) |
| 1844 : "%"REG_a, "%"REG_d | 1844 : "%"REG_a, "%"REG_d |
| 1845 ); | 1845 ); |
| 1846 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1846 #else //HAVE_MMX2 || HAVE_3DNOW |
| 1847 int a, b, c, x; | 1847 int a, b, c, x; |
| 1848 src+= 4*stride; | 1848 src+= 4*stride; |
| 1849 | 1849 |
| 1850 for(x=0; x<2; x++){ | 1850 for(x=0; x<2; x++){ |
| 1851 a= *(uint32_t*)&tmp[stride*0]; | 1851 a= *(uint32_t*)&tmp[stride*0]; |
| 1884 | 1884 |
| 1885 *(uint32_t*)&tmp[stride*0]= c; | 1885 *(uint32_t*)&tmp[stride*0]= c; |
| 1886 src += 4; | 1886 src += 4; |
| 1887 tmp += 4; | 1887 tmp += 4; |
| 1888 } | 1888 } |
| 1889 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 1889 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 1890 } | 1890 } |
| 1891 | 1891 |
| 1892 /** | 1892 /** |
| 1893 * Deinterlaces the given block by applying a median filter to every second line. | 1893 * Deinterlaces the given block by applying a median filter to every second line. |
| 1894 * will be called for every 8x8 block and can read & write from line 4-15, | 1894 * will be called for every 8x8 block and can read & write from line 4-15, |
| 1895 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. | 1895 * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
| 1896 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | 1896 * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
| 1897 */ | 1897 */ |
| 1898 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) | 1898 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
| 1899 { | 1899 { |
| 1900 #ifdef HAVE_MMX | 1900 #if HAVE_MMX |
| 1901 src+= 4*stride; | 1901 src+= 4*stride; |
| 1902 #ifdef HAVE_MMX2 | 1902 #if HAVE_MMX2 |
| 1903 __asm__ volatile( | 1903 __asm__ volatile( |
| 1904 "lea (%0, %1), %%"REG_a" \n\t" | 1904 "lea (%0, %1), %%"REG_a" \n\t" |
| 1905 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" | 1905 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
| 1906 // 0 1 2 3 4 5 6 7 8 9 | 1906 // 0 1 2 3 4 5 6 7 8 9 |
| 1907 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 | 1907 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
| 2010 src++; | 2010 src++; |
| 2011 } | 2011 } |
| 2012 #endif //HAVE_MMX | 2012 #endif //HAVE_MMX |
| 2013 } | 2013 } |
| 2014 | 2014 |
| 2015 #ifdef HAVE_MMX | 2015 #if HAVE_MMX |
| 2016 /** | 2016 /** |
| 2017 * transposes and shift the given 8x8 Block into dst1 and dst2 | 2017 * transposes and shift the given 8x8 Block into dst1 and dst2 |
| 2018 */ | 2018 */ |
| 2019 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) | 2019 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
| 2020 { | 2020 { |
| 2178 ); | 2178 ); |
| 2179 } | 2179 } |
| 2180 #endif //HAVE_MMX | 2180 #endif //HAVE_MMX |
| 2181 //static long test=0; | 2181 //static long test=0; |
| 2182 | 2182 |
| 2183 #ifndef HAVE_ALTIVEC | 2183 #if !HAVE_ALTIVEC |
| 2184 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, | 2184 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 2185 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) | 2185 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) |
| 2186 { | 2186 { |
| 2187 // to save a register (FIXME do this outside of the loops) | 2187 // to save a register (FIXME do this outside of the loops) |
| 2188 tempBlurredPast[127]= maxNoise[0]; | 2188 tempBlurredPast[127]= maxNoise[0]; |
| 2189 tempBlurredPast[128]= maxNoise[1]; | 2189 tempBlurredPast[128]= maxNoise[1]; |
| 2190 tempBlurredPast[129]= maxNoise[2]; | 2190 tempBlurredPast[129]= maxNoise[2]; |
| 2191 | 2191 |
| 2192 #define FAST_L2_DIFF | 2192 #define FAST_L2_DIFF |
| 2193 //#define L1_DIFF //u should change the thresholds too if u try that one | 2193 //#define L1_DIFF //u should change the thresholds too if u try that one |
| 2194 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2194 #if HAVE_MMX2 || HAVE_3DNOW |
| 2195 __asm__ volatile( | 2195 __asm__ volatile( |
| 2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride | 2196 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride |
| 2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride | 2197 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride |
| 2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride | 2198 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
| 2199 // 0 1 2 3 4 5 6 7 8 9 | 2199 // 0 1 2 3 4 5 6 7 8 9 |
| 2477 "4: \n\t" | 2477 "4: \n\t" |
| 2478 | 2478 |
| 2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) | 2479 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) |
| 2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" | 2480 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" |
| 2481 ); | 2481 ); |
| 2482 #else //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2482 #else //HAVE_MMX2 || HAVE_3DNOW |
| 2483 { | 2483 { |
| 2484 int y; | 2484 int y; |
| 2485 int d=0; | 2485 int d=0; |
| 2486 // int sysd=0; | 2486 // int sysd=0; |
| 2487 int i; | 2487 int i; |
| 2560 } | 2560 } |
| 2561 } | 2561 } |
| 2562 } | 2562 } |
| 2563 } | 2563 } |
| 2564 } | 2564 } |
| 2565 #endif //defined (HAVE_MMX2) || defined (HAVE_3DNOW) | 2565 #endif //HAVE_MMX2 || HAVE_3DNOW |
| 2566 } | 2566 } |
| 2567 #endif //HAVE_ALTIVEC | 2567 #endif //HAVE_ALTIVEC |
| 2568 | 2568 |
| 2569 #ifdef HAVE_MMX | 2569 #if HAVE_MMX |
| 2570 /** | 2570 /** |
| 2571 * accurate deblock filter | 2571 * accurate deblock filter |
| 2572 */ | 2572 */ |
| 2573 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ | 2573 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
| 2574 int64_t dc_mask, eq_mask, both_masks; | 2574 int64_t dc_mask, eq_mask, both_masks; |
| 2968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | 2968 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
| 2969 | 2969 |
| 2970 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | 2970 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
| 2971 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | 2971 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
| 2972 | 2972 |
| 2973 #ifdef HAVE_MMX2 | 2973 #if HAVE_MMX2 |
| 2974 "movq %%mm7, %%mm6 \n\t" // 0 | 2974 "movq %%mm7, %%mm6 \n\t" // 0 |
| 2975 "psubw %%mm0, %%mm6 \n\t" | 2975 "psubw %%mm0, %%mm6 \n\t" |
| 2976 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | 2976 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
| 2977 "movq %%mm7, %%mm6 \n\t" // 0 | 2977 "movq %%mm7, %%mm6 \n\t" // 0 |
| 2978 "psubw %%mm1, %%mm6 \n\t" | 2978 "psubw %%mm1, %%mm6 \n\t" |
| 3000 "pcmpgtw %%mm3, %%mm6 \n\t" | 3000 "pcmpgtw %%mm3, %%mm6 \n\t" |
| 3001 "pxor %%mm6, %%mm3 \n\t" | 3001 "pxor %%mm6, %%mm3 \n\t" |
| 3002 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | 3002 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
| 3003 #endif | 3003 #endif |
| 3004 | 3004 |
| 3005 #ifdef HAVE_MMX2 | 3005 #if HAVE_MMX2 |
| 3006 "pminsw %%mm2, %%mm0 \n\t" | 3006 "pminsw %%mm2, %%mm0 \n\t" |
| 3007 "pminsw %%mm3, %%mm1 \n\t" | 3007 "pminsw %%mm3, %%mm1 \n\t" |
| 3008 #else | 3008 #else |
| 3009 "movq %%mm0, %%mm6 \n\t" | 3009 "movq %%mm0, %%mm6 \n\t" |
| 3010 "psubusw %%mm2, %%mm6 \n\t" | 3010 "psubusw %%mm2, %%mm6 \n\t" |
| 3064 "pxor %%mm6, %%mm2 \n\t" | 3064 "pxor %%mm6, %%mm2 \n\t" |
| 3065 "pxor %%mm7, %%mm3 \n\t" | 3065 "pxor %%mm7, %%mm3 \n\t" |
| 3066 "pand %%mm2, %%mm4 \n\t" | 3066 "pand %%mm2, %%mm4 \n\t" |
| 3067 "pand %%mm3, %%mm5 \n\t" | 3067 "pand %%mm3, %%mm5 \n\t" |
| 3068 | 3068 |
| 3069 #ifdef HAVE_MMX2 | 3069 #if HAVE_MMX2 |
| 3070 "pminsw %%mm0, %%mm4 \n\t" | 3070 "pminsw %%mm0, %%mm4 \n\t" |
| 3071 "pminsw %%mm1, %%mm5 \n\t" | 3071 "pminsw %%mm1, %%mm5 \n\t" |
| 3072 #else | 3072 #else |
| 3073 "movq %%mm4, %%mm2 \n\t" | 3073 "movq %%mm4, %%mm2 \n\t" |
| 3074 "psubusw %%mm0, %%mm2 \n\t" | 3074 "psubusw %%mm0, %%mm2 \n\t" |
| 3114 #undef SCALED_CPY | 3114 #undef SCALED_CPY |
| 3115 | 3115 |
| 3116 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, | 3116 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, |
| 3117 int levelFix, int64_t *packedOffsetAndScale) | 3117 int levelFix, int64_t *packedOffsetAndScale) |
| 3118 { | 3118 { |
| 3119 #ifndef HAVE_MMX | 3119 #if !HAVE_MMX |
| 3120 int i; | 3120 int i; |
| 3121 #endif | 3121 #endif |
| 3122 if(levelFix){ | 3122 if(levelFix){ |
| 3123 #ifdef HAVE_MMX | 3123 #if HAVE_MMX |
| 3124 __asm__ volatile( | 3124 __asm__ volatile( |
| 3125 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset | 3125 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset |
| 3126 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale | 3126 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale |
| 3127 "lea (%2,%4), %%"REG_a" \n\t" | 3127 "lea (%2,%4), %%"REG_a" \n\t" |
| 3128 "lea (%3,%5), %%"REG_d" \n\t" | 3128 "lea (%3,%5), %%"REG_d" \n\t" |
| 3129 "pxor %%mm4, %%mm4 \n\t" | 3129 "pxor %%mm4, %%mm4 \n\t" |
| 3130 #ifdef HAVE_MMX2 | 3130 #if HAVE_MMX2 |
| 3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ | 3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
| 3132 "movq " #src1 ", %%mm0 \n\t"\ | 3132 "movq " #src1 ", %%mm0 \n\t"\ |
| 3133 "movq " #src1 ", %%mm5 \n\t"\ | 3133 "movq " #src1 ", %%mm5 \n\t"\ |
| 3134 "movq " #src2 ", %%mm1 \n\t"\ | 3134 "movq " #src2 ", %%mm1 \n\t"\ |
| 3135 "movq " #src2 ", %%mm6 \n\t"\ | 3135 "movq " #src2 ", %%mm6 \n\t"\ |
| 3201 for(i=0; i<8; i++) | 3201 for(i=0; i<8; i++) |
| 3202 memcpy( &(dst[dstStride*i]), | 3202 memcpy( &(dst[dstStride*i]), |
| 3203 &(src[srcStride*i]), BLOCK_SIZE); | 3203 &(src[srcStride*i]), BLOCK_SIZE); |
| 3204 #endif //HAVE_MMX | 3204 #endif //HAVE_MMX |
| 3205 }else{ | 3205 }else{ |
| 3206 #ifdef HAVE_MMX | 3206 #if HAVE_MMX |
| 3207 __asm__ volatile( | 3207 __asm__ volatile( |
| 3208 "lea (%0,%2), %%"REG_a" \n\t" | 3208 "lea (%0,%2), %%"REG_a" \n\t" |
| 3209 "lea (%1,%3), %%"REG_d" \n\t" | 3209 "lea (%1,%3), %%"REG_d" \n\t" |
| 3210 | 3210 |
| 3211 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ | 3211 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ |
| 3241 /** | 3241 /** |
| 3242 * Duplicates the given 8 src pixels ? times upward | 3242 * Duplicates the given 8 src pixels ? times upward |
| 3243 */ | 3243 */ |
| 3244 static inline void RENAME(duplicate)(uint8_t src[], int stride) | 3244 static inline void RENAME(duplicate)(uint8_t src[], int stride) |
| 3245 { | 3245 { |
| 3246 #ifdef HAVE_MMX | 3246 #if HAVE_MMX |
| 3247 __asm__ volatile( | 3247 __asm__ volatile( |
| 3248 "movq (%0), %%mm0 \n\t" | 3248 "movq (%0), %%mm0 \n\t" |
| 3249 "add %1, %0 \n\t" | 3249 "add %1, %0 \n\t" |
| 3250 "movq %%mm0, (%0) \n\t" | 3250 "movq %%mm0, (%0) \n\t" |
| 3251 "movq %%mm0, (%0, %1) \n\t" | 3251 "movq %%mm0, (%0, %1) \n\t" |
| 3278 #endif | 3278 #endif |
| 3279 int black=0, white=255; // blackest black and whitest white in the picture | 3279 int black=0, white=255; // blackest black and whitest white in the picture |
| 3280 int QPCorrecture= 256*256; | 3280 int QPCorrecture= 256*256; |
| 3281 | 3281 |
| 3282 int copyAhead; | 3282 int copyAhead; |
| 3283 #ifdef HAVE_MMX | 3283 #if HAVE_MMX |
| 3284 int i; | 3284 int i; |
| 3285 #endif | 3285 #endif |
| 3286 | 3286 |
| 3287 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; | 3287 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
| 3288 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; | 3288 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; |
| 3291 uint64_t * const yHistogram= c.yHistogram; | 3291 uint64_t * const yHistogram= c.yHistogram; |
| 3292 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; | 3292 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; |
| 3293 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; | 3293 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; |
| 3294 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; | 3294 //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
| 3295 | 3295 |
| 3296 #ifdef HAVE_MMX | 3296 #if HAVE_MMX |
| 3297 for(i=0; i<57; i++){ | 3297 for(i=0; i<57; i++){ |
| 3298 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; | 3298 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
| 3299 int threshold= offset*2 + 1; | 3299 int threshold= offset*2 + 1; |
| 3300 c.mmxDcOffset[i]= 0x7F - offset; | 3300 c.mmxDcOffset[i]= 0x7F - offset; |
| 3301 c.mmxDcThreshold[i]= 0x7F - threshold; | 3301 c.mmxDcThreshold[i]= 0x7F - threshold; |
| 3349 clipped-= yHistogram[white]; | 3349 clipped-= yHistogram[white]; |
| 3350 } | 3350 } |
| 3351 | 3351 |
| 3352 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); | 3352 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
| 3353 | 3353 |
| 3354 #ifdef HAVE_MMX2 | 3354 #if HAVE_MMX2 |
| 3355 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); | 3355 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
| 3356 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; | 3356 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; |
| 3357 #else | 3357 #else |
| 3358 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); | 3358 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
| 3359 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; | 3359 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; |
| 3382 // From this point on it is guaranteed that we can read and write 16 lines downward | 3382 // From this point on it is guaranteed that we can read and write 16 lines downward |
| 3383 // finish 1 block before the next otherwise we might have a problem | 3383 // finish 1 block before the next otherwise we might have a problem |
| 3384 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 3384 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
| 3385 for(x=0; x<width; x+=BLOCK_SIZE){ | 3385 for(x=0; x<width; x+=BLOCK_SIZE){ |
| 3386 | 3386 |
| 3387 #ifdef HAVE_MMX2 | 3387 #if HAVE_MMX2 |
| 3388 /* | 3388 /* |
| 3389 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3389 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
| 3390 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3390 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
| 3391 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3391 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
| 3392 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3392 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
| 3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), | 3409 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
| 3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) | 3410 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
| 3411 : "%"REG_a, "%"REG_d | 3411 : "%"REG_a, "%"REG_d |
| 3412 ); | 3412 ); |
| 3413 | 3413 |
| 3414 #elif defined(HAVE_3DNOW) | 3414 #elif HAVE_3DNOW |
| 3415 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... | 3415 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... |
| 3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 3416 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 3417 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
| 3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3418 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3419 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 3455 | 3455 |
| 3456 for(y=0; y<height; y+=BLOCK_SIZE){ | 3456 for(y=0; y<height; y+=BLOCK_SIZE){ |
| 3457 //1% speedup if these are here instead of the inner loop | 3457 //1% speedup if these are here instead of the inner loop |
| 3458 const uint8_t *srcBlock= &(src[y*srcStride]); | 3458 const uint8_t *srcBlock= &(src[y*srcStride]); |
| 3459 uint8_t *dstBlock= &(dst[y*dstStride]); | 3459 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 3460 #ifdef HAVE_MMX | 3460 #if HAVE_MMX |
| 3461 uint8_t *tempBlock1= c.tempBlocks; | 3461 uint8_t *tempBlock1= c.tempBlocks; |
| 3462 uint8_t *tempBlock2= c.tempBlocks + 8; | 3462 uint8_t *tempBlock2= c.tempBlocks + 8; |
| 3463 #endif | 3463 #endif |
| 3464 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; | 3464 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
| 3465 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; | 3465 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; |
| 3491 // From this point on it is guaranteed that we can read and write 16 lines downward | 3491 // From this point on it is guaranteed that we can read and write 16 lines downward |
| 3492 // finish 1 block before the next otherwise we might have a problem | 3492 // finish 1 block before the next otherwise we might have a problem |
| 3493 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing | 3493 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing |
| 3494 for(x=0; x<width; x+=BLOCK_SIZE){ | 3494 for(x=0; x<width; x+=BLOCK_SIZE){ |
| 3495 const int stride= dstStride; | 3495 const int stride= dstStride; |
| 3496 #ifdef HAVE_MMX | 3496 #if HAVE_MMX |
| 3497 uint8_t *tmpXchg; | 3497 uint8_t *tmpXchg; |
| 3498 #endif | 3498 #endif |
| 3499 if(isColor){ | 3499 if(isColor){ |
| 3500 QP= QPptr[x>>qpHShift]; | 3500 QP= QPptr[x>>qpHShift]; |
| 3501 c.nonBQP= nonBQPptr[x>>qpHShift]; | 3501 c.nonBQP= nonBQPptr[x>>qpHShift]; |
| 3505 c.nonBQP= nonBQPptr[x>>4]; | 3505 c.nonBQP= nonBQPptr[x>>4]; |
| 3506 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; | 3506 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; |
| 3507 yHistogram[ srcBlock[srcStride*12 + 4] ]++; | 3507 yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
| 3508 } | 3508 } |
| 3509 c.QP= QP; | 3509 c.QP= QP; |
| 3510 #ifdef HAVE_MMX | 3510 #if HAVE_MMX |
| 3511 __asm__ volatile( | 3511 __asm__ volatile( |
| 3512 "movd %1, %%mm7 \n\t" | 3512 "movd %1, %%mm7 \n\t" |
| 3513 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP | 3513 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
| 3514 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP | 3514 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
| 3515 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP | 3515 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
| 3518 : "r" (QP) | 3518 : "r" (QP) |
| 3519 ); | 3519 ); |
| 3520 #endif | 3520 #endif |
| 3521 | 3521 |
| 3522 | 3522 |
| 3523 #ifdef HAVE_MMX2 | 3523 #if HAVE_MMX2 |
| 3524 /* | 3524 /* |
| 3525 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | 3525 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
| 3526 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | 3526 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
| 3527 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | 3527 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
| 3528 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | 3528 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
| 3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), | 3545 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
| 3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) | 3546 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
| 3547 : "%"REG_a, "%"REG_d | 3547 : "%"REG_a, "%"REG_d |
| 3548 ); | 3548 ); |
| 3549 | 3549 |
| 3550 #elif defined(HAVE_3DNOW) | 3550 #elif HAVE_3DNOW |
| 3551 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... | 3551 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... |
| 3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | 3552 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
| 3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | 3553 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
| 3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | 3554 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
| 3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | 3555 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
| 3589 }else if(mode & V_A_DEBLOCK){ | 3589 }else if(mode & V_A_DEBLOCK){ |
| 3590 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); | 3590 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); |
| 3591 } | 3591 } |
| 3592 } | 3592 } |
| 3593 | 3593 |
| 3594 #ifdef HAVE_MMX | 3594 #if HAVE_MMX |
| 3595 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); | 3595 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
| 3596 #endif | 3596 #endif |
| 3597 /* check if we have a previous block to deblock it with dstBlock */ | 3597 /* check if we have a previous block to deblock it with dstBlock */ |
| 3598 if(x - 8 >= 0){ | 3598 if(x - 8 >= 0){ |
| 3599 #ifdef HAVE_MMX | 3599 #if HAVE_MMX |
| 3600 if(mode & H_X1_FILTER) | 3600 if(mode & H_X1_FILTER) |
| 3601 RENAME(vertX1Filter)(tempBlock1, 16, &c); | 3601 RENAME(vertX1Filter)(tempBlock1, 16, &c); |
| 3602 else if(mode & H_DEBLOCK){ | 3602 else if(mode & H_DEBLOCK){ |
| 3603 //START_TIMER | 3603 //START_TIMER |
| 3604 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); | 3604 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
| 3615 | 3615 |
| 3616 #else | 3616 #else |
| 3617 if(mode & H_X1_FILTER) | 3617 if(mode & H_X1_FILTER) |
| 3618 horizX1Filter(dstBlock-4, stride, QP); | 3618 horizX1Filter(dstBlock-4, stride, QP); |
| 3619 else if(mode & H_DEBLOCK){ | 3619 else if(mode & H_DEBLOCK){ |
| 3620 #ifdef HAVE_ALTIVEC | 3620 #if HAVE_ALTIVEC |
| 3621 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]); | 3621 DECLARE_ALIGNED(16, unsigned char, tempBlock[272]); |
| 3622 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); | 3622 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); |
| 3623 | 3623 |
| 3624 const int t=vertClassify_altivec(tempBlock-48, 16, &c); | 3624 const int t=vertClassify_altivec(tempBlock-48, 16, &c); |
| 3625 if(t==1) { | 3625 if(t==1) { |
| 3657 } | 3657 } |
| 3658 | 3658 |
| 3659 dstBlock+=8; | 3659 dstBlock+=8; |
| 3660 srcBlock+=8; | 3660 srcBlock+=8; |
| 3661 | 3661 |
| 3662 #ifdef HAVE_MMX | 3662 #if HAVE_MMX |
| 3663 tmpXchg= tempBlock1; | 3663 tmpXchg= tempBlock1; |
| 3664 tempBlock1= tempBlock2; | 3664 tempBlock1= tempBlock2; |
| 3665 tempBlock2 = tmpXchg; | 3665 tempBlock2 = tmpXchg; |
| 3666 #endif | 3666 #endif |
| 3667 } | 3667 } |
| 3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; | 3697 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
| 3698 + dstBlock[x +13*dstStride] | 3698 + dstBlock[x +13*dstStride] |
| 3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; | 3699 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
| 3700 }*/ | 3700 }*/ |
| 3701 } | 3701 } |
| 3702 #ifdef HAVE_3DNOW | 3702 #if HAVE_3DNOW |
| 3703 __asm__ volatile("femms"); | 3703 __asm__ volatile("femms"); |
| 3704 #elif defined (HAVE_MMX) | 3704 #elif HAVE_MMX |
| 3705 __asm__ volatile("emms"); | 3705 __asm__ volatile("emms"); |
| 3706 #endif | 3706 #endif |
| 3707 | 3707 |
| 3708 #ifdef DEBUG_BRIGHTNESS | 3708 #ifdef DEBUG_BRIGHTNESS |
| 3709 if(!isColor){ | 3709 if(!isColor){ |
