comparison libpostproc/postprocess.c @ 142:da4c751fc151 libavcodec
deinterlace bugfix
| author | michael |
|---|---|
| date | Wed, 31 Oct 2001 18:29:03 +0000 |
| parents | 626bfabff1f5 |
| children | 1cfc4d567c0a |
| 141:626bfabff1f5 | 142:da4c751fc151 |
|---|---|
| 2115 #endif | 2115 #endif |
| 2116 } | 2116 } |
| 2117 | 2117 |
| 2118 /** | 2118 /** |
| 2119 * Deinterlaces the given block | 2119 * Deinterlaces the given block |
| 2120 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2120 * will be called for every 8x8 block and can read & write from line 4-15 |
| 2121 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 2122 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 2121 */ | 2123 */ |
| 2122 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) | 2124 static inline void deInterlaceInterpolateLinear(uint8_t src[], int stride) |
| 2123 { | 2125 { |
| 2124 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) | 2126 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) |
| 2127 src+= 4*stride; | |
| 2125 asm volatile( | 2128 asm volatile( |
| 2126 "leal (%0, %1), %%eax \n\t" | 2129 "leal (%0, %1), %%eax \n\t" |
| 2127 "leal (%%eax, %1, 4), %%ebx \n\t" | 2130 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 2128 // 0 1 2 3 4 5 6 7 8 9 | 2131 // 0 1 2 3 4 5 6 7 8 9 |
| 2129 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 2132 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 2145 : : "r" (src), "r" (stride) | 2148 : : "r" (src), "r" (stride) |
| 2146 : "%eax", "%ebx" | 2149 : "%eax", "%ebx" |
| 2147 ); | 2150 ); |
| 2148 #else | 2151 #else |
| 2149 int x; | 2152 int x; |
| 2153 src+= 4*stride; | |
| 2150 for(x=0; x<8; x++) | 2154 for(x=0; x<8; x++) |
| 2151 { | 2155 { |
| 2152 src[stride] = (src[0] + src[stride*2])>>1; | 2156 src[stride] = (src[0] + src[stride*2])>>1; |
| 2153 src[stride*3] = (src[stride*2] + src[stride*4])>>1; | 2157 src[stride*3] = (src[stride*2] + src[stride*4])>>1; |
| 2154 src[stride*5] = (src[stride*4] + src[stride*6])>>1; | 2158 src[stride*5] = (src[stride*4] + src[stride*6])>>1; |
| 2158 #endif | 2162 #endif |
| 2159 } | 2163 } |
| 2160 | 2164 |
| 2161 /** | 2165 /** |
| 2162 * Deinterlaces the given block | 2166 * Deinterlaces the given block |
| 2163 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2167 * will be called for every 8x8 block and can read & write from line 4-15 |
| 2168 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 2169 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 2170 * this filter will read lines 3-15 and write 7-13 | |
| 2164 * no clipping in C version | 2171 * no clipping in C version |
| 2165 */ | 2172 */ |
| 2166 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) | 2173 static inline void deInterlaceInterpolateCubic(uint8_t src[], int stride) |
| 2167 { | 2174 { |
| 2168 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) | 2175 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) |
| 2176 src+= stride*3; | |
| 2169 asm volatile( | 2177 asm volatile( |
| 2170 "leal (%0, %1), %%eax \n\t" | 2178 "leal (%0, %1), %%eax \n\t" |
| 2171 "leal (%%eax, %1, 4), %%ebx \n\t" | 2179 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 2172 "leal (%%ebx, %1, 4), %%ecx \n\t" | 2180 "leal (%%ebx, %1, 4), %%ecx \n\t" |
| 2173 "addl %1, %%ecx \n\t" | 2181 "addl %1, %%ecx \n\t" |
| 2205 : : "r" (src), "r" (stride) | 2213 : : "r" (src), "r" (stride) |
| 2206 : "%eax", "%ebx", "ecx" | 2214 : "%eax", "%ebx", "ecx" |
| 2207 ); | 2215 ); |
| 2208 #else | 2216 #else |
| 2209 int x; | 2217 int x; |
| 2218 src+= stride*3; | |
| 2210 for(x=0; x<8; x++) | 2219 for(x=0; x<8; x++) |
| 2211 { | 2220 { |
| 2212 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; | 2221 src[stride*3] = (-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4; |
| 2213 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; | 2222 src[stride*5] = (-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4; |
| 2214 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; | 2223 src[stride*7] = (-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4; |
| 2218 #endif | 2227 #endif |
| 2219 } | 2228 } |
| 2220 | 2229 |
| 2221 /** | 2230 /** |
| 2222 * Deinterlaces the given block | 2231 * Deinterlaces the given block |
| 2223 * will be called for every 8x8 block, and can read & write into an 8x16 block | 2232 * will be called for every 8x8 block and can read & write from line 4-15 |
| 2233 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 2234 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 2224 * will shift the image up by 1 line (FIXME if this is a problem) | 2235 * will shift the image up by 1 line (FIXME if this is a problem) |
| 2236 * this filter will read lines 4-13 and write 4-11 | |
| 2225 */ | 2237 */ |
| 2226 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) | 2238 static inline void deInterlaceBlendLinear(uint8_t src[], int stride) |
| 2227 { | 2239 { |
| 2228 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) | 2240 #if defined (HAVE_MMX2) \|\| defined (HAVE_3DNOW) |
| 2241 src+= 4*stride; | |
| 2229 asm volatile( | 2242 asm volatile( |
| 2230 "leal (%0, %1), %%eax \n\t" | 2243 "leal (%0, %1), %%eax \n\t" |
| 2231 "leal (%%eax, %1, 4), %%ebx \n\t" | 2244 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 2232 // 0 1 2 3 4 5 6 7 8 9 | 2245 // 0 1 2 3 4 5 6 7 8 9 |
| 2233 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 | 2246 // %0 eax eax+%1 eax+2%1 %0+4%1 ebx ebx+%1 ebx+2%1 %0+8%1 ebx+4%1 |
| 2271 : : "r" (src), "r" (stride) | 2284 : : "r" (src), "r" (stride) |
| 2272 : "%eax", "%ebx" | 2285 : "%eax", "%ebx" |
| 2273 ); | 2286 ); |
| 2274 #else | 2287 #else |
| 2275 int x; | 2288 int x; |
| 2289 src+= 4*stride; | |
| 2276 for(x=0; x<8; x++) | 2290 for(x=0; x<8; x++) |
| 2277 { | 2291 { |
| 2278 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | 2292 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
| 2279 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | 2293 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
| 2280 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | 2294 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
| 2288 #endif | 2302 #endif |
| 2289 } | 2303 } |
| 2290 | 2304 |
| 2291 /** | 2305 /** |
| 2292 * Deinterlaces the given block | 2306 * Deinterlaces the given block |
| 2293 * will be called for every 8x8 block, except the last row, and can read & write into an 8x16 block | 2307 * will be called for every 8x8 block and can read & write from line 4-15, |
| 2308 * lines 0-3 have been passed through the deblock / dering filters already, but can be read too | |
| 2309 * lines 4-12 will be read into the deblocking filter and should be deinterlaced | |
| 2294 */ | 2310 */ |
| 2295 static inline void deInterlaceMedian(uint8_t src[], int stride) | 2311 static inline void deInterlaceMedian(uint8_t src[], int stride) |
| 2296 { | 2312 { |
| 2297 #ifdef HAVE_MMX | 2313 #ifdef HAVE_MMX |
| 2314 src+= 4*stride; | |
| 2298 #ifdef HAVE_MMX2 | 2315 #ifdef HAVE_MMX2 |
| 2299 asm volatile( | 2316 asm volatile( |
| 2300 "leal (%0, %1), %%eax \n\t" | 2317 "leal (%0, %1), %%eax \n\t" |
| 2301 "leal (%%eax, %1, 4), %%ebx \n\t" | 2318 "leal (%%eax, %1, 4), %%ebx \n\t" |
| 2302 // 0 1 2 3 4 5 6 7 8 9 | 2319 // 0 1 2 3 4 5 6 7 8 9 |
| 2386 ); | 2403 ); |
| 2387 #endif // MMX | 2404 #endif // MMX |
| 2388 #else | 2405 #else |
| 2389 //FIXME | 2406 //FIXME |
| 2390 int x; | 2407 int x; |
| 2408 src+= 4*stride; | |
| 2391 for(x=0; x<8; x++) | 2409 for(x=0; x<8; x++) |
| 2392 { | 2410 { |
| 2393 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; | 2411 src[0 ] = (src[0 ] + 2*src[stride ] + src[stride*2])>>2; |
| 2394 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; | 2412 src[stride ] = (src[stride ] + 2*src[stride*2] + src[stride*3])>>2; |
| 2395 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; | 2413 src[stride*2] = (src[stride*2] + 2*src[stride*3] + src[stride*4])>>2; |
| 2772 horizontal_size >>= 1; | 2790 horizontal_size >>= 1; |
| 2773 vertical_size >>= 1; | 2791 vertical_size >>= 1; |
| 2774 src_stride >>= 1; | 2792 src_stride >>= 1; |
| 2775 dst_stride >>= 1; | 2793 dst_stride >>= 1; |
| 2776 mode= ((mode&0xFF)>>4) \| (mode&0xFFFFFF00); | 2794 mode= ((mode&0xFF)>>4) \| (mode&0xFFFFFF00); |
| 2795 // mode&= ~(LINEAR_IPOL_DEINT_FILTER \| LINEAR_BLEND_DEINT_FILTER \| | |
| 2796 // MEDIAN_DEINT_FILTER \| CUBIC_IPOL_DEINT_FILTER); | |
| 2777 | 2797 |
| 2778 if(1) | 2798 if(1) |
| 2779 { | 2799 { |
| 2780 postProcess(src[1], src_stride, dst[1], dst_stride, | 2800 postProcess(src[1], src_stride, dst[1], dst_stride, |
| 2781 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); | 2801 horizontal_size, vertical_size, QP_store, QP_stride, 1, mode); |
| 3086 } | 3106 } |
| 3087 | 3107 |
| 3088 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; | 3108 if(mode & LEVEL_FIX) QPCorrecture= packedYScale &0xFFFF; |
| 3089 else QPCorrecture= 256; | 3109 else QPCorrecture= 256; |
| 3090 | 3110 |
| 3091 /* copy first row of 8x8 blocks */ | 3111 /* line before the first one */ |
| 3092 for(x=0; x<width; x+=BLOCK_SIZE) | 3112 y=-BLOCK_SIZE; |
| 3093 blockCopy(dst + x, dstStride, src + x, srcStride, 8, mode & LEVEL_FIX); | 3113 { |
| 3114 //1% speedup if these are here instead of the inner loop | |
| 3115 uint8_t *srcBlock= &(src[y*srcStride]); | |
| 3116 uint8_t *dstBlock= &(dst[y*dstStride]); | |
| 3117 | |
| 3118 dstBlock= tempDst + dstStride; | |
| 3119 | |
| 3120 // From this point on it is guaranteed that we can read and write 16 lines downward | |
| 3121 // finish 1 block before the next, otherwise we might have a problem | |
| 3122 // with the L1 Cache of the P4 ... or only a few blocks at a time or something | |
| 3123 for(x=0; x<width; x+=BLOCK_SIZE) | |
| 3124 { | |
| 3125 | |
| 3126 #ifdef HAVE_MMX2 | |
| 3127 /* | |
| 3128 prefetchnta(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3129 prefetchnta(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3130 prefetcht0(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3131 prefetcht0(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3132 */ | |
| 3133 /* | |
| 3134 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); | |
| 3135 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); | |
| 3136 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); | |
| 3137 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); | |
| 3138 */ | |
| 3139 | |
| 3140 asm( | |
| 3141 "movl %4, %%eax \n\t" | |
| 3142 "shrl $2, %%eax \n\t" | |
| 3143 "andl $6, %%eax \n\t" | |
| 3144 "addl $8, %%eax \n\t" | |
| 3145 "movl %%eax, %%ebx \n\t" | |
| 3146 "imul %1, %%eax \n\t" | |
| 3147 "imul %3, %%ebx \n\t" | |
| 3148 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3149 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3150 "addl %1, %%eax \n\t" | |
| 3151 "addl %3, %%ebx \n\t" | |
| 3152 "prefetchnta 32(%%eax, %0) \n\t" | |
| 3153 "prefetcht0 32(%%ebx, %2) \n\t" | |
| 3154 :: "r" (srcBlock), "r" (srcStride), "r" (dstBlock), "r" (dstStride), | |
| 3155 "m" (x) | |
| 3156 : "%eax", "%ebx" | |
| 3157 ); | |
| 3158 | |
| 3159 #elif defined(HAVE_3DNOW) | |
| 3160 //FIXME check if this is faster on a 3dnow chip or if it's faster without the prefetch or ... | |
| 3161 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); | |
| 3162 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); | |
| 3163 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); | |
| 3164 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); | |
| 3165 */ | |
| 3166 #endif | |
| 3167 | |
| 3168 blockCopy(dstBlock + dstStride*8, dstStride, | |
| 3169 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); | |
| 3170 | |
| 3171 if(mode & LINEAR_IPOL_DEINT_FILTER) | |
| 3172 deInterlaceInterpolateLinear(dstBlock, dstStride); | |
| 3173 else if(mode & LINEAR_BLEND_DEINT_FILTER) | |
| 3174 deInterlaceBlendLinear(dstBlock, dstStride); | |
| 3175 else if(mode & MEDIAN_DEINT_FILTER) | |
| 3176 deInterlaceMedian(dstBlock, dstStride); | |
| 3177 else if(mode & CUBIC_IPOL_DEINT_FILTER) | |
| 3178 deInterlaceInterpolateCubic(dstBlock, dstStride); | |
| 3179 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) | |
| 3180 deInterlaceBlendCubic(dstBlock, dstStride); | |
| 3181 */ | |
| 3182 dstBlock+=8; | |
| 3183 srcBlock+=8; | |
| 3184 } | |
| 3185 memcpy(&(dst[y*dstStride]) + 8*dstStride, tempDst + 9*dstStride, 8*dstStride ); | |
| 3186 } | |
| 3094 | 3187 |
| 3095 for(y=0; y<height; y+=BLOCK_SIZE) | 3188 for(y=0; y<height; y+=BLOCK_SIZE) |
| 3096 { | 3189 { |
| 3097 //1% speedup if these are here instead of the inner loop | 3190 //1% speedup if these are here instead of the inner loop |
| 3098 uint8_t *srcBlock= &(src[y*srcStride]); | 3191 uint8_t *srcBlock= &(src[y*srcStride]); |
| 3106 #endif | 3199 #endif |
| 3107 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards | 3200 /* can we mess with an 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
| 3108 if not then use a temporary buffer */ | 3201 if not then use a temporary buffer */ |
| 3109 if(y+15 >= height) | 3202 if(y+15 >= height) |
| 3110 { | 3203 { |
| 3111 /* copy from line 5 to 12 of src, these will be copied with | 3204 /* copy from line 8 to 15 of src, these will be copied with |
| 3112 blockcopy to dst later */ | 3205 blockcopy to dst later */ |
| 3113 memcpy(tempSrc + srcStride*5, srcBlock + srcStride*5, | 3206 memcpy(tempSrc + srcStride*8, srcBlock + srcStride*8, |
| 3114 srcStride*MAX(height-y-5, 0) ); | 3207 srcStride*MAX(height-y-8, 0) ); |
| 3115 | 3208 |
| 3116 /* duplicate last line to fill the void up to line 12 */ | 3209 /* duplicate last line to fill the void up to line 15 */ |
| 3117 if(y+12 >= height) | 3210 if(y+15 >= height) |
| 3118 { | 3211 { |
| 3119 int i; | 3212 int i; |
| 3120 for(i=height-y; i<=12; i++) | 3213 for(i=height-y; i<=15; i++) |
| 3121 memcpy(tempSrc + srcStride*i, | 3214 memcpy(tempSrc + srcStride*i, |
| 3122 src + srcStride*(height-1), srcStride); | 3215 src + srcStride*(height-1), srcStride); |
| 3123 } | 3216 } |
| 3124 | 3217 |
| 3125 | 3218 /* copy up to 9 lines of dst */ |
| 3126 /* copy up to 6 lines of dst */ | 3219 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 9) ); |
| 3127 memcpy(tempDst, dstBlock - dstStride, dstStride*MIN(height-y+1, 6) ); | |
| 3128 dstBlock= tempDst + dstStride; | 3220 dstBlock= tempDst + dstStride; |
| 3129 srcBlock= tempSrc; | 3221 srcBlock= tempSrc; |
| 3130 } | 3222 } |
| 3131 | 3223 |
| 3132 // From this point on it is guaranteed that we can read and write 16 lines downward | 3224 // From this point on it is guaranteed that we can read and write 16 lines downward |
| 3188 | 3280 |
| 3189 asm( | 3281 asm( |
| 3190 "movl %4, %%eax \n\t" | 3282 "movl %4, %%eax \n\t" |
| 3191 "shrl $2, %%eax \n\t" | 3283 "shrl $2, %%eax \n\t" |
| 3192 "andl $6, %%eax \n\t" | 3284 "andl $6, %%eax \n\t" |
| 3193 "addl $5, %%eax \n\t" | 3285 "addl $8, %%eax \n\t" |
| 3194 "movl %%eax, %%ebx \n\t" | 3286 "movl %%eax, %%ebx \n\t" |
| 3195 "imul %1, %%eax \n\t" | 3287 "imul %1, %%eax \n\t" |
| 3196 "imul %3, %%ebx \n\t" | 3288 "imul %3, %%ebx \n\t" |
| 3197 "prefetchnta 32(%%eax, %0) \n\t" | 3289 "prefetchnta 32(%%eax, %0) \n\t" |
| 3198 "prefetcht0 32(%%ebx, %2) \n\t" | 3290 "prefetcht0 32(%%ebx, %2) \n\t" |
| 3231 dstBlock= tempDstBlock; | 3323 dstBlock= tempDstBlock; |
| 3232 srcBlock= tempSrcBlock; | 3324 srcBlock= tempSrcBlock; |
| 3233 } | 3325 } |
| 3234 #endif | 3326 #endif |
| 3235 | 3327 |
| 3236 blockCopy(dstBlock + dstStride*5, dstStride, | 3328 blockCopy(dstBlock + dstStride*8, dstStride, |
| 3237 srcBlock + srcStride*5, srcStride, 8, mode & LEVEL_FIX); | 3329 srcBlock + srcStride*8, srcStride, 8, mode & LEVEL_FIX); |
| 3238 | 3330 |
| 3239 if(mode & LINEAR_IPOL_DEINT_FILTER) | 3331 if(mode & LINEAR_IPOL_DEINT_FILTER) |
| 3240 deInterlaceInterpolateLinear(dstBlock, dstStride); | 3332 deInterlaceInterpolateLinear(dstBlock, dstStride); |
| 3241 else if(mode & LINEAR_BLEND_DEINT_FILTER) | 3333 else if(mode & LINEAR_BLEND_DEINT_FILTER) |
| 3242 deInterlaceBlendLinear(dstBlock, dstStride); | 3334 deInterlaceBlendLinear(dstBlock, dstStride); |
| 3359 tempBlock1= tempBlock2; | 3451 tempBlock1= tempBlock2; |
| 3360 tempBlock2 = tmpXchg; | 3452 tempBlock2 = tmpXchg; |
| 3361 #endif | 3453 #endif |
| 3362 } | 3454 } |
| 3363 | 3455 |
| 3364 /* did we use a tmp buffer */ | 3456 /* did we use a tmp buffer for the last lines */ |
| 3365 if(y+15 >= height) | 3457 if(y+15 >= height) |
| 3366 { | 3458 { |
| 3367 uint8_t *dstBlock= &(dst[y*dstStride]); | 3459 uint8_t *dstBlock= &(dst[y*dstStride]); |
| 3368 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); | 3460 memcpy(dstBlock, tempDst + dstStride, dstStride*(height-y) ); |
| 3369 } | 3461 } |
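
For reference, a minimal standalone C sketch of what the plain-C path of deInterlaceInterpolateLinear does after this change: the caller passes the top of an 8x16 window, the filter skips the four lines the deblock/dering filters already produced, and rebuilds every second line of the block as the average of its two vertical neighbours. The function name and loop shape are illustrative, not the exact library code.

```c
#include <stdint.h>

/* Illustrative sketch of the C fallback of deInterlaceInterpolateLinear()
 * after the fix; not the exact library code. `src` points at line 0 of an
 * 8x16 window whose lines 0-3 were already deblocked/deringed. */
static void deinterlace_linear_sketch(uint8_t *src, int stride)
{
    int x, k;
    src += 4 * stride;                        /* work on lines 4..15, as the fix does */
    for (x = 0; x < 8; x++)                   /* 8 columns of the block */
    {
        for (k = 1; k <= 7; k += 2)           /* every second line of the block */
            src[stride * k] = (src[stride * (k - 1)] + src[stride * (k + 1)]) >> 1;
        src++;
    }
}
```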
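
The linear-blend C fallback follows the same pattern but replaces each line with a (1, 2, 1)/4 vertical average, which is why the original comment warns that the image shifts up by one line. A hedged sketch, again with illustrative names:

```c
#include <stdint.h>

/* Illustrative sketch of the C fallback of deInterlaceBlendLinear(): each
 * line becomes (line + 2*next + next2) / 4, reading lines 4-13 of the window
 * and writing lines 4-11, as the new comment in the diff states. */
static void deinterlace_blend_sketch(uint8_t *src, int stride)
{
    int x, k;
    src += 4 * stride;
    for (x = 0; x < 8; x++)
    {
        for (k = 0; k < 8; k++)               /* in-place; every read is at or below the write */
            src[stride * k] = (src[stride * k]
                               + 2 * src[stride * (k + 1)]
                               + src[stride * (k + 2)]) >> 2;
        src++;
    }
}
```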
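
The cubic interpolation path uses a 4-tap (-1, 9, 9, -1)/16 kernel, and its comment notes that the C version does no clipping, so the result can wrap when the kernel overshoots the 8-bit range. Below is a sketch of the same kernel with an explicit clamp added; the clamp and the helper names are additions for illustration, not part of the diff.

```c
#include <stdint.h>

/* Hypothetical clamped variant of the C cubic deinterlacer; the diff's C
 * version omits the clamp ("no clipping in C version"). */
static uint8_t clamp_u8(int v)
{
    return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void deinterlace_cubic_sketch(uint8_t *src, int stride)
{
    int x, k;
    src += 3 * stride;                        /* one extra line of context above, as in the diff */
    for (x = 0; x < 8; x++)
    {
        for (k = 3; k <= 9; k += 2)           /* rebuild every second line from 4 vertical neighbours */
            src[stride * k] = clamp_u8((-src[stride * (k - 3)]
                                        + 9 * src[stride * (k - 1)]
                                        + 9 * src[stride * (k + 1)]
                                        -     src[stride * (k + 3)]) >> 4);
        src++;
    }
}
```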
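
The C fallback of deInterlaceMedian is marked //FIXME and simply reuses the blend kernel. Purely as an illustration of what a real C median would look like, here is a per-pixel vertical median-of-three; the diff excerpt does not show the MMX2 median kernel, so this is not claimed to match it exactly.

```c
#include <stdint.h>

/* Illustrative only: median of a pixel and its two vertical neighbours,
 * one possible way to resolve the FIXME in the C path of deInterlaceMedian(). */
static uint8_t median3_u8(uint8_t a, uint8_t b, uint8_t c)
{
    if (a > b) { uint8_t t = a; a = b; b = t; }   /* now a <= b */
    if (b > c) b = c;                              /* b = min(max(a0,b0), c) */
    return a > b ? a : b;                          /* max(min(a0,b0), b) = median */
}

static void deinterlace_median_sketch(uint8_t *src, int stride)
{
    int x, k;
    src += 4 * stride;
    for (x = 0; x < 8; x++)
    {
        for (k = 1; k <= 7; k += 2)               /* replace every second line by the vertical median */
            src[stride * k] = median3_u8(src[stride * (k - 1)],
                                         src[stride * k],
                                         src[stride * (k + 1)]);
        src++;
    }
}
```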
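
The MMX2 prefetch asm in both the new first-row pass and the main loop derives the lines to prefetch from the block's x position; the bugfix changes the base from line 5 to line 8 to match the new block layout. Written out as plain C for illustration, using GCC's __builtin_prefetch as a rough stand-in for prefetchnta/prefetcht0; the helper name is hypothetical.

```c
#include <stdint.h>

/* Approximate C equivalent of the prefetch asm in the diff: touch two source
 * and two destination lines of an upcoming block, 32 bytes into each line.
 * ((x>>2)&6)+8 cycles through lines 8, 10, 12, 14 as x advances by 8. */
static void prefetch_next_block(const uint8_t *srcBlock, int srcStride,
                                const uint8_t *dstBlock, int dstStride, int x)
{
    int line = ((x >> 2) & 6) + 8;            /* was +5 before the bugfix */

    __builtin_prefetch(srcBlock + line       * srcStride + 32, 0); /* read,  like prefetchnta */
    __builtin_prefetch(dstBlock + line       * dstStride + 32, 1); /* write, like prefetcht0  */
    __builtin_prefetch(srcBlock + (line + 1) * srcStride + 32, 0);
    __builtin_prefetch(dstBlock + (line + 1) * dstStride + 32, 1);
}
```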
