src/ffmpeg/libavcodec/dsputil.c @ 808:e8776388b02a (trunk)

[svn] - add ffmpeg
author nenolod
date Mon, 12 Mar 2007 11:18:54 -0700
1 /*
2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 *
22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
23 */
24
25 /**
26 * @file dsputil.c
27 * DSP utils
28 */
29
30 #include "avcodec.h"
31 #include "dsputil.h"
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
34 #include "faandct.h"
35 #include "snow.h"
36
37 /* snow.c */
38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
39
40 /* vorbis.c */
41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
42
43 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t squareTbl[512] = {0, };
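/* (cropTbl + MAX_NEG_CROP)[x] clips x to 0..255 by table lookup, and
   (squareTbl + 256)[d] == d*d for -256 <= d < 256. Both are filled at init
   time (dsputil_init(), further down in the full file); a minimal sketch of
   the intended contents, for illustration only: */
#if 0
static void init_crop_and_square_tables(void){ /* hypothetical helper */
    int i;
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;  /* identity in range */
    for(i=0;i<MAX_NEG_CROP;i++){
        cropTbl[i] = 0;                                /* clamp from below */
        cropTbl[i + MAX_NEG_CROP + 256] = 255;         /* clamp from above */
    }
    for(i=0;i<512;i++) squareTbl[i] = (i - 256) * (i - 256);
}
#endif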
45
46 const uint8_t ff_zigzag_direct[64] = {
47 0, 1, 8, 16, 9, 2, 3, 10,
48 17, 24, 32, 25, 18, 11, 4, 5,
49 12, 19, 26, 33, 40, 48, 41, 34,
50 27, 20, 13, 6, 7, 14, 21, 28,
51 35, 42, 49, 56, 57, 50, 43, 36,
52 29, 22, 15, 23, 30, 37, 44, 51,
53 58, 59, 52, 45, 38, 31, 39, 46,
54 53, 60, 61, 54, 47, 55, 62, 63
55 };
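/* A scan table maps coding (scan) order to raster order: coefficient n from
   the bitstream lands at block[scantable[n]]. Illustrative de-zigzag of a
   hypothetical level[] array holding 64 coefficients in scan order: */
#if 0
    for(n = 0; n < 64; n++)
        block[ff_zigzag_direct[n]] = level[n];
#endif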
56
57 /* Specific zigzag scan for the 2-4-8 IDCT. Note that, unlike the
58 specification, we interleave the fields. */
59 const uint8_t ff_zigzag248_direct[64] = {
60 0, 8, 1, 9, 16, 24, 2, 10,
61 17, 25, 32, 40, 48, 56, 33, 41,
62 18, 26, 3, 11, 4, 12, 19, 27,
63 34, 42, 49, 57, 50, 58, 35, 43,
64 20, 28, 5, 13, 6, 14, 21, 29,
65 36, 44, 51, 59, 52, 60, 37, 45,
66 22, 30, 7, 15, 23, 31, 38, 46,
67 53, 61, 54, 62, 39, 47, 55, 63,
68 };
69
70 /* not permuted inverse zigzag_direct + 1 for the MMX quantizer */
71 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
72
73 const uint8_t ff_alternate_horizontal_scan[64] = {
74 0, 1, 2, 3, 8, 9, 16, 17,
75 10, 11, 4, 5, 6, 7, 15, 14,
76 13, 12, 19, 18, 24, 25, 32, 33,
77 26, 27, 20, 21, 22, 23, 28, 29,
78 30, 31, 34, 35, 40, 41, 48, 49,
79 42, 43, 36, 37, 38, 39, 44, 45,
80 46, 47, 50, 51, 56, 57, 58, 59,
81 52, 53, 54, 55, 60, 61, 62, 63,
82 };
83
84 const uint8_t ff_alternate_vertical_scan[64] = {
85 0, 8, 16, 24, 1, 9, 2, 10,
86 17, 25, 32, 40, 48, 56, 57, 49,
87 41, 33, 26, 18, 3, 11, 4, 12,
88 19, 27, 34, 42, 50, 58, 35, 43,
89 51, 59, 20, 28, 5, 13, 6, 14,
90 21, 29, 36, 44, 52, 60, 37, 45,
91 53, 61, 22, 30, 7, 15, 23, 31,
92 38, 46, 54, 62, 39, 47, 55, 63,
93 };
94
95 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
96 const uint32_t inverse[256]={
97 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
98 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
99 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
100 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
101 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
102 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
103 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
104 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
105 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
106 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
107 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
108 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
109 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
110 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
111 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
112 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
113 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
114 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
115 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
116 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
117 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
118 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
119 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
120 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
121 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
122 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
123 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
124 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
125 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
126 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
127 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
128 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
129 };
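/* Worked example of the identity above: inverse[3] == 1431655766, so for
   a == 1000 the product is 1431655766000 and shifting right by 32 gives
   333 == 1000/3. A sketch of the trick in code (the 64-bit intermediate
   is essential): */
#if 0
    uint32_t q = (uint32_t)(((uint64_t)a * inverse[b]) >> 32); /* == a/b */
#endif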
130
131 /* Input permutation for the simple_idct_mmx */
132 static const uint8_t simple_mmx_permutation[64]={
133 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
134 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
135 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
136 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
137 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
138 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
139 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
140 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
141 };
142
143 static int pix_sum_c(uint8_t * pix, int line_size)
144 {
145 int s, i, j;
146
147 s = 0;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
150 s += pix[0];
151 s += pix[1];
152 s += pix[2];
153 s += pix[3];
154 s += pix[4];
155 s += pix[5];
156 s += pix[6];
157 s += pix[7];
158 pix += 8;
159 }
160 pix += line_size - 16;
161 }
162 return s;
163 }
164
165 static int pix_norm1_c(uint8_t * pix, int line_size)
166 {
167 int s, i, j;
168 uint32_t *sq = squareTbl + 256;
169
170 s = 0;
171 for (i = 0; i < 16; i++) {
172 for (j = 0; j < 16; j += 8) {
173 #if 0
174 s += sq[pix[0]];
175 s += sq[pix[1]];
176 s += sq[pix[2]];
177 s += sq[pix[3]];
178 s += sq[pix[4]];
179 s += sq[pix[5]];
180 s += sq[pix[6]];
181 s += sq[pix[7]];
182 #else
183 #if LONG_MAX > 2147483647
184 register uint64_t x=*(uint64_t*)pix;
185 s += sq[x&0xff];
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
189 s += sq[(x>>32)&0xff];
190 s += sq[(x>>40)&0xff];
191 s += sq[(x>>48)&0xff];
192 s += sq[(x>>56)&0xff];
193 #else
194 register uint32_t x=*(uint32_t*)pix;
195 s += sq[x&0xff];
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199 x=*(uint32_t*)(pix+4);
200 s += sq[x&0xff];
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204 #endif
205 #endif
206 pix += 8;
207 }
208 pix += line_size - 16;
209 }
210 return s;
211 }
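/* The LONG_MAX test above selects one 64-bit load per 8 pixels on 64-bit
   hosts and two 32-bit loads otherwise; both branches assume the target
   tolerates the unaligned, type-punned reads (true on the x86 family this
   was tuned for). */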
212
213 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
214 int i;
215
216 for(i=0; i+8<=w; i+=8){
217 dst[i+0]= bswap_32(src[i+0]);
218 dst[i+1]= bswap_32(src[i+1]);
219 dst[i+2]= bswap_32(src[i+2]);
220 dst[i+3]= bswap_32(src[i+3]);
221 dst[i+4]= bswap_32(src[i+4]);
222 dst[i+5]= bswap_32(src[i+5]);
223 dst[i+6]= bswap_32(src[i+6]);
224 dst[i+7]= bswap_32(src[i+7]);
225 }
226 for(;i<w; i++){
227 dst[i+0]= bswap_32(src[i+0]);
228 }
229 }
230
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232 {
233 int s, i;
234 uint32_t *sq = squareTbl + 256;
235
236 s = 0;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
242 pix1 += line_size;
243 pix2 += line_size;
244 }
245 return s;
246 }
247
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249 {
250 int s, i;
251 uint32_t *sq = squareTbl + 256;
252
253 s = 0;
254 for (i = 0; i < h; i++) {
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
263 pix1 += line_size;
264 pix2 += line_size;
265 }
266 return s;
267 }
268
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270 {
271 int s, i;
272 uint32_t *sq = squareTbl + 256;
273
274 s = 0;
275 for (i = 0; i < h; i++) {
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
292
293 pix1 += line_size;
294 pix2 += line_size;
295 }
296 return s;
297 }
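/* The sse*_c functions return a sum of squared differences. A typical
   consumer turns a per-frame total into PSNR; an illustrative sketch,
   assuming ssd was accumulated over n pixels: */
#if 0
    double psnr = ssd ? 10.0 * log10(255.0 * 255.0 * (double)n / (double)ssd)
                      : INFINITY; /* identical frames */
#endif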
298
299
300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
301 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
302 int s, i, j;
303 const int dec_count= w==8 ? 3 : 4;
304 int tmp[32*32];
305 int level, ori;
306 static const int scale[2][2][4][4]={
307 {
308 {
309 // 9/7 8x8 dec=3
310 {268, 239, 239, 213},
311 { 0, 224, 224, 152},
312 { 0, 135, 135, 110},
313 },{
314 // 9/7 16x16 or 32x32 dec=4
315 {344, 310, 310, 280},
316 { 0, 320, 320, 228},
317 { 0, 175, 175, 136},
318 { 0, 129, 129, 102},
319 }
320 },{
321 {
322 // 5/3 8x8 dec=3
323 {275, 245, 245, 218},
324 { 0, 230, 230, 156},
325 { 0, 138, 138, 113},
326 },{
327 // 5/3 16x16 or 32x32 dec=4
328 {352, 317, 317, 286},
329 { 0, 328, 328, 233},
330 { 0, 180, 180, 140},
331 { 0, 132, 132, 105},
332 }
333 }
334 };
335
336 for (i = 0; i < h; i++) {
337 for (j = 0; j < w; j+=4) {
338 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
339 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
340 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
341 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
342 }
343 pix1 += line_size;
344 pix2 += line_size;
345 }
346
347 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
348
349 s=0;
350 assert(w==h);
351 for(level=0; level<dec_count; level++){
352 for(ori= level ? 1 : 0; ori<4; ori++){
353 int size= w>>(dec_count-level);
354 int sx= (ori&1) ? size : 0;
355 int stride= 32<<(dec_count-level);
356 int sy= (ori&2) ? stride>>1 : 0;
357
358 for(i=0; i<size; i++){
359 for(j=0; j<size; j++){
360 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
361 s += FFABS(v);
362 }
363 }
364 }
365 }
366 assert(s>=0);
367 return s>>9;
368 }
369
370 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
371 return w_c(v, pix1, pix2, line_size, 8, h, 1);
372 }
373
374 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375 return w_c(v, pix1, pix2, line_size, 8, h, 0);
376 }
377
378 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379 return w_c(v, pix1, pix2, line_size, 16, h, 1);
380 }
381
382 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 16, h, 0);
384 }
385
386 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 32, h, 1);
388 }
389
390 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391 return w_c(v, pix1, pix2, line_size, 32, h, 0);
392 }
393 #endif
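/* The wrappers above select the wavelet via w_c()'s last argument (type 1 =
   5/3, type 0 = 9/7) and the block size via w; w_c() derives dec_count from
   the size (3 decomposition levels for 8x8, 4 for 16x16/32x32) and weights
   each subband with scale[type][dec_count-3][level][ori]. For example, a
   16x16 9/7 comparison weights the level-0 LL band (ori 0) by
   scale[0][1][0][0], i.e. 344. */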
394
395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
396 {
397 int i;
398
399 /* read the pixels */
400 for(i=0;i<8;i++) {
401 block[0] = pixels[0];
402 block[1] = pixels[1];
403 block[2] = pixels[2];
404 block[3] = pixels[3];
405 block[4] = pixels[4];
406 block[5] = pixels[5];
407 block[6] = pixels[6];
408 block[7] = pixels[7];
409 pixels += line_size;
410 block += 8;
411 }
412 }
413
414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
415 const uint8_t *s2, int stride){
416 int i;
417
418 /* read the pixels */
419 for(i=0;i<8;i++) {
420 block[0] = s1[0] - s2[0];
421 block[1] = s1[1] - s2[1];
422 block[2] = s1[2] - s2[2];
423 block[3] = s1[3] - s2[3];
424 block[4] = s1[4] - s2[4];
425 block[5] = s1[5] - s2[5];
426 block[6] = s1[6] - s2[6];
427 block[7] = s1[7] - s2[7];
428 s1 += stride;
429 s2 += stride;
430 block += 8;
431 }
432 }
433
434
435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
436 int line_size)
437 {
438 int i;
439 uint8_t *cm = cropTbl + MAX_NEG_CROP;
440
441 /* read the pixels */
442 for(i=0;i<8;i++) {
443 pixels[0] = cm[block[0]];
444 pixels[1] = cm[block[1]];
445 pixels[2] = cm[block[2]];
446 pixels[3] = cm[block[3]];
447 pixels[4] = cm[block[4]];
448 pixels[5] = cm[block[5]];
449 pixels[6] = cm[block[6]];
450 pixels[7] = cm[block[7]];
451
452 pixels += line_size;
453 block += 8;
454 }
455 }
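/* cm points MAX_NEG_CROP entries into cropTbl, so out-of-range DCT output
   is clipped by the lookup itself: cm[-5] == 0, cm[300] == 255, and
   cm[x] == x for 0 <= x <= 255. */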
456
457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
458 int line_size)
459 {
460 int i;
461 uint8_t *cm = cropTbl + MAX_NEG_CROP;
462
463 /* read the pixels */
464 for(i=0;i<4;i++) {
465 pixels[0] = cm[block[0]];
466 pixels[1] = cm[block[1]];
467 pixels[2] = cm[block[2]];
468 pixels[3] = cm[block[3]];
469
470 pixels += line_size;
471 block += 8;
472 }
473 }
474
475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
476 int line_size)
477 {
478 int i;
479 uint8_t *cm = cropTbl + MAX_NEG_CROP;
480
481 /* read the pixels */
482 for(i=0;i<2;i++) {
483 pixels[0] = cm[block[0]];
484 pixels[1] = cm[block[1]];
485
486 pixels += line_size;
487 block += 8;
488 }
489 }
490
491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
492 uint8_t *restrict pixels,
493 int line_size)
494 {
495 int i, j;
496
497 for (i = 0; i < 8; i++) {
498 for (j = 0; j < 8; j++) {
499 if (*block < -128)
500 *pixels = 0;
501 else if (*block > 127)
502 *pixels = 255;
503 else
504 *pixels = (uint8_t)(*block + 128);
505 block++;
506 pixels++;
507 }
508 pixels += (line_size - 8);
509 }
510 }
511
512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
513 int line_size)
514 {
515 int i;
516 uint8_t *cm = cropTbl + MAX_NEG_CROP;
517
518 /* read the pixels */
519 for(i=0;i<8;i++) {
520 pixels[0] = cm[pixels[0] + block[0]];
521 pixels[1] = cm[pixels[1] + block[1]];
522 pixels[2] = cm[pixels[2] + block[2]];
523 pixels[3] = cm[pixels[3] + block[3]];
524 pixels[4] = cm[pixels[4] + block[4]];
525 pixels[5] = cm[pixels[5] + block[5]];
526 pixels[6] = cm[pixels[6] + block[6]];
527 pixels[7] = cm[pixels[7] + block[7]];
528 pixels += line_size;
529 block += 8;
530 }
531 }
532
533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
534 int line_size)
535 {
536 int i;
537 uint8_t *cm = cropTbl + MAX_NEG_CROP;
538
539 /* read the pixels */
540 for(i=0;i<4;i++) {
541 pixels[0] = cm[pixels[0] + block[0]];
542 pixels[1] = cm[pixels[1] + block[1]];
543 pixels[2] = cm[pixels[2] + block[2]];
544 pixels[3] = cm[pixels[3] + block[3]];
545 pixels += line_size;
546 block += 8;
547 }
548 }
549
550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
551 int line_size)
552 {
553 int i;
554 uint8_t *cm = cropTbl + MAX_NEG_CROP;
555
556 /* read the pixels */
557 for(i=0;i<2;i++) {
558 pixels[0] = cm[pixels[0] + block[0]];
559 pixels[1] = cm[pixels[1] + block[1]];
560 pixels += line_size;
561 block += 8;
562 }
563 }
564
565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
566 {
567 int i;
568 for(i=0;i<8;i++) {
569 pixels[0] += block[0];
570 pixels[1] += block[1];
571 pixels[2] += block[2];
572 pixels[3] += block[3];
573 pixels[4] += block[4];
574 pixels[5] += block[5];
575 pixels[6] += block[6];
576 pixels[7] += block[7];
577 pixels += line_size;
578 block += 8;
579 }
580 }
581
582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
583 {
584 int i;
585 for(i=0;i<4;i++) {
586 pixels[0] += block[0];
587 pixels[1] += block[1];
588 pixels[2] += block[2];
589 pixels[3] += block[3];
590 pixels += line_size;
591 block += 4;
592 }
593 }
594
595 #if 0
596
597 #define PIXOP2(OPNAME, OP) \
598 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
599 {\
600 int i;\
601 for(i=0; i<h; i++){\
602 OP(*((uint64_t*)block), LD64(pixels));\
603 pixels+=line_size;\
604 block +=line_size;\
605 }\
606 }\
607 \
608 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
609 {\
610 int i;\
611 for(i=0; i<h; i++){\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+1);\
614 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615 pixels+=line_size;\
616 block +=line_size;\
617 }\
618 }\
619 \
620 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
621 {\
622 int i;\
623 for(i=0; i<h; i++){\
624 const uint64_t a= LD64(pixels );\
625 const uint64_t b= LD64(pixels+1);\
626 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627 pixels+=line_size;\
628 block +=line_size;\
629 }\
630 }\
631 \
632 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
633 {\
634 int i;\
635 for(i=0; i<h; i++){\
636 const uint64_t a= LD64(pixels );\
637 const uint64_t b= LD64(pixels+line_size);\
638 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639 pixels+=line_size;\
640 block +=line_size;\
641 }\
642 }\
643 \
644 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
645 {\
646 int i;\
647 for(i=0; i<h; i++){\
648 const uint64_t a= LD64(pixels );\
649 const uint64_t b= LD64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651 pixels+=line_size;\
652 block +=line_size;\
653 }\
654 }\
655 \
656 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 {\
658 int i;\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0202020202020202ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666 uint64_t l1,h1;\
667 \
668 pixels+=line_size;\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
677 pixels+=line_size;\
678 block +=line_size;\
679 a= LD64(pixels );\
680 b= LD64(pixels+1);\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0202020202020202ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687 pixels+=line_size;\
688 block +=line_size;\
689 }\
690 }\
691 \
692 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 {\
694 int i;\
695 const uint64_t a= LD64(pixels );\
696 const uint64_t b= LD64(pixels+1);\
697 uint64_t l0= (a&0x0303030303030303ULL)\
698 + (b&0x0303030303030303ULL)\
699 + 0x0101010101010101ULL;\
700 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
701 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
702 uint64_t l1,h1;\
703 \
704 pixels+=line_size;\
705 for(i=0; i<h; i+=2){\
706 uint64_t a= LD64(pixels );\
707 uint64_t b= LD64(pixels+1);\
708 l1= (a&0x0303030303030303ULL)\
709 + (b&0x0303030303030303ULL);\
710 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
711 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
712 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
713 pixels+=line_size;\
714 block +=line_size;\
715 a= LD64(pixels );\
716 b= LD64(pixels+1);\
717 l0= (a&0x0303030303030303ULL)\
718 + (b&0x0303030303030303ULL)\
719 + 0x0101010101010101ULL;\
720 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
721 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
722 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
723 pixels+=line_size;\
724 block +=line_size;\
725 }\
726 }\
727 \
728 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
730 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
731 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
734 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
735
736 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
737 #else // the 64 bit variant above is disabled; 32 bit code follows
738
739 #define PIXOP2(OPNAME, OP) \
740 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741 int i;\
742 for(i=0; i<h; i++){\
743 OP(*((uint16_t*)(block )), LD16(pixels ));\
744 pixels+=line_size;\
745 block +=line_size;\
746 }\
747 }\
748 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749 int i;\
750 for(i=0; i<h; i++){\
751 OP(*((uint32_t*)(block )), LD32(pixels ));\
752 pixels+=line_size;\
753 block +=line_size;\
754 }\
755 }\
756 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757 int i;\
758 for(i=0; i<h; i++){\
759 OP(*((uint32_t*)(block )), LD32(pixels ));\
760 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
761 pixels+=line_size;\
762 block +=line_size;\
763 }\
764 }\
765 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767 }\
768 \
769 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
771 int i;\
772 for(i=0; i<h; i++){\
773 uint32_t a,b;\
774 a= LD32(&src1[i*src_stride1 ]);\
775 b= LD32(&src2[i*src_stride2 ]);\
776 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
777 a= LD32(&src1[i*src_stride1+4]);\
778 b= LD32(&src2[i*src_stride2+4]);\
779 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
780 }\
781 }\
782 \
783 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
785 int i;\
786 for(i=0; i<h; i++){\
787 uint32_t a,b;\
788 a= LD32(&src1[i*src_stride1 ]);\
789 b= LD32(&src2[i*src_stride2 ]);\
790 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
791 a= LD32(&src1[i*src_stride1+4]);\
792 b= LD32(&src2[i*src_stride2+4]);\
793 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
794 }\
795 }\
796 \
797 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
798 int src_stride1, int src_stride2, int h){\
799 int i;\
800 for(i=0; i<h; i++){\
801 uint32_t a,b;\
802 a= LD32(&src1[i*src_stride1 ]);\
803 b= LD32(&src2[i*src_stride2 ]);\
804 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
805 }\
806 }\
807 \
808 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
809 int src_stride1, int src_stride2, int h){\
810 int i;\
811 for(i=0; i<h; i++){\
812 uint32_t a,b;\
813 a= LD16(&src1[i*src_stride1 ]);\
814 b= LD16(&src2[i*src_stride2 ]);\
815 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
816 }\
817 }\
818 \
819 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
820 int src_stride1, int src_stride2, int h){\
821 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
822 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823 }\
824 \
825 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
826 int src_stride1, int src_stride2, int h){\
827 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
828 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829 }\
830 \
831 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
832 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833 }\
834 \
835 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837 }\
838 \
839 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841 }\
842 \
843 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845 }\
846 \
847 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
848 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849 int i;\
850 for(i=0; i<h; i++){\
851 uint32_t a, b, c, d, l0, l1, h0, h1;\
852 a= LD32(&src1[i*src_stride1]);\
853 b= LD32(&src2[i*src_stride2]);\
854 c= LD32(&src3[i*src_stride3]);\
855 d= LD32(&src4[i*src_stride4]);\
856 l0= (a&0x03030303UL)\
857 + (b&0x03030303UL)\
858 + 0x02020202UL;\
859 h0= ((a&0xFCFCFCFCUL)>>2)\
860 + ((b&0xFCFCFCFCUL)>>2);\
861 l1= (c&0x03030303UL)\
862 + (d&0x03030303UL);\
863 h1= ((c&0xFCFCFCFCUL)>>2)\
864 + ((d&0xFCFCFCFCUL)>>2);\
865 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
866 a= LD32(&src1[i*src_stride1+4]);\
867 b= LD32(&src2[i*src_stride2+4]);\
868 c= LD32(&src3[i*src_stride3+4]);\
869 d= LD32(&src4[i*src_stride4+4]);\
870 l0= (a&0x03030303UL)\
871 + (b&0x03030303UL)\
872 + 0x02020202UL;\
873 h0= ((a&0xFCFCFCFCUL)>>2)\
874 + ((b&0xFCFCFCFCUL)>>2);\
875 l1= (c&0x03030303UL)\
876 + (d&0x03030303UL);\
877 h1= ((c&0xFCFCFCFCUL)>>2)\
878 + ((d&0xFCFCFCFCUL)>>2);\
879 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
880 }\
881 }\
882 \
883 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
884 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885 }\
886 \
887 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
888 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889 }\
890 \
891 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893 }\
894 \
895 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897 }\
898 \
899 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
900 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901 int i;\
902 for(i=0; i<h; i++){\
903 uint32_t a, b, c, d, l0, l1, h0, h1;\
904 a= LD32(&src1[i*src_stride1]);\
905 b= LD32(&src2[i*src_stride2]);\
906 c= LD32(&src3[i*src_stride3]);\
907 d= LD32(&src4[i*src_stride4]);\
908 l0= (a&0x03030303UL)\
909 + (b&0x03030303UL)\
910 + 0x01010101UL;\
911 h0= ((a&0xFCFCFCFCUL)>>2)\
912 + ((b&0xFCFCFCFCUL)>>2);\
913 l1= (c&0x03030303UL)\
914 + (d&0x03030303UL);\
915 h1= ((c&0xFCFCFCFCUL)>>2)\
916 + ((d&0xFCFCFCFCUL)>>2);\
917 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
918 a= LD32(&src1[i*src_stride1+4]);\
919 b= LD32(&src2[i*src_stride2+4]);\
920 c= LD32(&src3[i*src_stride3+4]);\
921 d= LD32(&src4[i*src_stride4+4]);\
922 l0= (a&0x03030303UL)\
923 + (b&0x03030303UL)\
924 + 0x01010101UL;\
925 h0= ((a&0xFCFCFCFCUL)>>2)\
926 + ((b&0xFCFCFCFCUL)>>2);\
927 l1= (c&0x03030303UL)\
928 + (d&0x03030303UL);\
929 h1= ((c&0xFCFCFCFCUL)>>2)\
930 + ((d&0xFCFCFCFCUL)>>2);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932 }\
933 }\
934 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
938 }\
939 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
940 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
941 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943 }\
944 \
945 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
946 {\
947 int i, a0, b0, a1, b1;\
948 a0= pixels[0];\
949 b0= pixels[1] + 2;\
950 a0 += b0;\
951 b0 += pixels[2];\
952 \
953 pixels+=line_size;\
954 for(i=0; i<h; i+=2){\
955 a1= pixels[0];\
956 b1= pixels[1];\
957 a1 += b1;\
958 b1 += pixels[2];\
959 \
960 block[0]= (a1+a0)>>2; /* FIXME non put */\
961 block[1]= (b1+b0)>>2;\
962 \
963 pixels+=line_size;\
964 block +=line_size;\
965 \
966 a0= pixels[0];\
967 b0= pixels[1] + 2;\
968 a0 += b0;\
969 b0 += pixels[2];\
970 \
971 block[0]= (a1+a0)>>2;\
972 block[1]= (b1+b0)>>2;\
973 pixels+=line_size;\
974 block +=line_size;\
975 }\
976 }\
977 \
978 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979 {\
980 int i;\
981 const uint32_t a= LD32(pixels );\
982 const uint32_t b= LD32(pixels+1);\
983 uint32_t l0= (a&0x03030303UL)\
984 + (b&0x03030303UL)\
985 + 0x02020202UL;\
986 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
987 + ((b&0xFCFCFCFCUL)>>2);\
988 uint32_t l1,h1;\
989 \
990 pixels+=line_size;\
991 for(i=0; i<h; i+=2){\
992 uint32_t a= LD32(pixels );\
993 uint32_t b= LD32(pixels+1);\
994 l1= (a&0x03030303UL)\
995 + (b&0x03030303UL);\
996 h1= ((a&0xFCFCFCFCUL)>>2)\
997 + ((b&0xFCFCFCFCUL)>>2);\
998 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999 pixels+=line_size;\
1000 block +=line_size;\
1001 a= LD32(pixels );\
1002 b= LD32(pixels+1);\
1003 l0= (a&0x03030303UL)\
1004 + (b&0x03030303UL)\
1005 + 0x02020202UL;\
1006 h0= ((a&0xFCFCFCFCUL)>>2)\
1007 + ((b&0xFCFCFCFCUL)>>2);\
1008 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009 pixels+=line_size;\
1010 block +=line_size;\
1011 }\
1012 }\
1013 \
1014 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015 {\
1016 int j;\
1017 for(j=0; j<2; j++){\
1018 int i;\
1019 const uint32_t a= LD32(pixels );\
1020 const uint32_t b= LD32(pixels+1);\
1021 uint32_t l0= (a&0x03030303UL)\
1022 + (b&0x03030303UL)\
1023 + 0x02020202UL;\
1024 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025 + ((b&0xFCFCFCFCUL)>>2);\
1026 uint32_t l1,h1;\
1027 \
1028 pixels+=line_size;\
1029 for(i=0; i<h; i+=2){\
1030 uint32_t a= LD32(pixels );\
1031 uint32_t b= LD32(pixels+1);\
1032 l1= (a&0x03030303UL)\
1033 + (b&0x03030303UL);\
1034 h1= ((a&0xFCFCFCFCUL)>>2)\
1035 + ((b&0xFCFCFCFCUL)>>2);\
1036 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1037 pixels+=line_size;\
1038 block +=line_size;\
1039 a= LD32(pixels );\
1040 b= LD32(pixels+1);\
1041 l0= (a&0x03030303UL)\
1042 + (b&0x03030303UL)\
1043 + 0x02020202UL;\
1044 h0= ((a&0xFCFCFCFCUL)>>2)\
1045 + ((b&0xFCFCFCFCUL)>>2);\
1046 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1047 pixels+=line_size;\
1048 block +=line_size;\
1049 }\
1050 pixels+=4-line_size*(h+1);\
1051 block +=4-line_size*h;\
1052 }\
1053 }\
1054 \
1055 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056 {\
1057 int j;\
1058 for(j=0; j<2; j++){\
1059 int i;\
1060 const uint32_t a= LD32(pixels );\
1061 const uint32_t b= LD32(pixels+1);\
1062 uint32_t l0= (a&0x03030303UL)\
1063 + (b&0x03030303UL)\
1064 + 0x01010101UL;\
1065 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066 + ((b&0xFCFCFCFCUL)>>2);\
1067 uint32_t l1,h1;\
1068 \
1069 pixels+=line_size;\
1070 for(i=0; i<h; i+=2){\
1071 uint32_t a= LD32(pixels );\
1072 uint32_t b= LD32(pixels+1);\
1073 l1= (a&0x03030303UL)\
1074 + (b&0x03030303UL);\
1075 h1= ((a&0xFCFCFCFCUL)>>2)\
1076 + ((b&0xFCFCFCFCUL)>>2);\
1077 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078 pixels+=line_size;\
1079 block +=line_size;\
1080 a= LD32(pixels );\
1081 b= LD32(pixels+1);\
1082 l0= (a&0x03030303UL)\
1083 + (b&0x03030303UL)\
1084 + 0x01010101UL;\
1085 h0= ((a&0xFCFCFCFCUL)>>2)\
1086 + ((b&0xFCFCFCFCUL)>>2);\
1087 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088 pixels+=line_size;\
1089 block +=line_size;\
1090 }\
1091 pixels+=4-line_size*(h+1);\
1092 block +=4-line_size*h;\
1093 }\
1094 }\
1095 \
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1104
1105 #define op_avg(a, b) a = rnd_avg32(a, b)
1106 #endif
1107 #define op_put(a, b) a = b
1108
1109 PIXOP2(avg, op_avg)
1110 PIXOP2(put, op_put)
1111 #undef op_avg
1112 #undef op_put
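/* rnd_avg32()/no_rnd_avg32() (from dsputil.h) average four packed bytes at
   once without carries leaking across byte lanes: per byte,
   (a|b) - (((a^b)&0xFE)>>1) == (a+b+1)>>1  (rounding), and
   (a&b) + (((a^b)&0xFE)>>1) == (a+b)>>1    (no rounding).
   Worked single-byte example: a == 1, b == 2 gives
   (1|2) - (((1^2)&0xFE)>>1) == 3 - 1 == 2 == (1+2+1)>>1. */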
1113
1114 #define avg2(a,b) (((a)+(b)+1)>>1)
1115 #define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1116
1117 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1118 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1119 }
1120
1121 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1122 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1123 }
1124
1125 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1126 {
1127 const int A=(16-x16)*(16-y16);
1128 const int B=( x16)*(16-y16);
1129 const int C=(16-x16)*( y16);
1130 const int D=( x16)*( y16);
1131 int i;
1132
1133 for(i=0; i<h; i++)
1134 {
1135 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1136 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1137 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1138 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1139 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1140 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1141 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1142 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1143 dst+= stride;
1144 src+= stride;
1145 }
1146 }
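/* gmc1_c is bilinear interpolation in 1/16-pel steps: the four weights sum
   to 256 ((16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16), so
   the >>8 renormalizes. E.g. x16 == y16 == 8 makes all four weights 64 and
   each output pixel the rounded mean of its 2x2 neighbourhood. */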
1147
1148 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1149 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1150 {
1151 int y, vx, vy;
1152 const int s= 1<<shift;
1153
1154 width--;
1155 height--;
1156
1157 for(y=0; y<h; y++){
1158 int x;
1159
1160 vx= ox;
1161 vy= oy;
1162 for(x=0; x<8; x++){ //XXX FIXME optimize
1163 int src_x, src_y, frac_x, frac_y, index;
1164
1165 src_x= vx>>16;
1166 src_y= vy>>16;
1167 frac_x= src_x&(s-1);
1168 frac_y= src_y&(s-1);
1169 src_x>>=shift;
1170 src_y>>=shift;
1171
1172 if((unsigned)src_x < width){
1173 if((unsigned)src_y < height){
1174 index= src_x + src_y*stride;
1175 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1176 + src[index +1]* frac_x )*(s-frac_y)
1177 + ( src[index+stride ]*(s-frac_x)
1178 + src[index+stride+1]* frac_x )* frac_y
1179 + r)>>(shift*2);
1180 }else{
1181 index= src_x + clip(src_y, 0, height)*stride;
1182 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1183 + src[index +1]* frac_x )*s
1184 + r)>>(shift*2);
1185 }
1186 }else{
1187 if((unsigned)src_y < height){
1188 index= clip(src_x, 0, width) + src_y*stride;
1189 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1190 + src[index+stride ]* frac_y )*s
1191 + r)>>(shift*2);
1192 }else{
1193 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1194 dst[y*stride + x]= src[index ];
1195 }
1196 }
1197
1198 vx+= dxx;
1199 vy+= dyx;
1200 }
1201 ox += dxy;
1202 oy += dyy;
1203 }
1204 }
1205
1206 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1207 switch(width){
1208 case 2: put_pixels2_c (dst, src, stride, height); break;
1209 case 4: put_pixels4_c (dst, src, stride, height); break;
1210 case 8: put_pixels8_c (dst, src, stride, height); break;
1211 case 16:put_pixels16_c(dst, src, stride, height); break;
1212 }
1213 }
1214
1215 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1216 int i,j;
1217 for (i=0; i < height; i++) {
1218 for (j=0; j < width; j++) {
1219 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1220 }
1221 src += stride;
1222 dst += stride;
1223 }
1224 }
1225
1226 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1227 int i,j;
1228 for (i=0; i < height; i++) {
1229 for (j=0; j < width; j++) {
1230 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1231 }
1232 src += stride;
1233 dst += stride;
1234 }
1235 }
1236
1237 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1238 int i,j;
1239 for (i=0; i < height; i++) {
1240 for (j=0; j < width; j++) {
1241 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1242 }
1243 src += stride;
1244 dst += stride;
1245 }
1246 }
1247
1248 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1249 int i,j;
1250 for (i=0; i < height; i++) {
1251 for (j=0; j < width; j++) {
1252 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1253 }
1254 src += stride;
1255 dst += stride;
1256 }
1257 }
1258
1259 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1260 int i,j;
1261 for (i=0; i < height; i++) {
1262 for (j=0; j < width; j++) {
1263 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1264 }
1265 src += stride;
1266 dst += stride;
1267 }
1268 }
1269
1270 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1271 int i,j;
1272 for (i=0; i < height; i++) {
1273 for (j=0; j < width; j++) {
1274 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1275 }
1276 src += stride;
1277 dst += stride;
1278 }
1279 }
1280
1281 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1282 int i,j;
1283 for (i=0; i < height; i++) {
1284 for (j=0; j < width; j++) {
1285 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1286 }
1287 src += stride;
1288 dst += stride;
1289 }
1290 }
1291
1292 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1293 int i,j;
1294 for (i=0; i < height; i++) {
1295 for (j=0; j < width; j++) {
1296 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1297 }
1298 src += stride;
1299 dst += stride;
1300 }
1301 }
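/* The tpel (third-pel, as used by SVQ3) filters replace a division by 3
   with fixed-point reciprocals: 683 == round(2^11/3) and
   2731 == round(2^15/12), so (683*(2*a + b + 1)) >> 11 approximates
   (2*a + b + 1)/3. Worked example: a == b == 90 gives 683*271 == 185093
   and 185093 >> 11 == 90, matching 271/3 rounded down. */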
1302
1303 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1304 switch(width){
1305 case 2: avg_pixels2_c (dst, src, stride, height); break;
1306 case 4: avg_pixels4_c (dst, src, stride, height); break;
1307 case 8: avg_pixels8_c (dst, src, stride, height); break;
1308 case 16:avg_pixels16_c(dst, src, stride, height); break;
1309 }
1310 }
1311
1312 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1313 int i,j;
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1317 }
1318 src += stride;
1319 dst += stride;
1320 }
1321 }
1322
1323 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1324 int i,j;
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1328 }
1329 src += stride;
1330 dst += stride;
1331 }
1332 }
1333
1334 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1335 int i,j;
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1339 }
1340 src += stride;
1341 dst += stride;
1342 }
1343 }
1344
1345 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1346 int i,j;
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1350 }
1351 src += stride;
1352 dst += stride;
1353 }
1354 }
1355
1356 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1357 int i,j;
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1361 }
1362 src += stride;
1363 dst += stride;
1364 }
1365 }
1366
1367 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1368 int i,j;
1369 for (i=0; i < height; i++) {
1370 for (j=0; j < width; j++) {
1371 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1372 }
1373 src += stride;
1374 dst += stride;
1375 }
1376 }
1377
1378 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1379 int i,j;
1380 for (i=0; i < height; i++) {
1381 for (j=0; j < width; j++) {
1382 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1383 }
1384 src += stride;
1385 dst += stride;
1386 }
1387 }
1388
1389 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1390 int i,j;
1391 for (i=0; i < height; i++) {
1392 for (j=0; j < width; j++) {
1393 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1394 }
1395 src += stride;
1396 dst += stride;
1397 }
1398 }
1399 #if 0
1400 #define TPEL_WIDTH(width)\
1401 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1402 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1403 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1404 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1405 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1406 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1407 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1408 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1409 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1411 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1413 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1415 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1417 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1419 #endif
1420
1421 #define H264_CHROMA_MC(OPNAME, OP)\
1422 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1423 const int A=(8-x)*(8-y);\
1424 const int B=( x)*(8-y);\
1425 const int C=(8-x)*( y);\
1426 const int D=( x)*( y);\
1427 int i;\
1428 \
1429 assert(x<8 && y<8 && x>=0 && y>=0);\
1430 \
1431 for(i=0; i<h; i++)\
1432 {\
1433 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1434 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1435 dst+= stride;\
1436 src+= stride;\
1437 }\
1438 }\
1439 \
1440 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1441 const int A=(8-x)*(8-y);\
1442 const int B=( x)*(8-y);\
1443 const int C=(8-x)*( y);\
1444 const int D=( x)*( y);\
1445 int i;\
1446 \
1447 assert(x<8 && y<8 && x>=0 && y>=0);\
1448 \
1449 for(i=0; i<h; i++)\
1450 {\
1451 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1452 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1453 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1454 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1455 dst+= stride;\
1456 src+= stride;\
1457 }\
1458 }\
1459 \
1460 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1461 const int A=(8-x)*(8-y);\
1462 const int B=( x)*(8-y);\
1463 const int C=(8-x)*( y);\
1464 const int D=( x)*( y);\
1465 int i;\
1466 \
1467 assert(x<8 && y<8 && x>=0 && y>=0);\
1468 \
1469 for(i=0; i<h; i++)\
1470 {\
1471 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1472 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1473 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1474 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1475 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1476 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1477 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1478 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1479 dst+= stride;\
1480 src+= stride;\
1481 }\
1482 }
1483
1484 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1485 #define op_put(a, b) a = (((b) + 32)>>6)
1486
1487 H264_CHROMA_MC(put_ , op_put)
1488 H264_CHROMA_MC(avg_ , op_avg)
1489 #undef op_avg
1490 #undef op_put
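/* As in gmc1_c, the chroma weights sum to a power of two: A+B+C+D == 64 for
   the 1/8-pel offsets, so op_put's (v + 32) >> 6 is a rounded bilinear
   sample and op_avg halves the sum of that sample with the prediction
   already in dst. E.g. x == y == 4 makes all four weights 16, a plain
   2x2 mean. */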
1491
1492 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1493 const int A=(8-x)*(8-y);
1494 const int B=( x)*(8-y);
1495 const int C=(8-x)*( y);
1496 const int D=( x)*( y);
1497 int i;
1498
1499 assert(x<8 && y<8 && x>=0 && y>=0);
1500
1501 for(i=0; i<h; i++)
1502 {
1503 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1504 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1505 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1506 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1507 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1508 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1509 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1510 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1511 dst+= stride;
1512 src+= stride;
1513 }
1514 }
1515
1516 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1517 {
1518 int i;
1519 for(i=0; i<h; i++)
1520 {
1521 ST16(dst , LD16(src ));
1522 dst+=dstStride;
1523 src+=srcStride;
1524 }
1525 }
1526
1527 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1528 {
1529 int i;
1530 for(i=0; i<h; i++)
1531 {
1532 ST32(dst , LD32(src ));
1533 dst+=dstStride;
1534 src+=srcStride;
1535 }
1536 }
1537
1538 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1539 {
1540 int i;
1541 for(i=0; i<h; i++)
1542 {
1543 ST32(dst , LD32(src ));
1544 ST32(dst+4 , LD32(src+4 ));
1545 dst+=dstStride;
1546 src+=srcStride;
1547 }
1548 }
1549
1550 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1551 {
1552 int i;
1553 for(i=0; i<h; i++)
1554 {
1555 ST32(dst , LD32(src ));
1556 ST32(dst+4 , LD32(src+4 ));
1557 ST32(dst+8 , LD32(src+8 ));
1558 ST32(dst+12, LD32(src+12));
1559 dst+=dstStride;
1560 src+=srcStride;
1561 }
1562 }
1563
1564 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1565 {
1566 int i;
1567 for(i=0; i<h; i++)
1568 {
1569 ST32(dst , LD32(src ));
1570 ST32(dst+4 , LD32(src+4 ));
1571 ST32(dst+8 , LD32(src+8 ));
1572 ST32(dst+12, LD32(src+12));
1573 dst[16]= src[16];
1574 dst+=dstStride;
1575 src+=srcStride;
1576 }
1577 }
1578
1579 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1580 {
1581 int i;
1582 for(i=0; i<h; i++)
1583 {
1584 ST32(dst , LD32(src ));
1585 ST32(dst+4 , LD32(src+4 ));
1586 dst[8]= src[8];
1587 dst+=dstStride;
1588 src+=srcStride;
1589 }
1590 }
1591
1592
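/* The MPEG-4 qpel half-pel filters below apply the 8-tap kernel
   (-1, 3, -6, 20, 20, -6, 3, -1)/32, mirroring taps at the block edges;
   the OP/RND macros (defined after this macro in the full file) round,
   shift by 5 and clip the result through cm[]. Sanity check on a flat row
   of 100s: (100+100)*(20 - 6 + 3 - 1) == 3200 and (3200 + 16) >> 5 == 100. */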
1593 #define QPEL_MC(r, OPNAME, RND, OP) \
1594 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1595 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1596 int i;\
1597 for(i=0; i<h; i++)\
1598 {\
1599 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1600 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1601 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1602 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1603 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1604 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1605 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1606 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1607 dst+=dstStride;\
1608 src+=srcStride;\
1609 }\
1610 }\
1611 \
1612 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1613 const int w=8;\
1614 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1615 int i;\
1616 for(i=0; i<w; i++)\
1617 {\
1618 const int src0= src[0*srcStride];\
1619 const int src1= src[1*srcStride];\
1620 const int src2= src[2*srcStride];\
1621 const int src3= src[3*srcStride];\
1622 const int src4= src[4*srcStride];\
1623 const int src5= src[5*srcStride];\
1624 const int src6= src[6*srcStride];\
1625 const int src7= src[7*srcStride];\
1626 const int src8= src[8*srcStride];\
1627 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1628 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1629 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1630 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1631 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1632 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1633 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1634 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1635 dst++;\
1636 src++;\
1637 }\
1638 }\
1639 \
1640 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1641 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1642 int i;\
1643 \
1644 for(i=0; i<h; i++)\
1645 {\
1646 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1647 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1648 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1649 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1650 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1651 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1652 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1653 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1654 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1655 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1656 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1657 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1658 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1659 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1660 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1661 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1662 dst+=dstStride;\
1663 src+=srcStride;\
1664 }\
1665 }\
1666 \
1667 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1668 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1669 int i;\
1670 const int w=16;\
1671 for(i=0; i<w; i++)\
1672 {\
1673 const int src0= src[0*srcStride];\
1674 const int src1= src[1*srcStride];\
1675 const int src2= src[2*srcStride];\
1676 const int src3= src[3*srcStride];\
1677 const int src4= src[4*srcStride];\
1678 const int src5= src[5*srcStride];\
1679 const int src6= src[6*srcStride];\
1680 const int src7= src[7*srcStride];\
1681 const int src8= src[8*srcStride];\
1682 const int src9= src[9*srcStride];\
1683 const int src10= src[10*srcStride];\
1684 const int src11= src[11*srcStride];\
1685 const int src12= src[12*srcStride];\
1686 const int src13= src[13*srcStride];\
1687 const int src14= src[14*srcStride];\
1688 const int src15= src[15*srcStride];\
1689 const int src16= src[16*srcStride];\
1690 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1691 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1692 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1693 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1694 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1695 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1696 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1697 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1698 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1699 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1700 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1701 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1702 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1703 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1704 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1705 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1706 dst++;\
1707 src++;\
1708 }\
1709 }\
1710 \
1711 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1712 OPNAME ## pixels8_c(dst, src, stride, 8);\
1713 }\
1714 \
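/* mcXY naming: X = horizontal, Y = vertical quarter-pel phase (0..3).  */\
/* "full" holds a 9-row copy of the source at stride 16 (one extra row  */\
/* of context for the vertical filter); the half* buffers hold 8x8      */\
/* intermediate planes that pixels8_l2/_l4 average into the prediction. */\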
1715 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1716 uint8_t half[64];\
1717 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1718 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1719 }\
1720 \
1721 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1722 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1723 }\
1724 \
1725 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1726 uint8_t half[64];\
1727 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1728 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1729 }\
1730 \
1731 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t full[16*9];\
1733 uint8_t half[64];\
1734 copy_block9(full, src, 16, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1736 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1737 }\
1738 \
1739 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1740 uint8_t full[16*9];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1743 }\
1744 \
1745 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1746 uint8_t full[16*9];\
1747 uint8_t half[64];\
1748 copy_block9(full, src, 16, stride, 9);\
1749 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1750 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1751 }\
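/* The non-static ff_*_old_c variants keep an older approximation of    */\
/* the diagonal positions (averaging up to four planes via pixels8_l4); */\
/* the static *_c versions below instead blend the h-filtered plane     */\
/* with the source before the vertical filter, and are the ones         */\
/* normally installed in DSPContext.                                    */\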
1752 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1753 uint8_t full[16*9];\
1754 uint8_t halfH[72];\
1755 uint8_t halfV[64];\
1756 uint8_t halfHV[64];\
1757 copy_block9(full, src, 16, stride, 9);\
1758 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1760 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1761 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1762 }\
1763 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1764 uint8_t full[16*9];\
1765 uint8_t halfH[72];\
1766 uint8_t halfHV[64];\
1767 copy_block9(full, src, 16, stride, 9);\
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1769 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1772 }\
1773 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[16*9];\
1775 uint8_t halfH[72];\
1776 uint8_t halfV[64];\
1777 uint8_t halfHV[64];\
1778 copy_block9(full, src, 16, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1781 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1783 }\
1784 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1785 uint8_t full[16*9];\
1786 uint8_t halfH[72];\
1787 uint8_t halfHV[64];\
1788 copy_block9(full, src, 16, stride, 9);\
1789 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1790 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1791 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1792 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1793 }\
1794 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1796 uint8_t halfH[72];\
1797 uint8_t halfV[64];\
1798 uint8_t halfHV[64];\
1799 copy_block9(full, src, 16, stride, 9);\
1800 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1801 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1803 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1804 }\
1805 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1806 uint8_t full[16*9];\
1807 uint8_t halfH[72];\
1808 uint8_t halfHV[64];\
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1812 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1813 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1814 }\
1815 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[16*9];\
1817 uint8_t halfH[72];\
1818 uint8_t halfV[64];\
1819 uint8_t halfHV[64];\
1820 copy_block9(full, src, 16, stride, 9);\
1821 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1823 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1824 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1825 }\
1826 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t full[16*9];\
1828 uint8_t halfH[72];\
1829 uint8_t halfHV[64];\
1830 copy_block9(full, src, 16, stride, 9);\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1835 }\
1836 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t halfH[72];\
1838 uint8_t halfHV[64];\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1842 }\
1843 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t halfH[72];\
1845 uint8_t halfHV[64];\
1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1848 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1849 }\
1850 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[16*9];\
1852 uint8_t halfH[72];\
1853 uint8_t halfV[64];\
1854 uint8_t halfHV[64];\
1855 copy_block9(full, src, 16, stride, 9);\
1856 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1857 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1858 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1859 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1860 }\
1861 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t full[16*9];\
1863 uint8_t halfH[72];\
1864 copy_block9(full, src, 16, stride, 9);\
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1867 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1868 }\
1869 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1870 uint8_t full[16*9];\
1871 uint8_t halfH[72];\
1872 uint8_t halfV[64];\
1873 uint8_t halfHV[64];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1878 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1879 }\
1880 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t full[16*9];\
1882 uint8_t halfH[72];\
1883 copy_block9(full, src, 16, stride, 9);\
1884 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1885 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1886 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1887 }\
1888 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1889 uint8_t halfH[72];\
1890 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1891 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1892 }\
1893 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1894 OPNAME ## pixels16_c(dst, src, stride, 16);\
1895 }\
1896 \
1897 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t half[256];\
1899 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1900 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1901 }\
1902 \
1903 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1904 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1905 }\
1906 \
1907 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t half[256];\
1909 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1910 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1911 }\
1912 \
1913 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[24*17];\
1915 uint8_t half[256];\
1916 copy_block17(full, src, 24, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1918 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1919 }\
1920 \
1921 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1922 uint8_t full[24*17];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1925 }\
1926 \
1927 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[24*17];\
1929 uint8_t half[256];\
1930 copy_block17(full, src, 24, stride, 17);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1932 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1933 }\
1934 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[24*17];\
1936 uint8_t halfH[272];\
1937 uint8_t halfV[256];\
1938 uint8_t halfHV[256];\
1939 copy_block17(full, src, 24, stride, 17);\
1940 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1942 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1943 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1944 }\
1945 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1946 uint8_t full[24*17];\
1947 uint8_t halfH[272];\
1948 uint8_t halfHV[256];\
1949 copy_block17(full, src, 24, stride, 17);\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1951 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1954 }\
1955 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[24*17];\
1957 uint8_t halfH[272];\
1958 uint8_t halfV[256];\
1959 uint8_t halfHV[256];\
1960 copy_block17(full, src, 24, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1963 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1965 }\
1966 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[24*17];\
1968 uint8_t halfH[272];\
1969 uint8_t halfHV[256];\
1970 copy_block17(full, src, 24, stride, 17);\
1971 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1972 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1973 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1974 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1975 }\
1976 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1977 uint8_t full[24*17];\
1978 uint8_t halfH[272];\
1979 uint8_t halfV[256];\
1980 uint8_t halfHV[256];\
1981 copy_block17(full, src, 24, stride, 17);\
1982 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1983 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1985 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1986 }\
1987 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1988 uint8_t full[24*17];\
1989 uint8_t halfH[272];\
1990 uint8_t halfHV[256];\
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1994 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1995 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1996 }\
1997 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1998 uint8_t full[24*17];\
1999 uint8_t halfH[272];\
2000 uint8_t halfV[256];\
2001 uint8_t halfHV[256];\
2002 copy_block17(full, src, 24, stride, 17);\
2003 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2005 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2006 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2007 }\
2008 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2009 uint8_t full[24*17];\
2010 uint8_t halfH[272];\
2011 uint8_t halfHV[256];\
2012 copy_block17(full, src, 24, stride, 17);\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2017 }\
2018 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2019 uint8_t halfH[272];\
2020 uint8_t halfHV[256];\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2024 }\
2025 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t halfH[272];\
2027 uint8_t halfHV[256];\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2031 }\
2032 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2033 uint8_t full[24*17];\
2034 uint8_t halfH[272];\
2035 uint8_t halfV[256];\
2036 uint8_t halfHV[256];\
2037 copy_block17(full, src, 24, stride, 17);\
2038 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2039 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2040 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2041 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2042 }\
2043 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t full[24*17];\
2045 uint8_t halfH[272];\
2046 copy_block17(full, src, 24, stride, 17);\
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2049 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2050 }\
2051 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2052 uint8_t full[24*17];\
2053 uint8_t halfH[272];\
2054 uint8_t halfV[256];\
2055 uint8_t halfHV[256];\
2056 copy_block17(full, src, 24, stride, 17);\
2057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2060 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2061 }\
2062 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2063 uint8_t full[24*17];\
2064 uint8_t halfH[272];\
2065 copy_block17(full, src, 24, stride, 17);\
2066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2067 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2068 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2069 }\
2070 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2071 uint8_t halfH[272];\
2072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2073 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2074 }
2075
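/* The filtered value b arrives scaled by 32 (the qpel taps 20,-6,3,-1
 * sum to 32), so these macros normalize with >>5 and clamp through
 * cropTbl; +16 rounds to nearest, +15 implements the no-rounding mode,
 * and the avg variants round-average with the pixel already in dst. */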
2076 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2077 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2078 #define op_put(a, b) a = cm[((b) + 16)>>5]
2079 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2080
2081 QPEL_MC(0, put_ , _ , op_put)
2082 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2083 QPEL_MC(0, avg_ , _ , op_avg)
2084 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2085 #undef op_avg
2086 #undef op_avg_no_rnd
2087 #undef op_put
2088 #undef op_put_no_rnd
2089
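/* H.264 half-pel interpolation: 6-tap kernel (1,-5,20,20,-5,1), taps
 * summing to 32.  The h/v filters clamp each output directly via OP
 * (+16 >> 5).  The hv filters run the horizontal pass into a 16-bit
 * tmp[] without clamping (hence src -= 2*srcStride and h+5 rows of
 * context), then filter tmp vertically and normalize once with OP2
 * (+512 >> 10, since two passes scale by 32*32 = 1024).  Sanity check:
 * a flat area of value v becomes 32*v after one pass, 1024*v after
 * both, and shifts back to v. */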
2090 #if 1
2091 #define H264_LOWPASS(OPNAME, OP, OP2) \
2092 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2093 const int h=2;\
2094 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2095 int i;\
2096 for(i=0; i<h; i++)\
2097 {\
2098 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2099 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2100 dst+=dstStride;\
2101 src+=srcStride;\
2102 }\
2103 }\
2104 \
2105 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2106 const int w=2;\
2107 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2108 int i;\
2109 for(i=0; i<w; i++)\
2110 {\
2111 const int srcB= src[-2*srcStride];\
2112 const int srcA= src[-1*srcStride];\
2113 const int src0= src[0 *srcStride];\
2114 const int src1= src[1 *srcStride];\
2115 const int src2= src[2 *srcStride];\
2116 const int src3= src[3 *srcStride];\
2117 const int src4= src[4 *srcStride];\
2118 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2119 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2120 dst++;\
2121 src++;\
2122 }\
2123 }\
2124 \
2125 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2126 const int h=2;\
2127 const int w=2;\
2128 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2129 int i;\
2130 src -= 2*srcStride;\
2131 for(i=0; i<h+5; i++)\
2132 {\
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135 tmp+=tmpStride;\
2136 src+=srcStride;\
2137 }\
2138 tmp -= tmpStride*(h+5-2);\
2139 for(i=0; i<w; i++)\
2140 {\
2141 const int tmpB= tmp[-2*tmpStride];\
2142 const int tmpA= tmp[-1*tmpStride];\
2143 const int tmp0= tmp[0 *tmpStride];\
2144 const int tmp1= tmp[1 *tmpStride];\
2145 const int tmp2= tmp[2 *tmpStride];\
2146 const int tmp3= tmp[3 *tmpStride];\
2147 const int tmp4= tmp[4 *tmpStride];\
2148 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2149 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2150 dst++;\
2151 tmp++;\
2152 }\
2153 }\
2154 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2155 const int h=4;\
2156 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2157 int i;\
2158 for(i=0; i<h; i++)\
2159 {\
2160 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2161 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2162 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2163 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2164 dst+=dstStride;\
2165 src+=srcStride;\
2166 }\
2167 }\
2168 \
2169 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2170 const int w=4;\
2171 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2172 int i;\
2173 for(i=0; i<w; i++)\
2174 {\
2175 const int srcB= src[-2*srcStride];\
2176 const int srcA= src[-1*srcStride];\
2177 const int src0= src[0 *srcStride];\
2178 const int src1= src[1 *srcStride];\
2179 const int src2= src[2 *srcStride];\
2180 const int src3= src[3 *srcStride];\
2181 const int src4= src[4 *srcStride];\
2182 const int src5= src[5 *srcStride];\
2183 const int src6= src[6 *srcStride];\
2184 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2185 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2186 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2187 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2188 dst++;\
2189 src++;\
2190 }\
2191 }\
2192 \
2193 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2194 const int h=4;\
2195 const int w=4;\
2196 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2197 int i;\
2198 src -= 2*srcStride;\
2199 for(i=0; i<h+5; i++)\
2200 {\
2201 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2202 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2203 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2204 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2205 tmp+=tmpStride;\
2206 src+=srcStride;\
2207 }\
2208 tmp -= tmpStride*(h+5-2);\
2209 for(i=0; i<w; i++)\
2210 {\
2211 const int tmpB= tmp[-2*tmpStride];\
2212 const int tmpA= tmp[-1*tmpStride];\
2213 const int tmp0= tmp[0 *tmpStride];\
2214 const int tmp1= tmp[1 *tmpStride];\
2215 const int tmp2= tmp[2 *tmpStride];\
2216 const int tmp3= tmp[3 *tmpStride];\
2217 const int tmp4= tmp[4 *tmpStride];\
2218 const int tmp5= tmp[5 *tmpStride];\
2219 const int tmp6= tmp[6 *tmpStride];\
2220 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2221 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2222 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2223 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2224 dst++;\
2225 tmp++;\
2226 }\
2227 }\
2228 \
2229 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230 const int h=8;\
2231 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2232 int i;\
2233 for(i=0; i<h; i++)\
2234 {\
2235 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2236 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2237 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2238 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2239 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2240 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2241 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2242 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2243 dst+=dstStride;\
2244 src+=srcStride;\
2245 }\
2246 }\
2247 \
2248 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2249 const int w=8;\
2250 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2251 int i;\
2252 for(i=0; i<w; i++)\
2253 {\
2254 const int srcB= src[-2*srcStride];\
2255 const int srcA= src[-1*srcStride];\
2256 const int src0= src[0 *srcStride];\
2257 const int src1= src[1 *srcStride];\
2258 const int src2= src[2 *srcStride];\
2259 const int src3= src[3 *srcStride];\
2260 const int src4= src[4 *srcStride];\
2261 const int src5= src[5 *srcStride];\
2262 const int src6= src[6 *srcStride];\
2263 const int src7= src[7 *srcStride];\
2264 const int src8= src[8 *srcStride];\
2265 const int src9= src[9 *srcStride];\
2266 const int src10=src[10*srcStride];\
2267 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2268 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2269 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2270 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2271 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2272 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2273 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2274 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2275 dst++;\
2276 src++;\
2277 }\
2278 }\
2279 \
2280 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281 const int h=8;\
2282 const int w=8;\
2283 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2284 int i;\
2285 src -= 2*srcStride;\
2286 for(i=0; i<h+5; i++)\
2287 {\
2288 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2289 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2290 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2291 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2292 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2293 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2294 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2295 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2296 tmp+=tmpStride;\
2297 src+=srcStride;\
2298 }\
2299 tmp -= tmpStride*(h+5-2);\
2300 for(i=0; i<w; i++)\
2301 {\
2302 const int tmpB= tmp[-2*tmpStride];\
2303 const int tmpA= tmp[-1*tmpStride];\
2304 const int tmp0= tmp[0 *tmpStride];\
2305 const int tmp1= tmp[1 *tmpStride];\
2306 const int tmp2= tmp[2 *tmpStride];\
2307 const int tmp3= tmp[3 *tmpStride];\
2308 const int tmp4= tmp[4 *tmpStride];\
2309 const int tmp5= tmp[5 *tmpStride];\
2310 const int tmp6= tmp[6 *tmpStride];\
2311 const int tmp7= tmp[7 *tmpStride];\
2312 const int tmp8= tmp[8 *tmpStride];\
2313 const int tmp9= tmp[9 *tmpStride];\
2314 const int tmp10=tmp[10*tmpStride];\
2315 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2316 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2317 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2318 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2319 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2320 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2321 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2322 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2323 dst++;\
2324 tmp++;\
2325 }\
2326 }\
2327 \
2328 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2329 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2330 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2331 src += 8*srcStride;\
2332 dst += 8*dstStride;\
2333 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2334 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2335 }\
2336 \
2337 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2338 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2339 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2340 src += 8*srcStride;\
2341 dst += 8*dstStride;\
2342 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2343 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2344 }\
2345 \
2346 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2347 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2348 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2349 src += 8*srcStride;\
2350 dst += 8*dstStride;\
2351 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2352 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2353 }\
2354
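/* H264_MC instantiates all 16 quarter-pel positions for one block
 * size.  mcXY: X = horizontal, Y = vertical quarter-pel phase.  Pure
 * half-pel positions call a lowpass filter directly; quarter-pel
 * positions average two neighbouring full/half-pel planes with
 * pixels_l2 (e.g. mc10 = average of src and the h-filtered plane,
 * mc11 = average of the h- and v-filtered planes). */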
2355 #define H264_MC(OPNAME, SIZE) \
2356 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2357 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2358 }\
2359 \
2360 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2361 uint8_t half[SIZE*SIZE];\
2362 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2363 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2364 }\
2365 \
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2367 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2368 }\
2369 \
2370 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2371 uint8_t half[SIZE*SIZE];\
2372 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2374 }\
2375 \
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2377 uint8_t full[SIZE*(SIZE+5)];\
2378 uint8_t * const full_mid= full + SIZE*2;\
2379 uint8_t half[SIZE*SIZE];\
2380 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2381 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2382 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2383 }\
2384 \
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2386 uint8_t full[SIZE*(SIZE+5)];\
2387 uint8_t * const full_mid= full + SIZE*2;\
2388 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2389 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2390 }\
2391 \
2392 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2393 uint8_t full[SIZE*(SIZE+5)];\
2394 uint8_t * const full_mid= full + SIZE*2;\
2395 uint8_t half[SIZE*SIZE];\
2396 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2397 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2398 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2399 }\
2400 \
2401 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2402 uint8_t full[SIZE*(SIZE+5)];\
2403 uint8_t * const full_mid= full + SIZE*2;\
2404 uint8_t halfH[SIZE*SIZE];\
2405 uint8_t halfV[SIZE*SIZE];\
2406 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2407 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2408 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2410 }\
2411 \
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t full[SIZE*(SIZE+5)];\
2414 uint8_t * const full_mid= full + SIZE*2;\
2415 uint8_t halfH[SIZE*SIZE];\
2416 uint8_t halfV[SIZE*SIZE];\
2417 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2420 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2421 }\
2422 \
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2424 uint8_t full[SIZE*(SIZE+5)];\
2425 uint8_t * const full_mid= full + SIZE*2;\
2426 uint8_t halfH[SIZE*SIZE];\
2427 uint8_t halfV[SIZE*SIZE];\
2428 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2429 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2430 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2432 }\
2433 \
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2435 uint8_t full[SIZE*(SIZE+5)];\
2436 uint8_t * const full_mid= full + SIZE*2;\
2437 uint8_t halfH[SIZE*SIZE];\
2438 uint8_t halfV[SIZE*SIZE];\
2439 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2440 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2441 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2442 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2443 }\
2444 \
2445 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2446 int16_t tmp[SIZE*(SIZE+5)];\
2447 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2448 }\
2449 \
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2451 int16_t tmp[SIZE*(SIZE+5)];\
2452 uint8_t halfH[SIZE*SIZE];\
2453 uint8_t halfHV[SIZE*SIZE];\
2454 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2455 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2456 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2457 }\
2458 \
2459 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2460 int16_t tmp[SIZE*(SIZE+5)];\
2461 uint8_t halfH[SIZE*SIZE];\
2462 uint8_t halfHV[SIZE*SIZE];\
2463 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2464 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2465 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2466 }\
2467 \
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2469 uint8_t full[SIZE*(SIZE+5)];\
2470 uint8_t * const full_mid= full + SIZE*2;\
2471 int16_t tmp[SIZE*(SIZE+5)];\
2472 uint8_t halfV[SIZE*SIZE];\
2473 uint8_t halfHV[SIZE*SIZE];\
2474 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2475 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2476 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2477 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2478 }\
2479 \
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2481 uint8_t full[SIZE*(SIZE+5)];\
2482 uint8_t * const full_mid= full + SIZE*2;\
2483 int16_t tmp[SIZE*(SIZE+5)];\
2484 uint8_t halfV[SIZE*SIZE];\
2485 uint8_t halfHV[SIZE*SIZE];\
2486 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2487 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2488 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2489 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2490 }\
2491
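/* op/OP normalize a single filter pass (+16 >> 5); op2/OP2 normalize
 * the cascaded hv pass (+512 >> 10).  Both clamp through cropTbl, and
 * the avg variants round-average with the pixel already in dst.
 * (Only the put variant of the 2x2 size is instantiated below.) */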
2492 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2493 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2494 #define op_put(a, b) a = cm[((b) + 16)>>5]
2495 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2496 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2497
2498 H264_LOWPASS(put_ , op_put, op2_put)
2499 H264_LOWPASS(avg_ , op_avg, op2_avg)
2500 H264_MC(put_, 2)
2501 H264_MC(put_, 4)
2502 H264_MC(put_, 8)
2503 H264_MC(put_, 16)
2504 H264_MC(avg_, 4)
2505 H264_MC(avg_, 8)
2506 H264_MC(avg_, 16)
2507
2508 #undef op_avg
2509 #undef op_put
2510 #undef op2_avg
2511 #undef op2_put
2512 #endif
2513
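/* H.264 explicit weighted prediction.  weight_* scales each pixel by
 * one weight, with the rounding term and the offset pre-folded into
 * "offset" (hence the shift and the added 1<<(log2_denom-1) below).
 * biweight_* blends two references; ((offset + 1) | 1) << log2_denom
 * in effect makes the combined constant odd so that the final
 * >> (log2_denom+1) rounds correctly. */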
2514 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2515 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2516 #define H264_WEIGHT(W,H) \
2517 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2518 int y; \
2519 offset <<= log2_denom; \
2520 if(log2_denom) offset += 1<<(log2_denom-1); \
2521 for(y=0; y<H; y++, block += stride){ \
2522 op_scale1(0); \
2523 op_scale1(1); \
2524 if(W==2) continue; \
2525 op_scale1(2); \
2526 op_scale1(3); \
2527 if(W==4) continue; \
2528 op_scale1(4); \
2529 op_scale1(5); \
2530 op_scale1(6); \
2531 op_scale1(7); \
2532 if(W==8) continue; \
2533 op_scale1(8); \
2534 op_scale1(9); \
2535 op_scale1(10); \
2536 op_scale1(11); \
2537 op_scale1(12); \
2538 op_scale1(13); \
2539 op_scale1(14); \
2540 op_scale1(15); \
2541 } \
2542 } \
2543 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2544 int y; \
2545 offset = ((offset + 1) | 1) << log2_denom; \
2546 for(y=0; y<H; y++, dst += stride, src += stride){ \
2547 op_scale2(0); \
2548 op_scale2(1); \
2549 if(W==2) continue; \
2550 op_scale2(2); \
2551 op_scale2(3); \
2552 if(W==4) continue; \
2553 op_scale2(4); \
2554 op_scale2(5); \
2555 op_scale2(6); \
2556 op_scale2(7); \
2557 if(W==8) continue; \
2558 op_scale2(8); \
2559 op_scale2(9); \
2560 op_scale2(10); \
2561 op_scale2(11); \
2562 op_scale2(12); \
2563 op_scale2(13); \
2564 op_scale2(14); \
2565 op_scale2(15); \
2566 } \
2567 }
2568
2569 H264_WEIGHT(16,16)
2570 H264_WEIGHT(16,8)
2571 H264_WEIGHT(8,16)
2572 H264_WEIGHT(8,8)
2573 H264_WEIGHT(8,4)
2574 H264_WEIGHT(4,8)
2575 H264_WEIGHT(4,4)
2576 H264_WEIGHT(4,2)
2577 H264_WEIGHT(2,4)
2578 H264_WEIGHT(2,2)
2579
2580 #undef op_scale1
2581 #undef op_scale2
2582 #undef H264_WEIGHT
2583
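/* WMV2 ("mspel") half-pel interpolation: a 4-tap (-1,9,9,-1)/16
 * filter, rounded with +8 before the >>4 and clamped through cropTbl. */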
2584 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2585 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2586 int i;
2587
2588 for(i=0; i<h; i++){
2589 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2590 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2591 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2592 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2593 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2594 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2595 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2596 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2597 dst+=dstStride;
2598 src+=srcStride;
2599 }
2600 }
2601
2602 #ifdef CONFIG_CAVS_DECODER
2603 /* Chinese AVS (CAVS) specific */
2604 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2605
2606 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2607 put_pixels8_c(dst, src, stride, 8);
2608 }
2609 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2610 avg_pixels8_c(dst, src, stride, 8);
2611 }
2612 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2613 put_pixels16_c(dst, src, stride, 16);
2614 }
2615 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2616 avg_pixels16_c(dst, src, stride, 16);
2617 }
2618 #endif /* CONFIG_CAVS_DECODER */
2619
2620 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2621 /* VC-1 specific */
2622 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2623
2624 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2625 put_pixels8_c(dst, src, stride, 8);
2626 }
2627 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2628
2629 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2630 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2631 int i;
2632
2633 for(i=0; i<w; i++){
2634 const int src_1= src[ -srcStride];
2635 const int src0 = src[0 ];
2636 const int src1 = src[ srcStride];
2637 const int src2 = src[2*srcStride];
2638 const int src3 = src[3*srcStride];
2639 const int src4 = src[4*srcStride];
2640 const int src5 = src[5*srcStride];
2641 const int src6 = src[6*srcStride];
2642 const int src7 = src[7*srcStride];
2643 const int src8 = src[8*srcStride];
2644 const int src9 = src[9*srcStride];
2645 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2646 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2647 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2648 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2649 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2650 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2651 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2652 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2653 src++;
2654 dst++;
2655 }
2656 }
2657
2658 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2659 put_pixels8_c(dst, src, stride, 8);
2660 }
2661
2662 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2663 uint8_t half[64];
2664 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2665 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2666 }
2667
2668 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2669 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2670 }
2671
2672 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2673 uint8_t half[64];
2674 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2675 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2676 }
2677
2678 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2679 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2680 }
2681
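/* 2-D mspel cases: the horizontal pass covers 11 rows (one above the
 * block, eight inside, two below), so halfH is 8x11 and the vertical
 * pass reads from halfH+8, i.e. the block's first row, with the
 * padding rows still reachable above and below it. */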
2682 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2683 uint8_t halfH[88];
2684 uint8_t halfV[64];
2685 uint8_t halfHV[64];
2686 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2687 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2688 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2689 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2690 }
2691 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2692 uint8_t halfH[88];
2693 uint8_t halfV[64];
2694 uint8_t halfHV[64];
2695 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2696 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2697 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2698 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2699 }
2700 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2701 uint8_t halfH[88];
2702 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2703 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2704 }
2705
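/* H.263 Annex J deblocking.  d estimates the step across the edge
 * from a 4-pixel column; the piecewise-linear ramp below maps it to a
 * correction d1 that peaks at "strength" and decays to 0 for large d,
 * so genuine image edges are left untouched.  The inner pixels p1/p2
 * get the full correction, the outer pixels p0/p3 a secondary
 * correction d2 capped at half of |d1|. */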
2706 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2707 int x;
2708 const int strength= ff_h263_loop_filter_strength[qscale];
2709
2710 for(x=0; x<8; x++){
2711 int d1, d2, ad1;
2712 int p0= src[x-2*stride];
2713 int p1= src[x-1*stride];
2714 int p2= src[x+0*stride];
2715 int p3= src[x+1*stride];
2716 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2717
2718 if (d<-2*strength) d1= 0;
2719 else if(d<- strength) d1=-2*strength - d;
2720 else if(d< strength) d1= d;
2721 else if(d< 2*strength) d1= 2*strength - d;
2722 else d1= 0;
2723
2724 p1 += d1;
2725 p2 -= d1;
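        /* branchless clamp to 0..255: after +/-d1 the values stay in
           roughly -256..511, so bit 8 flags out-of-range; negatives
           become 0, overflows become ~0, which stores as 255 */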
2726 if(p1&256) p1= ~(p1>>31);
2727 if(p2&256) p2= ~(p2>>31);
2728
2729 src[x-1*stride] = p1;
2730 src[x+0*stride] = p2;
2731
2732 ad1= FFABS(d1)>>1;
2733
2734 d2= clip((p0-p3)/4, -ad1, ad1);
2735
2736 src[x-2*stride] = p0 - d2;
2737 src[x+ stride] = p3 + d2;
2738 }
2739 }
2740
2741 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2742 int y;
2743 const int strength= ff_h263_loop_filter_strength[qscale];
2744
2745 for(y=0; y<8; y++){
2746 int d1, d2, ad1;
2747 int p0= src[y*stride-2];
2748 int p1= src[y*stride-1];
2749 int p2= src[y*stride+0];
2750 int p3= src[y*stride+1];
2751 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2752
2753 if (d<-2*strength) d1= 0;
2754 else if(d<- strength) d1=-2*strength - d;
2755 else if(d< strength) d1= d;
2756 else if(d< 2*strength) d1= 2*strength - d;
2757 else d1= 0;
2758
2759 p1 += d1;
2760 p2 -= d1;
2761 if(p1&256) p1= ~(p1>>31);
2762 if(p2&256) p2= ~(p2>>31);
2763
2764 src[y*stride-1] = p1;
2765 src[y*stride+0] = p2;
2766
2767 ad1= FFABS(d1)>>1;
2768
2769 d2= clip((p0-p3)/4, -ad1, ad1);
2770
2771 src[y*stride-2] = p0 - d2;
2772 src[y*stride+1] = p3 + d2;
2773 }
2774 }
2775
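/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of the 8x8
 * block.  The vertical pass fills temp[] with values scaled by 4
 * (border rows are copied at the same scale), and the horizontal pass
 * shifts the scale back out: >>2 for the copied borders, >>4 for the
 * fully filtered interior samples. */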
2776 static void h261_loop_filter_c(uint8_t *src, int stride){
2777 int x,y,xy,yz;
2778 int temp[64];
2779
2780 for(x=0; x<8; x++){
2781 temp[x ] = 4*src[x ];
2782 temp[x + 7*8] = 4*src[x + 7*stride];
2783 }
2784 for(y=1; y<7; y++){
2785 for(x=0; x<8; x++){
2786 xy = y * stride + x;
2787 yz = y * 8 + x;
2788 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2789 }
2790 }
2791
2792 for(y=0; y<8; y++){
2793 src[ y*stride] = (temp[ y*8] + 2)>>2;
2794 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2795 for(x=1; x<7; x++){
2796 xy = y * stride + x;
2797 yz = y * 8 + x;
2798 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2799 }
2800 }
2801 }
2802
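/* H.264 luma deblock (normal, non-intra edges).  tc0[] holds one
 * clipping threshold per 4-pixel segment; tc0[i] < 0 skips the
 * segment.  alpha gates the step across the edge, beta the activity
 * on each side; if p2 (resp. q2) is also flat, p1 (resp. q1) is
 * adjusted too and tc grows by one for the final p0/q0 delta. */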
2803 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2804 {
2805 int i, d;
2806 for( i = 0; i < 4; i++ ) {
2807 if( tc0[i] < 0 ) {
2808 pix += 4*ystride;
2809 continue;
2810 }
2811 for( d = 0; d < 4; d++ ) {
2812 const int p0 = pix[-1*xstride];
2813 const int p1 = pix[-2*xstride];
2814 const int p2 = pix[-3*xstride];
2815 const int q0 = pix[0];
2816 const int q1 = pix[1*xstride];
2817 const int q2 = pix[2*xstride];
2818
2819 if( FFABS( p0 - q0 ) < alpha &&
2820 FFABS( p1 - p0 ) < beta &&
2821 FFABS( q1 - q0 ) < beta ) {
2822
2823 int tc = tc0[i];
2824 int i_delta;
2825
2826 if( FFABS( p2 - p0 ) < beta ) {
2827 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2828 tc++;
2829 }
2830 if( FFABS( q2 - q0 ) < beta ) {
2831 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2832 tc++;
2833 }
2834
2835 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2836 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */
2837 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
2838 }
2839 pix += ystride;
2840 }
2841 }
2842 }
2843 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2844 {
2845 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2846 }
2847 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2848 {
2849 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2850 }
2851
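/* Chroma deblock: same alpha/beta gating as luma, but only two pixels
 * per edge column are inspected and only p0/q0 are corrected. */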
2852 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2853 {
2854 int i, d;
2855 for( i = 0; i < 4; i++ ) {
2856 const int tc = tc0[i];
2857 if( tc <= 0 ) {
2858 pix += 2*ystride;
2859 continue;
2860 }
2861 for( d = 0; d < 2; d++ ) {
2862 const int p0 = pix[-1*xstride];
2863 const int p1 = pix[-2*xstride];
2864 const int q0 = pix[0];
2865 const int q1 = pix[1*xstride];
2866
2867 if( FFABS( p0 - q0 ) < alpha &&
2868 FFABS( p1 - p0 ) < beta &&
2869 FFABS( q1 - q0 ) < beta ) {
2870
2871 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2872
2873 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */
2874 pix[0] = clip_uint8( q0 - delta ); /* q0' */
2875 }
2876 pix += ystride;
2877 }
2878 }
2879 }
2880 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2881 {
2882 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2883 }
2884 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2885 {
2886 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
2887 }
2888
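/* Intra chroma edges carry no tc thresholds; where the alpha/beta
 * tests pass, p0 and q0 are simply replaced by 3-tap weighted
 * averages across the edge. */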
2889 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2890 {
2891 int d;
2892 for( d = 0; d < 8; d++ ) {
2893 const int p0 = pix[-1*xstride];
2894 const int p1 = pix[-2*xstride];
2895 const int q0 = pix[0];
2896 const int q1 = pix[1*xstride];
2897
2898 if( FFABS( p0 - q0 ) < alpha &&
2899 FFABS( p1 - p0 ) < beta &&
2900 FFABS( q1 - q0 ) < beta ) {
2901
2902 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2903 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
2904 }
2905 pix += ystride;
2906 }
2907 }
2908 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2909 {
2910 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2911 }
2912 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2913 {
2914 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
2915 }
2916
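/* SAD comparators for motion estimation.  pix_abs16/8 compare at
 * full-pel; the _x2/_y2/_xy2 variants compare against a half-pel
 * reference interpolated on the fly with the avg2/avg4 rounding
 * averages defined earlier in this file. */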
2917 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2918 {
2919 int s, i;
2920
2921 s = 0;
2922 for(i=0;i<h;i++) {
2923 s += abs(pix1[0] - pix2[0]);
2924 s += abs(pix1[1] - pix2[1]);
2925 s += abs(pix1[2] - pix2[2]);
2926 s += abs(pix1[3] - pix2[3]);
2927 s += abs(pix1[4] - pix2[4]);
2928 s += abs(pix1[5] - pix2[5]);
2929 s += abs(pix1[6] - pix2[6]);
2930 s += abs(pix1[7] - pix2[7]);
2931 s += abs(pix1[8] - pix2[8]);
2932 s += abs(pix1[9] - pix2[9]);
2933 s += abs(pix1[10] - pix2[10]);
2934 s += abs(pix1[11] - pix2[11]);
2935 s += abs(pix1[12] - pix2[12]);
2936 s += abs(pix1[13] - pix2[13]);
2937 s += abs(pix1[14] - pix2[14]);
2938 s += abs(pix1[15] - pix2[15]);
2939 pix1 += line_size;
2940 pix2 += line_size;
2941 }
2942 return s;
2943 }
2944
2945 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2946 {
2947 int s, i;
2948
2949 s = 0;
2950 for(i=0;i<h;i++) {
2951 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2952 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2953 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2954 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2955 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2956 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2957 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2958 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2959 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2960 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2961 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2962 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2963 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2964 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2965 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2966 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2967 pix1 += line_size;
2968 pix2 += line_size;
2969 }
2970 return s;
2971 }
2972
2973 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2974 {
2975 int s, i;
2976 uint8_t *pix3 = pix2 + line_size;
2977
2978 s = 0;
2979 for(i=0;i<h;i++) {
2980 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2981 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2982 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2983 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2984 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2985 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2986 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2987 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2988 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2989 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2990 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2991 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2992 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2993 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2994 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2995 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2996 pix1 += line_size;
2997 pix2 += line_size;
2998 pix3 += line_size;
2999 }
3000 return s;
3001 }
3002
3003 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3004 {
3005 int s, i;
3006 uint8_t *pix3 = pix2 + line_size;
3007
3008 s = 0;
3009 for(i=0;i<h;i++) {
3010 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3011 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3012 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3013 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3014 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3015 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3016 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3017 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3018 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3019 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3020 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3021 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3022 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3023 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3024 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3025 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3026 pix1 += line_size;
3027 pix2 += line_size;
3028 pix3 += line_size;
3029 }
3030 return s;
3031 }
3032
3033 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3034 {
3035 int s, i;
3036
3037 s = 0;
3038 for(i=0;i<h;i++) {
3039 s += abs(pix1[0] - pix2[0]);
3040 s += abs(pix1[1] - pix2[1]);
3041 s += abs(pix1[2] - pix2[2]);
3042 s += abs(pix1[3] - pix2[3]);
3043 s += abs(pix1[4] - pix2[4]);
3044 s += abs(pix1[5] - pix2[5]);
3045 s += abs(pix1[6] - pix2[6]);
3046 s += abs(pix1[7] - pix2[7]);
3047 pix1 += line_size;
3048 pix2 += line_size;
3049 }
3050 return s;
3051 }
3052
3053 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3054 {
3055 int s, i;
3056
3057 s = 0;
3058 for(i=0;i<h;i++) {
3059 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3060 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3061 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3062 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3063 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3064 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3065 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3066 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3067 pix1 += line_size;
3068 pix2 += line_size;
3069 }
3070 return s;
3071 }
3072
3073 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3074 {
3075 int s, i;
3076 uint8_t *pix3 = pix2 + line_size;
3077
3078 s = 0;
3079 for(i=0;i<h;i++) {
3080 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3081 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3082 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3083 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3084 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3085 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3086 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3087 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3088 pix1 += line_size;
3089 pix2 += line_size;
3090 pix3 += line_size;
3091 }
3092 return s;
3093 }
3094
3095 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3096 {
3097 int s, i;
3098 uint8_t *pix3 = pix2 + line_size;
3099
3100 s = 0;
3101 for(i=0;i<h;i++) {
3102 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3103 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3104 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3105 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3106 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3107 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3108 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3109 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3110 pix1 += line_size;
3111 pix2 += line_size;
3112 pix3 += line_size;
3113 }
3114 return s;
3115 }
3116
3117 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3118 MpegEncContext *c = v;
3119 int score1=0;
3120 int score2=0;
3121 int x,y;
3122
3123 for(y=0; y<h; y++){
3124 for(x=0; x<16; x++){
3125 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3126 }
3127 if(y+1<h){
3128 for(x=0; x<15; x++){
3129 score2+= FFABS( s1[x ] - s1[x +stride]
3130 - s1[x+1] + s1[x+1+stride])
3131 -FFABS( s2[x ] - s2[x +stride]
3132 - s2[x+1] + s2[x+1+stride]);
3133 }
3134 }
3135 s1+= stride;
3136 s2+= stride;
3137 }
3138
3139 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3140 else return score1 + FFABS(score2)*8;
3141 }
3142
3143 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3144 MpegEncContext *c = v;
3145 int score1=0;
3146 int score2=0;
3147 int x,y;
3148
3149 for(y=0; y<h; y++){
3150 for(x=0; x<8; x++){
3151 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3152 }
3153 if(y+1<h){
3154 for(x=0; x<7; x++){
3155 score2+= FFABS( s1[x ] - s1[x +stride]
3156 - s1[x+1] + s1[x+1+stride])
3157 -FFABS( s2[x ] - s2[x +stride]
3158 - s2[x+1] + s2[x+1+stride]);
3159 }
3160 }
3161 s1+= stride;
3162 s2+= stride;
3163 }
3164
3165 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3166 else return score1 + FFABS(score2)*8;
3167 }
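/* NSSE sketch: the two functions above compute a "noise preserving" SSE.
   score1 is the plain SSE; score2 accumulates |d(s1)| - |d(s2)|, where
     d(s,x) = s[x] - s[x+stride] - s[x+1] + s[x+1+stride]
   is the 2x2 second difference, so error that keeps local gradients
   intact is penalised less than error that flattens them. The final
   cost is score1 + |score2| * nsse_weight, with a weight of 8 used when
   no context is available. */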
3168
3169 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3170 int i;
3171 unsigned int sum=0;
3172
3173 for(i=0; i<8*8; i++){
3174 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3175 int w= weight[i];
3176 b>>= RECON_SHIFT;
3177 assert(-512<b && b<512);
3178
3179 sum += (w*b)*(w*b)>>4;
3180 }
3181 return sum>>2;
3182 }
3183
3184 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3185 int i;
3186
3187 for(i=0; i<8*8; i++){
3188 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3189 }
3190 }
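/* Fixed-point sketch: with S = BASIS_SHIFT - RECON_SHIFT, the expression
     (basis[i]*scale + (1 << (S-1))) >> S
   is round-to-nearest scaling by 2^-S; e.g. for S == 4,
   (24 + 8) >> 4 == 2 while (23 + 8) >> 4 == 1. try_8x8basis_c thus
   estimates the weighted squared error that would remain after adding
   scale * basis to the residual, and add_8x8basis_c actually applies
   that update. */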
3191
3192 /**
3193 * Permutes an 8x8 block.
3194 * @param block the block which will be permuted according to the given permutation vector
3195 * @param permutation the permutation vector
3196 * @param last the last non-zero coefficient in scantable order, used to speed up the permutation
3197 * @param scantable the scantable in use; this is only used to speed up the permutation, the block is not
3198 * (inverse) permuted to scantable order!
3199 */
3200 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3201 {
3202 int i;
3203 DCTELEM temp[64];
3204
3205 if(last<=0) return;
3206 //if(permutation[1]==1) return; //FIXME it's OK but not clean and might fail for some permutations
3207
3208 for(i=0; i<=last; i++){
3209 const int j= scantable[i];
3210 temp[j]= block[j];
3211 block[j]=0;
3212 }
3213
3214 for(i=0; i<=last; i++){
3215 const int j= scantable[i];
3216 const int perm_j= permutation[j];
3217 block[perm_j]= temp[j];
3218 }
3219 }
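/* Usage sketch (the call site below is hypothetical; block and
   last_index are assumed): remap a coefficient block into the layout
   the selected IDCT expects, touching only coefficients up to the last
   non-zero one. */
#if 0
ff_block_permute(block, c->idct_permutation, ff_zigzag_direct, last_index);
#endif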
3220
3221 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3222 return 0;
3223 }
3224
3225 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3226 int i;
3227
3228 memset(cmp, 0, sizeof(void*)*5);
3229
3230 for(i=0; i<5; i++){
3231 switch(type&0xFF){
3232 case FF_CMP_SAD:
3233 cmp[i]= c->sad[i];
3234 break;
3235 case FF_CMP_SATD:
3236 cmp[i]= c->hadamard8_diff[i];
3237 break;
3238 case FF_CMP_SSE:
3239 cmp[i]= c->sse[i];
3240 break;
3241 case FF_CMP_DCT:
3242 cmp[i]= c->dct_sad[i];
3243 break;
3244 case FF_CMP_DCT264:
3245 cmp[i]= c->dct264_sad[i];
3246 break;
3247 case FF_CMP_DCTMAX:
3248 cmp[i]= c->dct_max[i];
3249 break;
3250 case FF_CMP_PSNR:
3251 cmp[i]= c->quant_psnr[i];
3252 break;
3253 case FF_CMP_BIT:
3254 cmp[i]= c->bit[i];
3255 break;
3256 case FF_CMP_RD:
3257 cmp[i]= c->rd[i];
3258 break;
3259 case FF_CMP_VSAD:
3260 cmp[i]= c->vsad[i];
3261 break;
3262 case FF_CMP_VSSE:
3263 cmp[i]= c->vsse[i];
3264 break;
3265 case FF_CMP_ZERO:
3266 cmp[i]= zero_cmp;
3267 break;
3268 case FF_CMP_NSSE:
3269 cmp[i]= c->nsse[i];
3270 break;
3271 #ifdef CONFIG_SNOW_ENCODER
3272 case FF_CMP_W53:
3273 cmp[i]= c->w53[i];
3274 break;
3275 case FF_CMP_W97:
3276 cmp[i]= c->w97[i];
3277 break;
3278 #endif
3279 default:
3280 av_log(NULL, AV_LOG_ERROR, "internal error in cmp function selection\n");
3281 }
3282 }
3283 }
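/* Usage sketch (the call sites are assumed, not shown in this file):
   an encoder typically wires its motion-estimation comparators as
   below, after which cmp[0] operates on 16x16 blocks and cmp[1] on 8x8
   blocks; only the low byte of 'type' selects the metric. */
#if 0
ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
ff_set_cmp(&s->dsp, s->dsp.mb_cmp,     s->avctx->mb_cmp);
#endif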
3284
3285 /**
3286 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3287 */
3288 static void clear_blocks_c(DCTELEM *blocks)
3289 {
3290 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3291 }
3292
3293 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3294 int i;
3295 for(i=0; i+7<w; i+=8){
3296 dst[i+0] += src[i+0];
3297 dst[i+1] += src[i+1];
3298 dst[i+2] += src[i+2];
3299 dst[i+3] += src[i+3];
3300 dst[i+4] += src[i+4];
3301 dst[i+5] += src[i+5];
3302 dst[i+6] += src[i+6];
3303 dst[i+7] += src[i+7];
3304 }
3305 for(; i<w; i++)
3306 dst[i+0] += src[i+0];
3307 }
3308
3309 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3310 int i;
3311 for(i=0; i+7<w; i+=8){
3312 dst[i+0] = src1[i+0]-src2[i+0];
3313 dst[i+1] = src1[i+1]-src2[i+1];
3314 dst[i+2] = src1[i+2]-src2[i+2];
3315 dst[i+3] = src1[i+3]-src2[i+3];
3316 dst[i+4] = src1[i+4]-src2[i+4];
3317 dst[i+5] = src1[i+5]-src2[i+5];
3318 dst[i+6] = src1[i+6]-src2[i+6];
3319 dst[i+7] = src1[i+7]-src2[i+7];
3320 }
3321 for(; i<w; i++)
3322 dst[i+0] = src1[i+0]-src2[i+0];
3323 }
3324
3325 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3326 int i;
3327 uint8_t l, lt;
3328
3329 l= *left;
3330 lt= *left_top;
3331
3332 for(i=0; i<w; i++){
3333 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3334 lt= src1[i];
3335 l= src2[i];
3336 dst[i]= l - pred;
3337 }
3338
3339 *left= l;
3340 *left_top= lt;
3341 }
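/* Median-prediction sketch: mid_pred(l, t, l + t - tl) is the lossless
   (HuffYUV-style) predictor, the median of the left sample l, the top
   sample t and the planar gradient l + t - tl. For example with
   l == 10, t == 20, tl == 5 the gradient is 25 and the prediction is
   mid_pred(10, 20, 25) == 20; the stored byte is the current sample
   minus that prediction. */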
3342
3343 #define BUTTERFLY2(o1,o2,i1,i2) \
3344 o1= (i1)+(i2);\
3345 o2= (i1)-(i2);
3346
3347 #define BUTTERFLY1(x,y) \
3348 {\
3349 int a,b;\
3350 a= x;\
3351 b= y;\
3352 x= a+b;\
3353 y= a-b;\
3354 }
3355
3356 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3357
3358 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3359 int i;
3360 int temp[64];
3361 int sum=0;
3362
3363 assert(h==8);
3364
3365 for(i=0; i<8; i++){
3366 //FIXME try pointer walks
3367 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3368 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3369 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3370 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3371
3372 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3373 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3374 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3375 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3376
3377 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3378 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3379 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3380 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3381 }
3382
3383 for(i=0; i<8; i++){
3384 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3385 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3386 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3387 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3388
3389 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3390 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3391 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3392 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3393
3394 sum +=
3395 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3396 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3397 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3398 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3399 }
3400 #if 0
3401 static int maxi=0;
3402 if(sum>maxi){
3403 maxi=sum;
3404 printf("MAX:%d\n", maxi);
3405 }
3406 #endif
3407 return sum;
3408 }
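/* SATD sketch: the row pass and column pass of BUTTERFLY2/BUTTERFLY1
   above form an (unnormalised) 8x8 Walsh-Hadamard transform of
   src - dst, and the score is the sum of absolute transform
   coefficients. The 1-D toy case shows the idea: (a,b) -> (a+b, a-b)
   and |a+b| + |a-b| == 2*FFMAX(FFABS(a), FFABS(b)), so smooth residuals
   score low while oscillating ones score high. */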
3409
3410 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3411 int i;
3412 int temp[64];
3413 int sum=0;
3414
3415 assert(h==8);
3416
3417 for(i=0; i<8; i++){
3418 //FIXME try pointer walks
3419 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3420 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3421 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3422 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3423
3424 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3425 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3426 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3427 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3428
3429 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3430 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3431 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3432 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3433 }
3434
3435 for(i=0; i<8; i++){
3436 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3437 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3438 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3439 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3440
3441 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3442 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3443 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3444 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3445
3446 sum +=
3447 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3448 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3449 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3450 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3451 }
3452
3453 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3454
3455 return sum;
3456 }
3457
3458 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3459 MpegEncContext * const s= (MpegEncContext *)c;
3460 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3461 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3462 int sum=0, i;
3463
3464 assert(h==8);
3465
3466 s->dsp.diff_pixels(temp, src1, src2, stride);
3467 s->dsp.fdct(temp);
3468
3469 for(i=0; i<64; i++)
3470 sum+= FFABS(temp[i]);
3471
3472 return sum;
3473 }
3474
3475 #ifdef CONFIG_GPL
3476 #define DCT8_1D {\
3477 const int s07 = SRC(0) + SRC(7);\
3478 const int s16 = SRC(1) + SRC(6);\
3479 const int s25 = SRC(2) + SRC(5);\
3480 const int s34 = SRC(3) + SRC(4);\
3481 const int a0 = s07 + s34;\
3482 const int a1 = s16 + s25;\
3483 const int a2 = s07 - s34;\
3484 const int a3 = s16 - s25;\
3485 const int d07 = SRC(0) - SRC(7);\
3486 const int d16 = SRC(1) - SRC(6);\
3487 const int d25 = SRC(2) - SRC(5);\
3488 const int d34 = SRC(3) - SRC(4);\
3489 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3490 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3491 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3492 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3493 DST(0, a0 + a1 ) ;\
3494 DST(1, a4 + (a7>>2)) ;\
3495 DST(2, a2 + (a3>>1)) ;\
3496 DST(3, a5 + (a6>>2)) ;\
3497 DST(4, a0 - a1 ) ;\
3498 DST(5, a6 - (a5>>2)) ;\
3499 DST(6, (a2>>1) - a3 ) ;\
3500 DST(7, (a4>>2) - a7 ) ;\
3501 }
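/* DCT8_1D sketch: this is the 1-D step of the H.264 High-profile 8x8
   integer transform; the >>1 and >>2 terms approximate the irrational
   DCT factors with integer lifting, so the 2-D transform performed in
   dct264_sad8x8_c below needs no multiplications. */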
3502
3503 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3504 MpegEncContext * const s= (MpegEncContext *)c;
3505 int16_t dct[8][8];
3506 int i;
3507 int sum=0;
3508
3509 s->dsp.diff_pixels(dct, src1, src2, stride);
3510
3511 #define SRC(x) dct[i][x]
3512 #define DST(x,v) dct[i][x]= v
3513 for( i = 0; i < 8; i++ )
3514 DCT8_1D
3515 #undef SRC
3516 #undef DST
3517
3518 #define SRC(x) dct[x][i]
3519 #define DST(x,v) sum += FFABS(v)
3520 for( i = 0; i < 8; i++ )
3521 DCT8_1D
3522 #undef SRC
3523 #undef DST
3524 return sum;
3525 }
3526 #endif
3527
3528 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3529 MpegEncContext * const s= (MpegEncContext *)c;
3530 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3531 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3532 int sum=0, i;
3533
3534 assert(h==8);
3535
3536 s->dsp.diff_pixels(temp, src1, src2, stride);
3537 s->dsp.fdct(temp);
3538
3539 for(i=0; i<64; i++)
3540 sum= FFMAX(sum, FFABS(temp[i]));
3541
3542 return sum;
3543 }
3544
3545 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3546 MpegEncContext * const s= (MpegEncContext *)c;
3547 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3548 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3549 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3550 int sum=0, i;
3551
3552 assert(h==8);
3553 s->mb_intra=0;
3554
3555 s->dsp.diff_pixels(temp, src1, src2, stride);
3556
3557 memcpy(bak, temp, 64*sizeof(DCTELEM));
3558
3559 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3560 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3561 simple_idct(temp); //FIXME
3562
3563 for(i=0; i<64; i++)
3564 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3565
3566 return sum;
3567 }
3568
3569 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3570 MpegEncContext * const s= (MpegEncContext *)c;
3571 const uint8_t *scantable= s->intra_scantable.permutated;
3572 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3573 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3574 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3575 uint8_t * const bak= (uint8_t*)aligned_bak;
3576 int i, last, run, bits, level, distortion, start_i;
3577 const int esc_length= s->ac_esc_length;
3578 uint8_t * length;
3579 uint8_t * last_length;
3580
3581 assert(h==8);
3582
3583 for(i=0; i<8; i++){
3584 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3585 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3586 }
3587
3588 s->dsp.diff_pixels(temp, src1, src2, stride);
3589
3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3591
3592 bits=0;
3593
3594 if (s->mb_intra) {
3595 start_i = 1;
3596 length = s->intra_ac_vlc_length;
3597 last_length= s->intra_ac_vlc_last_length;
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3599 } else {
3600 start_i = 0;
3601 length = s->inter_ac_vlc_length;
3602 last_length= s->inter_ac_vlc_last_length;
3603 }
3604
3605 if(last>=start_i){
3606 run=0;
3607 for(i=start_i; i<last; i++){
3608 int j= scantable[i];
3609 level= temp[j];
3610
3611 if(level){
3612 level+=64;
3613 if((level&(~127)) == 0){
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3615 }else
3616 bits+= esc_length;
3617 run=0;
3618 }else
3619 run++;
3620 }
3621 i= scantable[last];
3622
3623 level= temp[i] + 64;
3624
3625 assert(level - 64);
3626
3627 if((level&(~127)) == 0){
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3629 }else
3630 bits+= esc_length;
3631
3632 }
3633
3634 if(last>=0){
3635 if(s->mb_intra)
3636 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3637 else
3638 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3639 }
3640
3641 s->dsp.idct_add(bak, stride, temp);
3642
3643 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3644
3645 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3646 }
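/* Rate-distortion sketch: the return value above is D + lambda*R, where
   D is the SSE between the reconstructed block and the source and R is
   the bit count estimated from the run/level VLC length tables; lambda
   is qscale^2 * 109/128, with the "+ 64" and ">> 7" rounding the
   product to nearest. */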
3647
3648 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3649 MpegEncContext * const s= (MpegEncContext *)c;
3650 const uint8_t *scantable= s->intra_scantable.permutated;
3651 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3652 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3653 int i, last, run, bits, level, start_i;
3654 const int esc_length= s->ac_esc_length;
3655 uint8_t * length;
3656 uint8_t * last_length;
3657
3658 assert(h==8);
3659
3660 s->dsp.diff_pixels(temp, src1, src2, stride);
3661
3662 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3663
3664 bits=0;
3665
3666 if (s->mb_intra) {
3667 start_i = 1;
3668 length = s->intra_ac_vlc_length;
3669 last_length= s->intra_ac_vlc_last_length;
3670 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3671 } else {
3672 start_i = 0;
3673 length = s->inter_ac_vlc_length;
3674 last_length= s->inter_ac_vlc_last_length;
3675 }
3676
3677 if(last>=start_i){
3678 run=0;
3679 for(i=start_i; i<last; i++){
3680 int j= scantable[i];
3681 level= temp[j];
3682
3683 if(level){
3684 level+=64;
3685 if((level&(~127)) == 0){
3686 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3687 }else
3688 bits+= esc_length;
3689 run=0;
3690 }else
3691 run++;
3692 }
3693 i= scantable[last];
3694
3695 level= temp[i] + 64;
3696
3697 assert(level - 64);
3698
3699 if((level&(~127)) == 0){
3700 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3701 }else
3702 bits+= esc_length;
3703 }
3704
3705 return bits;
3706 }
3707
3708 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3709 int score=0;
3710 int x,y;
3711
3712 for(y=1; y<h; y++){
3713 for(x=0; x<16; x+=4){
3714 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3715 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
3716 }
3717 s+= stride;
3718 }
3719
3720 return score;
3721 }
3722
3723 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3724 int score=0;
3725 int x,y;
3726
3727 for(y=1; y<h; y++){
3728 for(x=0; x<16; x++){
3729 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3730 }
3731 s1+= stride;
3732 s2+= stride;
3733 }
3734
3735 return score;
3736 }
3737
3738 #define SQ(a) ((a)*(a))
3739 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3740 int score=0;
3741 int x,y;
3742
3743 for(y=1; y<h; y++){
3744 for(x=0; x<16; x+=4){
3745 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3746 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3747 }
3748 s+= stride;
3749 }
3750
3751 return score;
3752 }
3753
3754 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3755 int score=0;
3756 int x,y;
3757
3758 for(y=1; y<h; y++){
3759 for(x=0; x<16; x++){
3760 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3761 }
3762 s1+= stride;
3763 s2+= stride;
3764 }
3765
3766 return score;
3767 }
3768
3769 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3770 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3771 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3772 #ifdef CONFIG_GPL
3773 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3774 #endif
3775 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3776 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3777 WARPER8_16_SQ(rd8x8_c, rd16_c)
3778 WARPER8_16_SQ(bit8x8_c, bit16_c)
3779
3780 static void vector_fmul_c(float *dst, const float *src, int len){
3781 int i;
3782 for(i=0; i<len; i++)
3783 dst[i] *= src[i];
3784 }
3785
3786 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3787 int i;
3788 src1 += len-1;
3789 for(i=0; i<len; i++)
3790 dst[i] = src0[i] * src1[-i];
3791 }
3792
3793 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3794 int i;
3795 for(i=0; i<len; i++)
3796 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3797 }
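/* Usage sketch (typical audio windowing/overlap-add; the buffer names
   are assumed): multiplying a frame tail by the time-reversed window is
     vector_fmul_reverse(out, frame, window, len);
   and
     ff_vector_fmul_add_add_c(out, frame, window, overlap, 0, len, 1);
   fuses the window multiply with adding the saved overlap samples. */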
3798
3799 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3800 int i;
3801 for(i=0; i<len; i++) {
3802 int_fast32_t tmp = ((int32_t*)src)[i];
3803 if(tmp & 0xf0000){
3804 tmp = (0x43c0ffff - tmp)>>31;
3805 // is this faster on some gcc/cpu combinations?
3806 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3807 // else tmp = 0;
3808 }
3809 dst[i] = tmp - 0x8000;
3810 }
3811 }
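/* Bit-trick sketch: the caller is expected to hand in floats whose
   IEEE-754 bit pattern is 0x43C00000 + (sample + 0x8000), i.e. the
   sample scaled to [-1,1) plus a bias near 385.0, so the low 16 bits
   already hold the biased sample. (tmp & 0xf0000) is zero exactly
   inside that window; otherwise (0x43c0ffff - tmp) >> 31 yields
   all-ones or zero, which the final "- 0x8000" turns into the clamps
   32767 and -32768. */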
3812
3813 /* XXX: these functions should be removed as soon as all IDCTs are
3814 converted */
3815 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3816 {
3817 j_rev_dct (block);
3818 put_pixels_clamped_c(block, dest, line_size);
3819 }
3820 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3821 {
3822 j_rev_dct (block);
3823 add_pixels_clamped_c(block, dest, line_size);
3824 }
3825
3826 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3827 {
3828 j_rev_dct4 (block);
3829 put_pixels_clamped4_c(block, dest, line_size);
3830 }
3831 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3832 {
3833 j_rev_dct4 (block);
3834 add_pixels_clamped4_c(block, dest, line_size);
3835 }
3836
3837 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3838 {
3839 j_rev_dct2 (block);
3840 put_pixels_clamped2_c(block, dest, line_size);
3841 }
3842 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3843 {
3844 j_rev_dct2 (block);
3845 add_pixels_clamped2_c(block, dest, line_size);
3846 }
3847
3848 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3849 {
3850 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3851
3852 dest[0] = cm[(block[0] + 4)>>3];
3853 }
3854 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3855 {
3856 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3857
3858 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3859 }
3860
3861 static void just_return() { return; }
3862
3863 /* init static data */
3864 void dsputil_static_init(void)
3865 {
3866 int i;
3867
3868 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3869 for(i=0;i<MAX_NEG_CROP;i++) {
3870 cropTbl[i] = 0;
3871 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3872 }
3873
3874 for(i=0;i<512;i++) {
3875 squareTbl[i] = (i - 256) * (i - 256);
3876 }
3877
3878 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3879 }
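/* Table sketch: with cm = cropTbl + MAX_NEG_CROP, cm[x] is a branch-free
   clamp of x to 0..255 (MAX_NEG_CROP entries of headroom on each side),
   and squareTbl[x + 256] == x*x for x in [-256, 255]; both are used
   throughout the pixel routines above. */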
3880
3881
3882 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3883 {
3884 int i;
3885
3886 #ifdef CONFIG_ENCODERS
3887 if(avctx->dct_algo==FF_DCT_FASTINT) {
3888 c->fdct = fdct_ifast;
3889 c->fdct248 = fdct_ifast248;
3890 }
3891 else if(avctx->dct_algo==FF_DCT_FAAN) {
3892 c->fdct = ff_faandct;
3893 c->fdct248 = ff_faandct248;
3894 }
3895 else {
3896 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3897 c->fdct248 = ff_fdct248_islow;
3898 }
3899 #endif //CONFIG_ENCODERS
3900
3901 if(avctx->lowres==1){
3902 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3903 c->idct_put= ff_jref_idct4_put;
3904 c->idct_add= ff_jref_idct4_add;
3905 }else{
3906 c->idct_put= ff_h264_lowres_idct_put_c;
3907 c->idct_add= ff_h264_lowres_idct_add_c;
3908 }
3909 c->idct = j_rev_dct4;
3910 c->idct_permutation_type= FF_NO_IDCT_PERM;
3911 }else if(avctx->lowres==2){
3912 c->idct_put= ff_jref_idct2_put;
3913 c->idct_add= ff_jref_idct2_add;
3914 c->idct = j_rev_dct2;
3915 c->idct_permutation_type= FF_NO_IDCT_PERM;
3916 }else if(avctx->lowres==3){
3917 c->idct_put= ff_jref_idct1_put;
3918 c->idct_add= ff_jref_idct1_add;
3919 c->idct = j_rev_dct1;
3920 c->idct_permutation_type= FF_NO_IDCT_PERM;
3921 }else{
3922 if(avctx->idct_algo==FF_IDCT_INT){
3923 c->idct_put= ff_jref_idct_put;
3924 c->idct_add= ff_jref_idct_add;
3925 c->idct = j_rev_dct;
3926 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3927 }else if(avctx->idct_algo==FF_IDCT_VP3){
3928 c->idct_put= ff_vp3_idct_put_c;
3929 c->idct_add= ff_vp3_idct_add_c;
3930 c->idct = ff_vp3_idct_c;
3931 c->idct_permutation_type= FF_NO_IDCT_PERM;
3932 }else{ //accurate/default
3933 c->idct_put= simple_idct_put;
3934 c->idct_add= simple_idct_add;
3935 c->idct = simple_idct;
3936 c->idct_permutation_type= FF_NO_IDCT_PERM;
3937 }
3938 }
3939
3940 c->h264_idct_add= ff_h264_idct_add_c;
3941 c->h264_idct8_add= ff_h264_idct8_add_c;
3942 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3943 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3944
3945 c->get_pixels = get_pixels_c;
3946 c->diff_pixels = diff_pixels_c;
3947 c->put_pixels_clamped = put_pixels_clamped_c;
3948 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3949 c->add_pixels_clamped = add_pixels_clamped_c;
3950 c->add_pixels8 = add_pixels8_c;
3951 c->add_pixels4 = add_pixels4_c;
3952 c->gmc1 = gmc1_c;
3953 c->gmc = ff_gmc_c;
3954 c->clear_blocks = clear_blocks_c;
3955 c->pix_sum = pix_sum_c;
3956 c->pix_norm1 = pix_norm1_c;
3957
3958 /* pix_abs[0][] operates on 16-pixel-wide blocks, pix_abs[1][] on 8-wide */
3959 c->pix_abs[0][0] = pix_abs16_c;
3960 c->pix_abs[0][1] = pix_abs16_x2_c;
3961 c->pix_abs[0][2] = pix_abs16_y2_c;
3962 c->pix_abs[0][3] = pix_abs16_xy2_c;
3963 c->pix_abs[1][0] = pix_abs8_c;
3964 c->pix_abs[1][1] = pix_abs8_x2_c;
3965 c->pix_abs[1][2] = pix_abs8_y2_c;
3966 c->pix_abs[1][3] = pix_abs8_xy2_c;
3967
3968 #define dspfunc(PFX, IDX, NUM) \
3969 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3970 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3971 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3972 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3973
3974 dspfunc(put, 0, 16);
3975 dspfunc(put_no_rnd, 0, 16);
3976 dspfunc(put, 1, 8);
3977 dspfunc(put_no_rnd, 1, 8);
3978 dspfunc(put, 2, 4);
3979 dspfunc(put, 3, 2);
3980
3981 dspfunc(avg, 0, 16);
3982 dspfunc(avg_no_rnd, 0, 16);
3983 dspfunc(avg, 1, 8);
3984 dspfunc(avg_no_rnd, 1, 8);
3985 dspfunc(avg, 2, 4);
3986 dspfunc(avg, 3, 2);
3987 #undef dspfunc
3988
3989 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3990 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3991
3992 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3993 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3994 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3995 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3996 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3997 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3998 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3999 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4000 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4001
4002 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4003 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4004 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4005 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4006 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4007 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4008 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4009 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4010 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4011
4012 #define dspfunc(PFX, IDX, NUM) \
4013 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4014 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4015 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4016 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4017 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4018 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4019 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4020 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4021 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4022 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4023 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4024 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4025 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4026 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4027 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4028 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4029
4030 dspfunc(put_qpel, 0, 16);
4031 dspfunc(put_no_rnd_qpel, 0, 16);
4032
4033 dspfunc(avg_qpel, 0, 16);
4034 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4035
4036 dspfunc(put_qpel, 1, 8);
4037 dspfunc(put_no_rnd_qpel, 1, 8);
4038
4039 dspfunc(avg_qpel, 1, 8);
4040 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4041
4042 dspfunc(put_h264_qpel, 0, 16);
4043 dspfunc(put_h264_qpel, 1, 8);
4044 dspfunc(put_h264_qpel, 2, 4);
4045 dspfunc(put_h264_qpel, 3, 2);
4046 dspfunc(avg_h264_qpel, 0, 16);
4047 dspfunc(avg_h264_qpel, 1, 8);
4048 dspfunc(avg_h264_qpel, 2, 4);
4049
4050 #undef dspfunc
4051 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4052 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4053 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4054 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4055 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4056 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4057 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4058
4059 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4060 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4061 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4062 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4063 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4064 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4065 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4066 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4067 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4068 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4069 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4070 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4071 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4072 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4073 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4074 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4075 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4076 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4077 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4078 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4079
4080 #ifdef CONFIG_CAVS_DECODER
4081 ff_cavsdsp_init(c,avctx);
4082 #endif
4083 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4084 ff_vc1dsp_init(c,avctx);
4085 #endif
4086
4087 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4088 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4089 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4090 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4091 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4092 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4093 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4094 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4095
4096 #define SET_CMP_FUNC(name) \
4097 c->name[0]= name ## 16_c;\
4098 c->name[1]= name ## 8x8_c;
4099
4100 SET_CMP_FUNC(hadamard8_diff)
4101 c->hadamard8_diff[4]= hadamard8_intra16_c;
4102 SET_CMP_FUNC(dct_sad)
4103 SET_CMP_FUNC(dct_max)
4104 #ifdef CONFIG_GPL
4105 SET_CMP_FUNC(dct264_sad)
4106 #endif
4107 c->sad[0]= pix_abs16_c;
4108 c->sad[1]= pix_abs8_c;
4109 c->sse[0]= sse16_c;
4110 c->sse[1]= sse8_c;
4111 c->sse[2]= sse4_c;
4112 SET_CMP_FUNC(quant_psnr)
4113 SET_CMP_FUNC(rd)
4114 SET_CMP_FUNC(bit)
4115 c->vsad[0]= vsad16_c;
4116 c->vsad[4]= vsad_intra16_c;
4117 c->vsse[0]= vsse16_c;
4118 c->vsse[4]= vsse_intra16_c;
4119 c->nsse[0]= nsse16_c;
4120 c->nsse[1]= nsse8_c;
4121 #ifdef CONFIG_SNOW_ENCODER
4122 c->w53[0]= w53_16_c;
4123 c->w53[1]= w53_8_c;
4124 c->w97[0]= w97_16_c;
4125 c->w97[1]= w97_8_c;
4126 #endif
4127
4128 c->add_bytes= add_bytes_c;
4129 c->diff_bytes= diff_bytes_c;
4130 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4131 c->bswap_buf= bswap_buf;
4132
4133 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4134 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4135 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4136 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4137 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4138 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4139 c->h264_loop_filter_strength= NULL;
4140
4141 c->h263_h_loop_filter= h263_h_loop_filter_c;
4142 c->h263_v_loop_filter= h263_v_loop_filter_c;
4143
4144 c->h261_loop_filter= h261_loop_filter_c;
4145
4146 c->try_8x8basis= try_8x8basis_c;
4147 c->add_8x8basis= add_8x8basis_c;
4148
4149 #ifdef CONFIG_SNOW_ENCODER
4150 c->vertical_compose97i = ff_snow_vertical_compose97i;
4151 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4152 c->inner_add_yblock = ff_snow_inner_add_yblock;
4153 #endif
4154
4155 #ifdef CONFIG_VORBIS_DECODER
4156 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4157 #endif
4158 c->vector_fmul = vector_fmul_c;
4159 c->vector_fmul_reverse = vector_fmul_reverse_c;
4160 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4161 c->float_to_int16 = ff_float_to_int16_c;
4162
4163 c->shrink[0]= ff_img_copy_plane;
4164 c->shrink[1]= ff_shrink22;
4165 c->shrink[2]= ff_shrink44;
4166 c->shrink[3]= ff_shrink88;
4167
4168 c->prefetch= just_return;
4169
4170 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4171 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4172
4173 #ifdef HAVE_MMX
4174 dsputil_init_mmx(c, avctx);
4175 #endif
4176 #ifdef ARCH_ARMV4L
4177 dsputil_init_armv4l(c, avctx);
4178 #endif
4179 #ifdef HAVE_MLIB
4180 dsputil_init_mlib(c, avctx);
4181 #endif
4182 #ifdef ARCH_SPARC
4183 dsputil_init_vis(c,avctx);
4184 #endif
4185 #ifdef ARCH_ALPHA
4186 dsputil_init_alpha(c, avctx);
4187 #endif
4188 #ifdef ARCH_POWERPC
4189 dsputil_init_ppc(c, avctx);
4190 #endif
4191 #ifdef HAVE_MMI
4192 dsputil_init_mmi(c, avctx);
4193 #endif
4194 #ifdef ARCH_SH4
4195 dsputil_init_sh4(c,avctx);
4196 #endif
4197 #ifdef ARCH_BFIN
4198 dsputil_init_bfin(c,avctx);
4199 #endif
4200
4201 for(i=0; i<64; i++){
4202 if(!c->put_2tap_qpel_pixels_tab[0][i])
4203 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4204 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4205 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4206 }
4207
4208 switch(c->idct_permutation_type){
4209 case FF_NO_IDCT_PERM:
4210 for(i=0; i<64; i++)
4211 c->idct_permutation[i]= i;
4212 break;
4213 case FF_LIBMPEG2_IDCT_PERM:
4214 for(i=0; i<64; i++)
4215 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4216 break;
4217 case FF_SIMPLE_IDCT_PERM:
4218 for(i=0; i<64; i++)
4219 c->idct_permutation[i]= simple_mmx_permutation[i];
4220 break;
4221 case FF_TRANSPOSE_IDCT_PERM:
4222 for(i=0; i<64; i++)
4223 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4224 break;
4225 case FF_PARTTRANS_IDCT_PERM:
4226 for(i=0; i<64; i++)
4227 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4228 break;
4229 default:
4230 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4231 }
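/* Permutation sketch: idct_permutation maps natural coefficient order to
   whatever layout the selected IDCT consumes, so scantables and
   quantisation matrices can be permuted once up front. For instance
   FF_TRANSPOSE_IDCT_PERM is a plain 8x8 transpose:
   ((i&7)<<3) | (i>>3) sends index (row,col) to (col,row). */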
4232 }
4233