comparison src/ffmpeg/libavcodec/dsputil.c @ 832:52c45849841e trunk

[svn] - works
author nenolod
date Mon, 12 Mar 2007 15:43:38 -0700
parents b3b7a4e480b2
children
comparison
equal deleted inserted replaced
831:dec0488e1344 832:52c45849841e
1 /* 1 /*
2 * DSP utils 2 * DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard. 3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> 4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5 * 5 *
6 * This library is free software; you can redistribute it and/or 6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public 9 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either 10 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version. 11 * version 2.1 of the License, or (at your option) any later version.
10 * 12 *
11 * This library is distributed in the hope that it will be useful, 13 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details. 16 * Lesser General Public License for more details.
15 * 17 *
16 * You should have received a copy of the GNU Lesser General Public 18 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software 19 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 * 21 *
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at> 22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
21 */ 23 */
22 24
23 /** 25 /**
24 * @file dsputil.c 26 * @file dsputil.c
25 * DSP utils 27 * DSP utils
26 */ 28 */
27 29
28 #include "avcodec.h" 30 #include "avcodec.h"
29 #include "dsputil.h" 31 #include "dsputil.h"
30 #include "simple_idct.h" 32 #include "simple_idct.h"
31 33 #include "faandct.h"
32 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP]; 34 #include "snow.h"
33 uint32_t squareTbl[512]; 35
36 /* snow.c */
37 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
38
39 /* vorbis.c */
40 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
41
42 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
43 uint32_t squareTbl[512] = {0, };
34 44
35 const uint8_t ff_zigzag_direct[64] = { 45 const uint8_t ff_zigzag_direct[64] = {
36 0, 1, 8, 16, 9, 2, 3, 10, 46 0, 1, 8, 16, 9, 2, 3, 10,
37 17, 24, 32, 25, 18, 11, 4, 5, 47 17, 24, 32, 25, 18, 11, 4, 5,
38 12, 19, 26, 33, 40, 48, 41, 34, 48 12, 19, 26, 33, 40, 48, 41, 34,
55 22, 30, 7, 15, 23, 31, 38, 46, 65 22, 30, 7, 15, 23, 31, 38, 46,
56 53, 61, 54, 62, 39, 47, 55, 63, 66 53, 61, 54, 62, 39, 47, 55, 63,
57 }; 67 };
58 68
59 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */ 69 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
60 uint16_t inv_zigzag_direct16[64]; 70 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
61 71
62 const uint8_t ff_alternate_horizontal_scan[64] = { 72 const uint8_t ff_alternate_horizontal_scan[64] = {
63 0, 1, 2, 3, 8, 9, 16, 17, 73 0, 1, 2, 3, 8, 9, 16, 17,
64 10, 11, 4, 5, 6, 7, 15, 14, 74 10, 11, 4, 5, 6, 7, 15, 14,
65 13, 12, 19, 18, 24, 25, 32, 33, 75 13, 12, 19, 18, 24, 25, 32, 33,
66 26, 27, 20, 21, 22, 23, 28, 29, 76 26, 27, 20, 21, 22, 23, 28, 29,
67 30, 31, 34, 35, 40, 41, 48, 49, 77 30, 31, 34, 35, 40, 41, 48, 49,
68 42, 43, 36, 37, 38, 39, 44, 45, 78 42, 43, 36, 37, 38, 39, 44, 45,
69 46, 47, 50, 51, 56, 57, 58, 59, 79 46, 47, 50, 51, 56, 57, 58, 59,
70 52, 53, 54, 55, 60, 61, 62, 63, 80 52, 53, 54, 55, 60, 61, 62, 63,
71 }; 81 };
72 82
73 const uint8_t ff_alternate_vertical_scan[64] = { 83 const uint8_t ff_alternate_vertical_scan[64] = {
74 0, 8, 16, 24, 1, 9, 2, 10, 84 0, 8, 16, 24, 1, 9, 2, 10,
75 17, 25, 32, 40, 48, 56, 57, 49, 85 17, 25, 32, 40, 48, 56, 57, 49,
76 41, 33, 26, 18, 3, 11, 4, 12, 86 41, 33, 26, 18, 3, 11, 4, 12,
77 19, 27, 34, 42, 50, 58, 35, 43, 87 19, 27, 34, 42, 50, 58, 35, 43,
78 51, 59, 20, 28, 5, 13, 6, 14, 88 51, 59, 20, 28, 5, 13, 6, 14,
79 21, 29, 36, 44, 52, 60, 37, 45, 89 21, 29, 36, 44, 52, 60, 37, 45,
80 53, 61, 22, 30, 7, 15, 23, 31, 90 53, 61, 22, 30, 7, 15, 23, 31,
81 38, 46, 54, 62, 39, 47, 55, 63, 91 38, 46, 54, 62, 39, 47, 55, 63,
82 }; 92 };
83 93
84 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */ 94 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
85 const uint32_t inverse[256]={ 95 const uint32_t inverse[256]={
86 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757, 96 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
87 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154, 97 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
88 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709, 98 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
89 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333, 99 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
90 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367, 100 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
91 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283, 101 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
92 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315, 102 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
93 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085, 103 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
94 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498, 104 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
95 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675, 105 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
96 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441, 106 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
97 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183, 107 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
98 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712, 108 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
99 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400, 109 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
100 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163, 110 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
101 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641, 111 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
102 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573, 112 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
103 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737, 113 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
104 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493, 114 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
105 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373, 115 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
106 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368, 116 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
107 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671, 117 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
108 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767, 118 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
109 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740, 119 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
110 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751, 120 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
111 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635, 121 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
112 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593, 122 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
113 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944, 123 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
114 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933, 124 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
115 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575, 125 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
116 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532, 126 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
117 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010, 127 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
118 }; 128 };
119 129
120 /* Input permutation for the simple_idct_mmx */ 130 /* Input permutation for the simple_idct_mmx */
121 static const uint8_t simple_mmx_permutation[64]={ 131 static const uint8_t simple_mmx_permutation[64]={
122 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 132 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
123 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 133 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
124 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 134 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
125 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 135 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
126 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 136 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
127 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 137 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
128 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 138 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
129 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, 139 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
130 }; 140 };
131 #if 0 141
132 static int pix_sum_c(uint8_t * pix, int line_size) 142 static int pix_sum_c(uint8_t * pix, int line_size)
133 { 143 {
134 int s, i, j; 144 int s, i, j;
135 145
136 s = 0; 146 s = 0;
137 for (i = 0; i < 16; i++) { 147 for (i = 0; i < 16; i++) {
138 for (j = 0; j < 16; j += 8) { 148 for (j = 0; j < 16; j += 8) {
139 s += pix[0]; 149 s += pix[0];
140 s += pix[1]; 150 s += pix[1];
141 s += pix[2]; 151 s += pix[2];
142 s += pix[3]; 152 s += pix[3];
143 s += pix[4]; 153 s += pix[4];
144 s += pix[5]; 154 s += pix[5];
145 s += pix[6]; 155 s += pix[6];
146 s += pix[7]; 156 s += pix[7];
147 pix += 8; 157 pix += 8;
148 } 158 }
149 pix += line_size - 16; 159 pix += line_size - 16;
150 } 160 }
151 return s; 161 return s;
152 } 162 }
153 163
154 static int pix_norm1_c(uint8_t * pix, int line_size) 164 static int pix_norm1_c(uint8_t * pix, int line_size)
156 int s, i, j; 166 int s, i, j;
157 uint32_t *sq = squareTbl + 256; 167 uint32_t *sq = squareTbl + 256;
158 168
159 s = 0; 169 s = 0;
160 for (i = 0; i < 16; i++) { 170 for (i = 0; i < 16; i++) {
161 for (j = 0; j < 16; j += 8) { 171 for (j = 0; j < 16; j += 8) {
162 #if 0 172 #if 0
163 s += sq[pix[0]]; 173 s += sq[pix[0]];
164 s += sq[pix[1]]; 174 s += sq[pix[1]];
165 s += sq[pix[2]]; 175 s += sq[pix[2]];
166 s += sq[pix[3]]; 176 s += sq[pix[3]];
167 s += sq[pix[4]]; 177 s += sq[pix[4]];
168 s += sq[pix[5]]; 178 s += sq[pix[5]];
169 s += sq[pix[6]]; 179 s += sq[pix[6]];
170 s += sq[pix[7]]; 180 s += sq[pix[7]];
171 #else 181 #else
172 #if LONG_MAX > 2147483647 182 #if LONG_MAX > 2147483647
173 register uint64_t x=*(uint64_t*)pix; 183 register uint64_t x=*(uint64_t*)pix;
174 s += sq[x&0xff]; 184 s += sq[x&0xff];
175 s += sq[(x>>8)&0xff]; 185 s += sq[(x>>8)&0xff];
176 s += sq[(x>>16)&0xff]; 186 s += sq[(x>>16)&0xff];
177 s += sq[(x>>24)&0xff]; 187 s += sq[(x>>24)&0xff];
178 s += sq[(x>>32)&0xff]; 188 s += sq[(x>>32)&0xff];
179 s += sq[(x>>40)&0xff]; 189 s += sq[(x>>40)&0xff];
180 s += sq[(x>>48)&0xff]; 190 s += sq[(x>>48)&0xff];
181 s += sq[(x>>56)&0xff]; 191 s += sq[(x>>56)&0xff];
182 #else 192 #else
183 register uint32_t x=*(uint32_t*)pix; 193 register uint32_t x=*(uint32_t*)pix;
184 s += sq[x&0xff]; 194 s += sq[x&0xff];
185 s += sq[(x>>8)&0xff]; 195 s += sq[(x>>8)&0xff];
186 s += sq[(x>>16)&0xff]; 196 s += sq[(x>>16)&0xff];
187 s += sq[(x>>24)&0xff]; 197 s += sq[(x>>24)&0xff];
188 x=*(uint32_t*)(pix+4); 198 x=*(uint32_t*)(pix+4);
189 s += sq[x&0xff]; 199 s += sq[x&0xff];
190 s += sq[(x>>8)&0xff]; 200 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff]; 201 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff]; 202 s += sq[(x>>24)&0xff];
193 #endif 203 #endif
194 #endif 204 #endif
195 pix += 8; 205 pix += 8;
196 } 206 }
197 pix += line_size - 16; 207 pix += line_size - 16;
198 } 208 }
199 return s; 209 return s;
200 } 210 }
201 211
202 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){ 212 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
203 int i; 213 int i;
204 214
205 for(i=0; i+8<=w; i+=8){ 215 for(i=0; i+8<=w; i+=8){
206 dst[i+0]= bswap_32(src[i+0]); 216 dst[i+0]= bswap_32(src[i+0]);
207 dst[i+1]= bswap_32(src[i+1]); 217 dst[i+1]= bswap_32(src[i+1]);
208 dst[i+2]= bswap_32(src[i+2]); 218 dst[i+2]= bswap_32(src[i+2]);
209 dst[i+3]= bswap_32(src[i+3]); 219 dst[i+3]= bswap_32(src[i+3]);
213 dst[i+7]= bswap_32(src[i+7]); 223 dst[i+7]= bswap_32(src[i+7]);
214 } 224 }
215 for(;i<w; i++){ 225 for(;i<w; i++){
216 dst[i+0]= bswap_32(src[i+0]); 226 dst[i+0]= bswap_32(src[i+0]);
217 } 227 }
228 }
229
230 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
231 {
232 int s, i;
233 uint32_t *sq = squareTbl + 256;
234
235 s = 0;
236 for (i = 0; i < h; i++) {
237 s += sq[pix1[0] - pix2[0]];
238 s += sq[pix1[1] - pix2[1]];
239 s += sq[pix1[2] - pix2[2]];
240 s += sq[pix1[3] - pix2[3]];
241 pix1 += line_size;
242 pix2 += line_size;
243 }
244 return s;
218 } 245 }
219 246
220 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) 247 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
221 { 248 {
222 int s, i; 249 int s, i;
266 pix2 += line_size; 293 pix2 += line_size;
267 } 294 }
268 return s; 295 return s;
269 } 296 }
270 297
298
299 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
300 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
301 int s, i, j;
302 const int dec_count= w==8 ? 3 : 4;
303 int tmp[32*32];
304 int level, ori;
305 static const int scale[2][2][4][4]={
306 {
307 {
308 // 9/7 8x8 dec=3
309 {268, 239, 239, 213},
310 { 0, 224, 224, 152},
311 { 0, 135, 135, 110},
312 },{
313 // 9/7 16x16 or 32x32 dec=4
314 {344, 310, 310, 280},
315 { 0, 320, 320, 228},
316 { 0, 175, 175, 136},
317 { 0, 129, 129, 102},
318 }
319 },{
320 {
321 // 5/3 8x8 dec=3
322 {275, 245, 245, 218},
323 { 0, 230, 230, 156},
324 { 0, 138, 138, 113},
325 },{
326 // 5/3 16x16 or 32x32 dec=4
327 {352, 317, 317, 286},
328 { 0, 328, 328, 233},
329 { 0, 180, 180, 140},
330 { 0, 132, 132, 105},
331 }
332 }
333 };
334
335 for (i = 0; i < h; i++) {
336 for (j = 0; j < w; j+=4) {
337 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
338 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
339 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
340 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
341 }
342 pix1 += line_size;
343 pix2 += line_size;
344 }
345
346 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
347
348 s=0;
349 assert(w==h);
350 for(level=0; level<dec_count; level++){
351 for(ori= level ? 1 : 0; ori<4; ori++){
352 int size= w>>(dec_count-level);
353 int sx= (ori&1) ? size : 0;
354 int stride= 32<<(dec_count-level);
355 int sy= (ori&2) ? stride>>1 : 0;
356
357 for(i=0; i<size; i++){
358 for(j=0; j<size; j++){
359 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
360 s += FFABS(v);
361 }
362 }
363 }
364 }
365 assert(s>=0);
366 return s>>9;
367 }
368
369 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
370 return w_c(v, pix1, pix2, line_size, 8, h, 1);
371 }
372
373 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
374 return w_c(v, pix1, pix2, line_size, 8, h, 0);
375 }
376
377 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
378 return w_c(v, pix1, pix2, line_size, 16, h, 1);
379 }
380
381 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
382 return w_c(v, pix1, pix2, line_size, 16, h, 0);
383 }
384
385 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
386 return w_c(v, pix1, pix2, line_size, 32, h, 1);
387 }
388
389 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
390 return w_c(v, pix1, pix2, line_size, 32, h, 0);
391 }
392 #endif
393
271 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size) 394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
272 { 395 {
273 int i; 396 int i;
274 397
275 /* read the pixels */ 398 /* read the pixels */
286 block += 8; 409 block += 8;
287 } 410 }
288 } 411 }
289 412
290 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1, 413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
291 const uint8_t *s2, int stride){ 414 const uint8_t *s2, int stride){
292 int i; 415 int i;
293 416
294 /* read the pixels */ 417 /* read the pixels */
295 for(i=0;i<8;i++) { 418 for(i=0;i<8;i++) {
296 block[0] = s1[0] - s2[0]; 419 block[0] = s1[0] - s2[0];
307 } 430 }
308 } 431 }
309 432
310 433
311 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, 434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
312 int line_size) 435 int line_size)
313 { 436 {
314 int i; 437 int i;
315 uint8_t *cm = cropTbl + MAX_NEG_CROP; 438 uint8_t *cm = cropTbl + MAX_NEG_CROP;
316 439
317 /* read the pixels */ 440 /* read the pixels */
318 for(i=0;i<8;i++) { 441 for(i=0;i<8;i++) {
319 pixels[0] = cm[block[0]]; 442 pixels[0] = cm[block[0]];
320 pixels[1] = cm[block[1]]; 443 pixels[1] = cm[block[1]];
321 pixels[2] = cm[block[2]]; 444 pixels[2] = cm[block[2]];
328 pixels += line_size; 451 pixels += line_size;
329 block += 8; 452 block += 8;
330 } 453 }
331 } 454 }
332 455
456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
457 int line_size)
458 {
459 int i;
460 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461
462 /* read the pixels */
463 for(i=0;i<4;i++) {
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
468
469 pixels += line_size;
470 block += 8;
471 }
472 }
473
474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
475 int line_size)
476 {
477 int i;
478 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479
480 /* read the pixels */
481 for(i=0;i<2;i++) {
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
484
485 pixels += line_size;
486 block += 8;
487 }
488 }
489
490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
492 int line_size)
493 {
494 int i, j;
495
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
498 if (*block < -128)
499 *pixels = 0;
500 else if (*block > 127)
501 *pixels = 255;
502 else
503 *pixels = (uint8_t)(*block + 128);
504 block++;
505 pixels++;
506 }
507 pixels += (line_size - 8);
508 }
509 }
510
333 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels, 511 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
334 int line_size) 512 int line_size)
335 { 513 {
336 int i; 514 int i;
337 uint8_t *cm = cropTbl + MAX_NEG_CROP; 515 uint8_t *cm = cropTbl + MAX_NEG_CROP;
338 516
339 /* read the pixels */ 517 /* read the pixels */
340 for(i=0;i<8;i++) { 518 for(i=0;i<8;i++) {
341 pixels[0] = cm[pixels[0] + block[0]]; 519 pixels[0] = cm[pixels[0] + block[0]];
342 pixels[1] = cm[pixels[1] + block[1]]; 520 pixels[1] = cm[pixels[1] + block[1]];
343 pixels[2] = cm[pixels[2] + block[2]]; 521 pixels[2] = cm[pixels[2] + block[2]];
348 pixels[7] = cm[pixels[7] + block[7]]; 526 pixels[7] = cm[pixels[7] + block[7]];
349 pixels += line_size; 527 pixels += line_size;
350 block += 8; 528 block += 8;
351 } 529 }
352 } 530 }
353 #endif 531
532 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
533 int line_size)
534 {
535 int i;
536 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537
538 /* read the pixels */
539 for(i=0;i<4;i++) {
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels += line_size;
545 block += 8;
546 }
547 }
548
549 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
550 int line_size)
551 {
552 int i;
553 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554
555 /* read the pixels */
556 for(i=0;i<2;i++) {
557 pixels[0] = cm[pixels[0] + block[0]];
558 pixels[1] = cm[pixels[1] + block[1]];
559 pixels += line_size;
560 block += 8;
561 }
562 }
563
564 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
565 {
566 int i;
567 for(i=0;i<8;i++) {
568 pixels[0] += block[0];
569 pixels[1] += block[1];
570 pixels[2] += block[2];
571 pixels[3] += block[3];
572 pixels[4] += block[4];
573 pixels[5] += block[5];
574 pixels[6] += block[6];
575 pixels[7] += block[7];
576 pixels += line_size;
577 block += 8;
578 }
579 }
580
581 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
582 {
583 int i;
584 for(i=0;i<4;i++) {
585 pixels[0] += block[0];
586 pixels[1] += block[1];
587 pixels[2] += block[2];
588 pixels[3] += block[3];
589 pixels += line_size;
590 block += 4;
591 }
592 }
593
354 #if 0 594 #if 0
355 595
356 #define PIXOP2(OPNAME, OP) \ 596 #define PIXOP2(OPNAME, OP) \
357 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ 597 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358 {\ 598 {\
863 1103
864 #define op_avg(a, b) a = rnd_avg32(a, b) 1104 #define op_avg(a, b) a = rnd_avg32(a, b)
865 #endif 1105 #endif
866 #define op_put(a, b) a = b 1106 #define op_put(a, b) a = b
867 1107
868 //PIXOP2(avg, op_avg) 1108 PIXOP2(avg, op_avg)
869 //PIXOP2(put, op_put) 1109 PIXOP2(put, op_put)
870 #undef op_avg 1110 #undef op_avg
871 #undef op_put 1111 #undef op_put
872 1112
873 #define avg2(a,b) ((a+b+1)>>1) 1113 #define avg2(a,b) ((a+b+1)>>1)
874 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) 1114 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1115
1116 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1117 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1118 }
1119
1120 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1121 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1122 }
1123
1124 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1125 {
1126 const int A=(16-x16)*(16-y16);
1127 const int B=( x16)*(16-y16);
1128 const int C=(16-x16)*( y16);
1129 const int D=( x16)*( y16);
1130 int i;
1131
1132 for(i=0; i<h; i++)
1133 {
1134 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1135 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1136 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1137 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1138 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1139 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1140 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1141 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1142 dst+= stride;
1143 src+= stride;
1144 }
1145 }
1146
1147 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1148 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1149 {
1150 int y, vx, vy;
1151 const int s= 1<<shift;
1152
1153 width--;
1154 height--;
1155
1156 for(y=0; y<h; y++){
1157 int x;
1158
1159 vx= ox;
1160 vy= oy;
1161 for(x=0; x<8; x++){ //XXX FIXME optimize
1162 int src_x, src_y, frac_x, frac_y, index;
1163
1164 src_x= vx>>16;
1165 src_y= vy>>16;
1166 frac_x= src_x&(s-1);
1167 frac_y= src_y&(s-1);
1168 src_x>>=shift;
1169 src_y>>=shift;
1170
1171 if((unsigned)src_x < width){
1172 if((unsigned)src_y < height){
1173 index= src_x + src_y*stride;
1174 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1175 + src[index +1]* frac_x )*(s-frac_y)
1176 + ( src[index+stride ]*(s-frac_x)
1177 + src[index+stride+1]* frac_x )* frac_y
1178 + r)>>(shift*2);
1179 }else{
1180 index= src_x + clip(src_y, 0, height)*stride;
1181 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1182 + src[index +1]* frac_x )*s
1183 + r)>>(shift*2);
1184 }
1185 }else{
1186 if((unsigned)src_y < height){
1187 index= clip(src_x, 0, width) + src_y*stride;
1188 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1189 + src[index+stride ]* frac_y )*s
1190 + r)>>(shift*2);
1191 }else{
1192 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1193 dst[y*stride + x]= src[index ];
1194 }
1195 }
1196
1197 vx+= dxx;
1198 vy+= dyx;
1199 }
1200 ox += dxy;
1201 oy += dyy;
1202 }
1203 }
1204
1205 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1206 switch(width){
1207 case 2: put_pixels2_c (dst, src, stride, height); break;
1208 case 4: put_pixels4_c (dst, src, stride, height); break;
1209 case 8: put_pixels8_c (dst, src, stride, height); break;
1210 case 16:put_pixels16_c(dst, src, stride, height); break;
1211 }
1212 }
1213
1214 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1215 int i,j;
1216 for (i=0; i < height; i++) {
1217 for (j=0; j < width; j++) {
1218 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1219 }
1220 src += stride;
1221 dst += stride;
1222 }
1223 }
1224
1225 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1226 int i,j;
1227 for (i=0; i < height; i++) {
1228 for (j=0; j < width; j++) {
1229 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1230 }
1231 src += stride;
1232 dst += stride;
1233 }
1234 }
1235
1236 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 int i,j;
1238 for (i=0; i < height; i++) {
1239 for (j=0; j < width; j++) {
1240 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1241 }
1242 src += stride;
1243 dst += stride;
1244 }
1245 }
1246
1247 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 int i,j;
1249 for (i=0; i < height; i++) {
1250 for (j=0; j < width; j++) {
1251 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1252 }
1253 src += stride;
1254 dst += stride;
1255 }
1256 }
1257
1258 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 int i,j;
1260 for (i=0; i < height; i++) {
1261 for (j=0; j < width; j++) {
1262 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1263 }
1264 src += stride;
1265 dst += stride;
1266 }
1267 }
1268
1269 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 int i,j;
1271 for (i=0; i < height; i++) {
1272 for (j=0; j < width; j++) {
1273 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1274 }
1275 src += stride;
1276 dst += stride;
1277 }
1278 }
1279
1280 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 int i,j;
1282 for (i=0; i < height; i++) {
1283 for (j=0; j < width; j++) {
1284 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1285 }
1286 src += stride;
1287 dst += stride;
1288 }
1289 }
1290
1291 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 int i,j;
1293 for (i=0; i < height; i++) {
1294 for (j=0; j < width; j++) {
1295 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1296 }
1297 src += stride;
1298 dst += stride;
1299 }
1300 }
1301
1302 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 switch(width){
1304 case 2: avg_pixels2_c (dst, src, stride, height); break;
1305 case 4: avg_pixels4_c (dst, src, stride, height); break;
1306 case 8: avg_pixels8_c (dst, src, stride, height); break;
1307 case 16:avg_pixels16_c(dst, src, stride, height); break;
1308 }
1309 }
1310
1311 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312 int i,j;
1313 for (i=0; i < height; i++) {
1314 for (j=0; j < width; j++) {
1315 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1316 }
1317 src += stride;
1318 dst += stride;
1319 }
1320 }
1321
1322 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 int i,j;
1324 for (i=0; i < height; i++) {
1325 for (j=0; j < width; j++) {
1326 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1327 }
1328 src += stride;
1329 dst += stride;
1330 }
1331 }
1332
1333 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 int i,j;
1335 for (i=0; i < height; i++) {
1336 for (j=0; j < width; j++) {
1337 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1338 }
1339 src += stride;
1340 dst += stride;
1341 }
1342 }
1343
/* Third-pel interpolation, "avg" variant, (1/3, 1/3) position: weighted
 * average (4,3,3,2)/12 of the 2x2 neighbourhood (2731 ~= (1<<15)/12,
 * +6 rounds), then averaged with the existing destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 2*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731 * sum) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1354
/* Third-pel interpolation, "avg" variant, (1/3, 2/3) position: weighted
 * average (3,2,4,3)/12 of the 2x2 neighbourhood (2731 ~= (1<<15)/12,
 * +6 rounds), then averaged with the existing destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1]
                          + 4*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731 * sum) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1365
/* Third-pel interpolation, "avg" variant, (0, 2/3) position: vertical
 * interpolation with weights (1,2)/3 (683 ~= (1<<11)/3), then averaged
 * with the existing destination pixel (rounding up). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int interp = (683 * (src[col] + 2*src[col+stride] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1376
/* Third-pel interpolation, "avg" variant, (2/3, 1/3) position: weighted
 * average (3,4,2,3)/12 of the 2x2 neighbourhood (2731 ~= (1<<15)/12,
 * +6 rounds), then averaged with the existing destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1]
                          + 2*src[col+stride] + 3*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731 * sum) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1387
/* Third-pel interpolation, "avg" variant, (2/3, 2/3) position: weighted
 * average (2,3,3,4)/12 of the 2x2 neighbourhood (2731 ~= (1<<15)/12,
 * +6 rounds), then averaged with the existing destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1]
                          + 3*src[col+stride] + 4*src[col+stride+1] + 6;
            dst[col] = (dst[col] + ((2731 * sum) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
/* Dead code: this wrapper-generating macro was never finished.  Each body
 * is written as "void put_tpel_pixels_mcXX_c(...)" -- a declaration rather
 * than a call -- so it would not compile if the #if 0 were removed.  Kept
 * here disabled, presumably as a sketch for per-width tpel wrappers. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1419
/*
 * H264_CHROMA_MC(OPNAME, OP) generates the H.264 chroma motion-compensation
 * functions for block widths 2, 4 and 8 (named OPNAMEh264_chroma_mcN_c).
 * Each output pixel is the bilinear combination of its 2x2 source
 * neighbourhood with weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy, which
 * sum to 64; the OP macro supplied by the caller performs the final
 * rounding/rescaling (and, for the avg variants, the average with the
 * existing destination).  x and y are the 1/8-pel fractional offsets and
 * must lie in [0,8) (asserted).  The inner rows are fully unrolled.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1482
/* OP macros plugged into H264_CHROMA_MC: "b" is the raw bilinear sum whose
 * weights total 64, so +32 rounds and >>6 rescales to pixel range; op_avg
 * additionally averages with the existing destination pixel, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

/* Instantiate put_h264_chroma_mc{2,4,8}_c and avg_h264_chroma_mc{2,4,8}_c. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1490
/* H.264 chroma 8-wide MC, "no rounding" variant: bilinear interpolation
 * with weights A..D (summing to 64) and a downward-biased rounding term
 * (+32-4 instead of +32) before the >>6 rescale.  x and y are the 1/8-pel
 * fractional offsets and must lie in [0,8) (asserted). */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A*src[col] + B*src[col+1]
                      + C*src[stride+col] + D*src[stride+col+1] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1514
/* Copy h rows of 2 bytes from src to dst via the 16-bit load/store macros
 * (LD16/ST16, defined elsewhere in this file; presumably unaligned-safe). */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST16(dst, LD16(src));
        dst += dstStride;
        src += srcStride;
    }
}
1525
/* Copy h rows of 4 bytes from src to dst via the 32-bit load/store macros
 * (LD32/ST32, defined elsewhere in this file; presumably unaligned-safe). */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst, LD32(src));
        dst += dstStride;
        src += srcStride;
    }
}
1536
/* Copy h rows of 8 bytes from src to dst as two 32-bit chunks per row
 * (LD32/ST32 macros defined elsewhere in this file). */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst += dstStride;
        src += srcStride;
    }
}
1548
/* Copy h rows of 16 bytes from src to dst as four 32-bit chunks per row
 * (LD32/ST32 macros defined elsewhere in this file). */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst += dstStride;
        src += srcStride;
    }
}
1562
/* Copy h rows of 17 bytes (16 + one extra edge column, as needed by the
 * 16-wide qpel filters below): four 32-bit chunks plus a tail byte per row. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst     , LD32(src     ));
        ST32(dst +  4, LD32(src +  4));
        ST32(dst +  8, LD32(src +  8));
        ST32(dst + 12, LD32(src + 12));
        dst[16] = src[16];
        dst += dstStride;
        src += srcStride;
    }
}
1577
/* Copy h rows of 9 bytes (8 + one extra edge column, as needed by the
 * 8-wide qpel filters below): two 32-bit chunks plus a tail byte per row. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    while (h-- > 0) {
        ST32(dst    , LD32(src    ));
        ST32(dst + 4, LD32(src + 4));
        dst[8] = src[8];
        dst += dstStride;
        src += srcStride;
    }
}
1590
1591
/*
 * QPEL_MC(r, OPNAME, RND, OP) generates the full set of MPEG-4 quarter-pel
 * motion-compensation functions for 8x8 and 16x16 blocks:
 *
 *  - OPNAMEmpeg4_qpel{8,16}_{h,v}_lowpass: one-dimensional lowpass filters
 *    using the (20,-6,3,-1) tap pattern; the caller-supplied OP macro does
 *    the final rounding/clipping via cm (cropTbl + MAX_NEG_CROP).  Near the
 *    right/bottom edge the taps reuse the last sample and earlier samples
 *    (note src[8]/src[16] appearing repeatedly) -- edge handling that looks
 *    like the MPEG-4 mirroring rule; verify against the spec if changing.
 *  - OPNAMEqpel{8,16}_mcXY_c: the 16 quarter-pel positions (X,Y in 0..3),
 *    built from the lowpass passes plus the pixels*_l2/_l4 averaging
 *    helpers defined elsewhere in this file, using stack temporaries
 *    (full/half/halfH/halfV/halfHV).
 *  - ff_*_old_c variants: alternative implementations of the diagonal
 *    positions kept with external linkage, presumably as references for
 *    the optimized versions.
 *
 * RND selects the rounding flavour of the intermediate "put" passes and OP
 * the final store operation (put/avg, rounded or not) -- see the op_*
 * macros and QPEL_MC instantiations that follow this definition.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2074
/* Store primitives for the QPEL_MC-generated mpeg4 qpel functions.
   'b' is a 6-tap filter sum scaled by 32; cm[] (the clipping table set up
   inside each generated function) clips after rounding: +16 rounds to
   nearest, +15 is the "no_rnd" variant. op_avg additionally averages with
   the existing destination pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the qpel motion-compensation families; there is no
   avg_no_rnd family (commented out below). */
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
2088
#if 1
/*
 * H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the C half-sample lowpass primitives used by H.264 quarter-pel
 * motion compensation, for block sizes 2, 4, 8 and 16:
 *
 *   *_h_lowpass  - 6-tap (1,-5,20,20,-5,1) filter applied horizontally;
 *                  the unscaled sum is stored through OP (which rounds with
 *                  +16 and shifts by 5).
 *   *_v_lowpass  - the same filter applied vertically.
 *   *_hv_lowpass - horizontal pass written unclipped into the int16_t tmp[]
 *                  buffer (h+5 rows are produced, 2 above / 3 below the
 *                  block), then a vertical pass over tmp through OP2, which
 *                  uses the wider rounding (+512, >>10) for the doubled
 *                  filter scale.
 *
 * OPNAME is the function-name prefix (put_ / avg_); OP and OP2 perform the
 * final round/clip/store. The 16-wide versions are assembled from four
 * 8-wide calls (two side by side, then the lower half).
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\
2353
/*
 * H264_MC(OPNAME, SIZE)
 *
 * Generates the 16 H.264 quarter-pel motion-compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c, where X/Y are the quarter-pel
 * horizontal/vertical offsets (0..3). Each variant is built from the
 * H264_LOWPASS half-pel primitives:
 *   mc00           - plain copy (pixels##SIZE##_c).
 *   mc10/mc30      - average of source (or source+1) and horizontal half-pel.
 *   mc20 / mc02    - pure horizontal / vertical half-pel filter.
 *   mc01/mc03      - average of source column and vertical half-pel; the
 *                    source is first copied into full[] with 2 rows of
 *                    headroom above and 3 below (SIZE+5 rows).
 *   mc11/mc31/mc13/mc33 - average of a horizontal and a vertical half-pel
 *                    plane (shifted by one pixel / one row as appropriate).
 *   mc22           - 2-D half-pel via the hv lowpass (int16_t tmp buffer).
 *   mc21/mc23, mc12/mc32 - average of a 1-D half-pel plane and the 2-D
 *                    half-pel plane.
 * All intermediate planes are put_ (non-averaging); OPNAME only affects the
 * final store/average.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2490
/* Final store primitives for the h264 lowpass functions: the 1-D filter sum
   is scaled by 32 (round with +16, >>5); the 2-D hv path is scaled by 1024
   (round with +512, >>10). op2_* are used as OP2 in the hv functions. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

/* Instantiate the put_ and avg_ families; note no avg_ variant is
   generated for the 2-pel size. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
#endif
2512
/*
 * H264_WEIGHT(W,H): generates the explicit weighted-prediction functions
 * for a WxH partition.
 *   weight_h264_pixels*_c  : block[x] = clip((block[x]*weight + offset)
 *                            >> log2_denom), with offset pre-scaled and a
 *                            rounding term added when log2_denom != 0.
 *   biweight_h264_pixels*_c: dst[x] = clip((src[x]*weights + dst[x]*weightd
 *                            + offset) >> (log2_denom+1)); the offset is
 *                            forced odd before shifting ((offset+1)|1).
 * The unrolled bodies bail out early via 'continue' once W columns are done.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2567
/* Instantiate (bi)weighted prediction for every H.264 partition size. */
H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2582
2583 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2584 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2585 int i;
2586
2587 for(i=0; i<h; i++){
2588 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2589 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2590 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2591 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2592 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2593 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2594 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2595 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2596 dst+=dstStride;
2597 src+=srcStride;
2598 }
2599 }
2600
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* CAVS full-pel MC wrappers: the (0,0) quarter-pel position is a plain
   block copy, so these simply forward to the generic pixel copy helpers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
2618
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* VC-1 full-pel MC wrapper: position (0,0) is a plain 8x8 block copy;
   the 'rnd' rounding flag is irrelevant here and ignored. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2627
2628 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2629 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2630 int i;
2631
2632 for(i=0; i<w; i++){
2633 const int src_1= src[ -srcStride];
2634 const int src0 = src[0 ];
2635 const int src1 = src[ srcStride];
2636 const int src2 = src[2*srcStride];
2637 const int src3 = src[3*srcStride];
2638 const int src4 = src[4*srcStride];
2639 const int src5 = src[5*srcStride];
2640 const int src6 = src[6*srcStride];
2641 const int src7 = src[7*srcStride];
2642 const int src8 = src[8*srcStride];
2643 const int src9 = src[9*srcStride];
2644 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2645 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2646 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2647 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2648 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2649 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2650 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2651 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2652 src++;
2653 dst++;
2654 }
2655 }
2656
/* WMV2 mspel MC, integer-pel position: plain 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2660
/* WMV2 mspel MC, horizontal position 1/4: blend the source with the
   horizontal half-pel plane via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
2666
/* WMV2 mspel MC, horizontal position 2/4: pure horizontal half-pel filter
   written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2670
/* WMV2 mspel MC, horizontal position 3/4: blend the source shifted right
   by one pixel with the horizontal half-pel plane. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
2676
/* WMV2 mspel MC, vertical position 2/4: pure vertical half-pel filter. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2680
/* WMV2 mspel MC, position (1,2): blend the vertical half-pel plane with the
   2-D (horizontal-then-vertical) half-pel plane. halfH holds 11 filtered
   rows starting one row above the block; halfH+8 skips that extra row so
   the vertical pass is aligned with the block. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel MC, position (3,2): like mc12 but the vertical half-pel plane
   is taken one pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel MC, position (2,2): 2-D half-pel — horizontal filter into a
   temporary (11 rows, starting one row above), then vertical filter of the
   aligned part (halfH+8) straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2704
/* Sum of absolute differences over a 16-pixel-wide block of h rows.
 * 'v' is an unused context pointer kept for the me_cmp function signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sad += abs(pix1[k] - pix2[k]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2732
/* SAD of a 16-wide block against pix2 interpolated half a pixel to the
 * right (avg2 of columns k and k+1); reads one extra column of pix2. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sad += abs(pix1[k] - avg2(pix2[k], pix2[k + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2760
/* SAD of a 16-wide block against pix2 interpolated half a pixel down
 * (avg2 of each pixel with the one on the next line); reads one extra row. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sad += abs(pix1[k] - avg2(pix2[k], below[k]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sad;
}
2790
/* SAD of a 16-wide block against pix2 interpolated half a pixel both right
 * and down (avg4 of the 2x2 neighbourhood); reads one extra row and column. */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 16; k++)
            sad += abs(pix1[k] - avg4(pix2[k], pix2[k + 1], below[k], below[k + 1]));
        pix1 += line_size;
        pix2 += line_size;
        below += line_size;
    }
    return sad;
}
2820
/* Sum of absolute differences over an 8-pixel-wide block of h rows.
 * 'v' is an unused context pointer kept for the me_cmp function signature. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 8; k++)
            sad += abs(pix1[k] - pix2[k]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2840
/* SAD of an 8-wide block against pix2 interpolated half a pixel right. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, k;

    for (row = 0; row < h; row++) {
        for (k = 0; k < 8; k++)
            sad += abs(pix1[k] - avg2(pix2[k], pix2[k + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2860
/**
 * SAD against a vertically half-pel interpolated reference:
 * each reference pixel is avg2 of the pixel and the one a row below.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2882
/**
 * SAD against a diagonally half-pel interpolated reference:
 * each reference pixel is avg4 of the 2x2 neighbourhood.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
2904
/**
 * Estimate the cost of adding 'basis' scaled by 'scale' onto the residual
 * 'rem', weighting each coefficient by 'weight'.
 * NOTE(review): the exact scaling of the return value depends on
 * BASIS_SHIFT/RECON_SHIFT, which are defined elsewhere — verify against
 * the caller before interpreting the magnitude.
 * @return weighted sum of squares of the candidate residual, scaled down
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* scale the basis coefficient down to RECON_SHIFT precision with rounding */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        /* reconstructed value must stay in a sane range */
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
2919
2920 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2921 int i;
2922
2923 for(i=0; i<8*8; i++){
2924 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2925 }
2926 }
2927
2928 /**
2929 * permutes an 8x8 block.
2930 * @param block the block which will be permuted according to the given permutation vector
2931 * @param permutation the permutation vector
2932 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2933 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2934 * (inverse) permutated to scantable order!
2935 */
2936 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2937 {
2938 int i;
2939 DCTELEM temp[64];
2940
2941 if(last<=0) return;
2942 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2943
2944 for(i=0; i<=last; i++){
2945 const int j= scantable[i];
2946 temp[j]= block[j];
2947 block[j]=0;
2948 }
2949
2950 for(i=0; i<=last; i++){
2951 const int j= scantable[i];
2952 const int perm_j= permutation[j];
2953 block[perm_j]= temp[j];
2954 }
2955 }
2956
/* dummy compare function: always reports a cost of zero (selected via FF_CMP_ZERO) */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
2960
/**
 * Fill cmp[0..4] with the compare functions of the requested metric for
 * each block size; only the low 8 bits of 'type' select the metric.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){  /* upper bits may carry flags; metric lives in the low byte */
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
3020
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 * Zeroes all 6 blocks of 64 coefficients in one call.
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
3028
/**
 * Byte-wise addition: dst[i] += src[i] for 0 <= i < w.
 * Sums wrap modulo 256 since the operands are bytes.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;

    /* main loop, 8 bytes per iteration */
    for (; i <= w - 8; i += 8) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] += src[i + j];
    }
    /* tail: remaining 0..7 bytes */
    for (; i < w; i++)
        dst[i] += src[i];
}
3044
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Differences wrap modulo 256 since the result is stored in bytes.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;

    /* main loop, 8 bytes per iteration */
    for (; i <= w - 8; i += 8) {
        int j;
        for (j = 0; j < 8; j++)
            dst[i + j] = src1[i + j] - src2[i + j];
    }
    /* tail: remaining 0..7 bytes */
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
3060
/**
 * HuffYUV median-prediction subtraction: dst[i] = src2[i] - pred, where
 * pred is computed from the left (l), top (src1[i]) and top-left (lt)
 * neighbours via mid_pred (defined elsewhere; presumably the median of
 * the three values — confirm against its definition).
 * *left and *left_top carry the running neighbour state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];   /* next top-left = current top */
        l= src2[i];    /* next left = current source pixel */
        dst[i]= l - pred;
    }

    /* export the final state for the next row/call */
    *left= l;
    *left_top= lt;
}
3078
/* o1/o2 receive the sum and difference of i1/i2 (non-in-place butterfly) */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

/* in-place butterfly: x,y <- x+y, x-y */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y| : butterfly folded directly into an absolute sum */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

/* 8-point 1-D transform operating through SRC(i)/DST(i,v), which the
   including code must define. NOTE(review): the a4..a7 terms with the
   >>1 / >>2 scalings match the H.264 8x8 integer-DCT butterfly structure
   (hence its use by the dct264 cost function) — verify against the spec
   before relying on exact output scaling. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3120
/** In-place element-wise multiply: dst[i] *= src[i] for 0 <= i < len. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int idx;
    for (idx = 0; idx < len; idx++)
        dst[idx] = dst[idx] * src[idx];
}
3126
/** dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 traversed backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int idx;
    for (idx = 0; idx < len; idx++)
        dst[idx] = src0[idx] * src1[len - 1 - idx];
}
3133
/**
 * Fused multiply-add with bias: dst[i*step] = src0[i]*src1[i] + src2[i] + src3.
 * 'step' lets the output be written with a stride; 'src3' is an integer bias.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int idx;
    for (idx = 0; idx < len; idx++) {
        const float prod = src0[idx] * src1[idx];
        dst[idx * step] = prod + src2[idx] + src3;
    }
}
3139
/**
 * Convert floats to signed 16-bit integers with saturation.
 * NOTE(review): this reads the raw IEEE-754 bit pattern of each float via a
 * pointer cast (a strict-aliasing violation in ISO C) and relies on the
 * caller having pre-scaled/biased the values so that in-range samples put
 * their 16 bits in the low mantissa — verify against the callers before
 * changing anything here.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int_fast32_t tmp = ((int32_t*)src)[i];
        if(tmp & 0xf0000){
            /* out of range: the sign of (0x43c0ffff - tmp) spread over all
               bits selects the saturation value (0xFFFF or 0, see below) */
            tmp = (0x43c0ffff - tmp)>>31;
            // is this faster on some gcc/cpu combinations?
            // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
            // else tmp = 0;
        }
        dst[i] = tmp - 0x8000;  /* remove the bias to get a signed sample */
    }
}
3153
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */

/* jref (accurate integer) IDCT + write the clamped result to dest */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* jref IDCT + add the clamped result onto dest */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

/* reduced 4-point variants (only low-frequency coefficients used) */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

/* reduced 2-point variants */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

/* degenerate 1-point IDCT: only the DC term, rescaled with rounding and clamped */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
3201
/* no-op stub; the empty (K&R-style) parameter list lets it be assigned to
   function pointers of differing signatures, e.g. c->prefetch below */
static void just_return() { return; }
875 3203
/* init static data */
void dsputil_static_init(void)
{
    int i;

    /* cropTbl: clamp-to-[0,255] lookup table with MAX_NEG_CROP guard
       entries on each side, so cropTbl[MAX_NEG_CROP + x] == clip(x) */
    for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        cropTbl[i] = 0;
        cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    /* squareTbl[256 + x] == x*x for x in [-256, 255] */
    for(i=0;i<512;i++) {
        squareTbl[i] = (i - 256) * (i - 256);
    }

    /* inverse zigzag scan, stored 1-based (0 means "not present") */
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
3221
3222
/**
 * Initialize a DSPContext with the C reference implementations, then let
 * the platform-specific init functions (MMX, ARM, AltiVec, ...) override
 * individual entries with optimized versions.
 * @param c the context to fill in
 * @param avctx codec context; dct_algo and the chosen IDCT's
 *        idct_permutation_type are consulted
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

#ifdef CONFIG_ENCODERS
    /* forward DCT selection (encoder only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* basic pixel-block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* half-pel put/avg tables, indexed [size][interpolation] */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (indices 3, 7, 11, ... unused) */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* quarter-pel motion compensation: 16 subpixel positions per table */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* H.264 chroma MC and (bi)weighted prediction */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif

    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_ENCODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->float_to_int16 = ff_float_to_int16_c;

    c->prefetch= just_return;

    /* cleared here so the per-arch inits below can fill them; any entry
       still NULL afterwards falls back to the h264 qpel functions */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* platform-specific overrides */
#ifdef HAVE_MMX
    dsputil_init_mmx(c, avctx);
#endif
#ifdef ARCH_ARMV4L
    dsputil_init_armv4l(c, avctx);
#endif
#ifdef HAVE_MLIB
    dsputil_init_mlib(c, avctx);
#endif
#ifdef ARCH_SPARC
    dsputil_init_vis(c,avctx);
#endif
#ifdef ARCH_ALPHA
    dsputil_init_alpha(c, avctx);
#endif
#ifdef ARCH_POWERPC
    dsputil_init_ppc(c, avctx);
#endif
#ifdef HAVE_MMI
    dsputil_init_mmi(c, avctx);
#endif
#ifdef ARCH_SH4
    dsputil_init_sh4(c,avctx);
#endif
#ifdef ARCH_BFIN
    dsputil_init_bfin(c,avctx);
#endif

    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation table matching the selected IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
3484