|
808
|
1 /*
|
|
|
2 * Copyright (c) 2002 Brian Foley
|
|
|
3 * Copyright (c) 2002 Dieter Shirley
|
|
|
4 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
|
|
|
5 *
|
|
|
6 * This file is part of FFmpeg.
|
|
|
7 *
|
|
|
8 * FFmpeg is free software; you can redistribute it and/or
|
|
|
9 * modify it under the terms of the GNU Lesser General Public
|
|
|
10 * License as published by the Free Software Foundation; either
|
|
|
11 * version 2.1 of the License, or (at your option) any later version.
|
|
|
12 *
|
|
|
13 * FFmpeg is distributed in the hope that it will be useful,
|
|
|
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
16 * Lesser General Public License for more details.
|
|
|
17 *
|
|
|
18 * You should have received a copy of the GNU Lesser General Public
|
|
|
19 * License along with FFmpeg; if not, write to the Free Software
|
|
|
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
21 */
|
|
|
22
|
|
|
23 #include "../dsputil.h"
|
|
|
24
|
|
|
25 #include "gcc_fixes.h"
|
|
|
26
|
|
|
27 #include "dsputil_altivec.h"
|
|
|
28
|
|
|
29 #ifdef CONFIG_DARWIN
|
|
|
30 #include <sys/sysctl.h>
|
|
|
31 #else /* CONFIG_DARWIN */
|
|
|
32 #ifdef __AMIGAOS4__
|
|
|
33 #include <exec/exec.h>
|
|
|
34 #include <interfaces/exec.h>
|
|
|
35 #include <proto/exec.h>
|
|
|
36 #else /* __AMIGAOS4__ */
|
|
|
37 #include <signal.h>
|
|
|
38 #include <setjmp.h>
|
|
|
39
|
|
|
40 static sigjmp_buf jmpbuf;
|
|
|
41 static volatile sig_atomic_t canjump = 0;
|
|
|
42
|
|
|
43 static void sigill_handler (int sig)
|
|
|
44 {
|
|
|
45 if (!canjump) {
|
|
|
46 signal (sig, SIG_DFL);
|
|
|
47 raise (sig);
|
|
|
48 }
|
|
|
49
|
|
|
50 canjump = 0;
|
|
|
51 siglongjmp (jmpbuf, 1);
|
|
|
52 }
|
|
|
#endif /* __AMIGAOS4__ */
#endif /* CONFIG_DARWIN */
|
|
|
55
|
|
|
56 int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
57 {
|
|
|
58 int i;
|
|
|
59 int s __attribute__((aligned(16)));
|
|
|
60 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
61 vector unsigned char *tv;
|
|
|
62 vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
|
|
|
63 vector unsigned int sad;
|
|
|
64 vector signed int sumdiffs;
|
|
|
65
|
|
|
66 s = 0;
|
|
|
67 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
68 for(i=0;i<h;i++) {
|
|
|
69 /*
|
|
|
70 Read unaligned pixels into our vectors. The vectors are as follows:
|
|
|
71 pix1v: pix1[0]-pix1[15]
|
|
|
72 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
|
|
|
73 */
|
|
|
74 tv = (vector unsigned char *) pix1;
|
|
|
75 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
|
|
|
76
|
|
|
77 tv = (vector unsigned char *) &pix2[0];
|
|
|
78 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
|
|
|
79
|
|
|
80 tv = (vector unsigned char *) &pix2[1];
|
|
|
81 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
|
|
|
82
|
|
|
83 /* Calculate the average vector */
|
|
|
84 avgv = vec_avg(pix2v, pix2iv);
|
|
|
85
|
|
|
86 /* Calculate a sum of abs differences vector */
|
|
|
87 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
|
|
|
88
|
|
|
89 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
90 sad = vec_sum4s(t5, sad);
|
|
|
91
|
|
|
92 pix1 += line_size;
|
|
|
93 pix2 += line_size;
|
|
|
94 }
|
|
|
95 /* Sum up the four partial sums, and put the result into s */
|
|
|
96 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
97 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
98 vec_ste(sumdiffs, 0, &s);
|
|
|
99
|
|
|
100 return s;
|
|
|
101 }
|
|
|
102
|
|
|
103 int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
104 {
|
|
|
105 int i;
|
|
|
106 int s __attribute__((aligned(16)));
|
|
|
107 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
108 vector unsigned char *tv;
|
|
|
109 vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
|
|
|
110 vector unsigned int sad;
|
|
|
111 vector signed int sumdiffs;
|
|
|
112 uint8_t *pix3 = pix2 + line_size;
|
|
|
113
|
|
|
114 s = 0;
|
|
|
115 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
116
|
|
|
117 /*
|
|
|
118 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
|
|
|
119 iteration becomes pix2 in the next iteration. We can use this
|
|
|
120 fact to avoid a potentially expensive unaligned read, each
|
|
|
121 time around the loop.
|
|
|
122 Read unaligned pixels into our vectors. The vectors are as follows:
|
|
|
123 pix2v: pix2[0]-pix2[15]
|
|
|
124 Split the pixel vectors into shorts
|
|
|
125 */
|
|
|
126 tv = (vector unsigned char *) &pix2[0];
|
|
|
127 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
|
|
|
128
|
|
|
129 for(i=0;i<h;i++) {
|
|
|
130 /*
|
|
|
131 Read unaligned pixels into our vectors. The vectors are as follows:
|
|
|
132 pix1v: pix1[0]-pix1[15]
|
|
|
133 pix3v: pix3[0]-pix3[15]
|
|
|
134 */
|
|
|
135 tv = (vector unsigned char *) pix1;
|
|
|
136 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
|
|
|
137
|
|
|
138 tv = (vector unsigned char *) &pix3[0];
|
|
|
139 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
|
|
|
140
|
|
|
141 /* Calculate the average vector */
|
|
|
142 avgv = vec_avg(pix2v, pix3v);
|
|
|
143
|
|
|
144 /* Calculate a sum of abs differences vector */
|
|
|
145 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
|
|
|
146
|
|
|
147 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
148 sad = vec_sum4s(t5, sad);
|
|
|
149
|
|
|
150 pix1 += line_size;
|
|
|
151 pix2v = pix3v;
|
|
|
152 pix3 += line_size;
|
|
|
153
|
|
|
154 }
|
|
|
155
|
|
|
156 /* Sum up the four partial sums, and put the result into s */
|
|
|
157 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
158 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
159 vec_ste(sumdiffs, 0, &s);
|
|
|
160 return s;
|
|
|
161 }
|
|
|
162
|
|
|
163 int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
164 {
|
|
|
165 int i;
|
|
|
166 int s __attribute__((aligned(16)));
|
|
|
167 uint8_t *pix3 = pix2 + line_size;
|
|
|
168 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
169 const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
|
|
|
170 vector unsigned char *tv, avgv, t5;
|
|
|
171 vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
|
|
|
172 vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
|
|
|
173 vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
|
|
|
174 vector unsigned short avghv, avglv;
|
|
|
175 vector unsigned short t1, t2, t3, t4;
|
|
|
176 vector unsigned int sad;
|
|
|
177 vector signed int sumdiffs;
|
|
|
178
|
|
|
179 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
180
|
|
|
181 s = 0;
|
|
|
182
|
|
|
183 /*
|
|
|
184 Due to the fact that pix3 = pix2 + line_size, the pix3 of one
|
|
|
185 iteration becomes pix2 in the next iteration. We can use this
|
|
|
186 fact to avoid a potentially expensive unaligned read, as well
|
|
|
187 as some splitting, and vector addition each time around the loop.
|
|
|
188 Read unaligned pixels into our vectors. The vectors are as follows:
|
|
|
189 pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
|
|
|
190 Split the pixel vectors into shorts
|
|
|
191 */
|
|
|
192 tv = (vector unsigned char *) &pix2[0];
|
|
|
193 pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
|
|
|
194
|
|
|
195 tv = (vector unsigned char *) &pix2[1];
|
|
|
196 pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
|
|
|
197
|
|
|
198 pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
|
|
|
199 pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
|
|
|
200 pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
|
|
|
201 pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
|
|
|
202 t1 = vec_add(pix2hv, pix2ihv);
|
|
|
203 t2 = vec_add(pix2lv, pix2ilv);
|
|
|
204
|
|
|
205 for(i=0;i<h;i++) {
|
|
|
206 /*
|
|
|
207 Read unaligned pixels into our vectors. The vectors are as follows:
|
|
|
208 pix1v: pix1[0]-pix1[15]
|
|
|
209 pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
|
|
|
210 */
|
|
|
211 tv = (vector unsigned char *) pix1;
|
|
|
212 pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
|
|
|
213
|
|
|
214 tv = (vector unsigned char *) &pix3[0];
|
|
|
215 pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
|
|
|
216
|
|
|
217 tv = (vector unsigned char *) &pix3[1];
|
|
|
218 pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
|
|
|
219
|
|
|
220 /*
|
|
|
221 Note that Altivec does have vec_avg, but this works on vector pairs
|
|
|
222 and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
|
|
|
223 would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
|
|
|
224 Instead, we have to split the pixel vectors into vectors of shorts,
|
|
|
225 and do the averaging by hand.
|
|
|
226 */
|
|
|
227
|
|
|
228 /* Split the pixel vectors into shorts */
|
|
|
229 pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
|
|
|
230 pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
|
|
|
231 pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
|
|
|
232 pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
|
|
|
233
|
|
|
234 /* Do the averaging on them */
|
|
|
235 t3 = vec_add(pix3hv, pix3ihv);
|
|
|
236 t4 = vec_add(pix3lv, pix3ilv);
|
|
|
237
|
|
|
238 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
|
|
|
239 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
|
|
|
240
|
|
|
241 /* Pack the shorts back into a result */
|
|
|
242 avgv = vec_pack(avghv, avglv);
|
|
|
243
|
|
|
244 /* Calculate a sum of abs differences vector */
|
|
|
245 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
|
|
|
246
|
|
|
247 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
248 sad = vec_sum4s(t5, sad);
|
|
|
249
|
|
|
250 pix1 += line_size;
|
|
|
251 pix3 += line_size;
|
|
|
252 /* Transfer the calculated values for pix3 into pix2 */
|
|
|
253 t1 = t3;
|
|
|
254 t2 = t4;
|
|
|
255 }
|
|
|
256 /* Sum up the four partial sums, and put the result into s */
|
|
|
257 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
258 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
259 vec_ste(sumdiffs, 0, &s);
|
|
|
260
|
|
|
261 return s;
|
|
|
262 }
|
|
|
263
|
|
|
264 int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
265 {
|
|
|
266 int i;
|
|
|
267 int s __attribute__((aligned(16)));
|
|
|
268 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
269 vector unsigned char perm1, perm2, *pix1v, *pix2v;
|
|
|
270 vector unsigned char t1, t2, t3,t4, t5;
|
|
|
271 vector unsigned int sad;
|
|
|
272 vector signed int sumdiffs;
|
|
|
273
|
|
|
274 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
275
|
|
|
276
|
|
|
277 for(i=0;i<h;i++) {
|
|
|
278 /* Read potentially unaligned pixels into t1 and t2 */
|
|
|
279 perm1 = vec_lvsl(0, pix1);
|
|
|
280 pix1v = (vector unsigned char *) pix1;
|
|
|
281 perm2 = vec_lvsl(0, pix2);
|
|
|
282 pix2v = (vector unsigned char *) pix2;
|
|
|
283 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
|
|
|
284 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
|
|
|
285
|
|
|
286 /* Calculate a sum of abs differences vector */
|
|
|
287 t3 = vec_max(t1, t2);
|
|
|
288 t4 = vec_min(t1, t2);
|
|
|
289 t5 = vec_sub(t3, t4);
|
|
|
290
|
|
|
291 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
292 sad = vec_sum4s(t5, sad);
|
|
|
293
|
|
|
294 pix1 += line_size;
|
|
|
295 pix2 += line_size;
|
|
|
296 }
|
|
|
297
|
|
|
298 /* Sum up the four partial sums, and put the result into s */
|
|
|
299 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
300 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
301 vec_ste(sumdiffs, 0, &s);
|
|
|
302
|
|
|
303 return s;
|
|
|
304 }
|
|
|
305
|
|
|
306 int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
307 {
|
|
|
308 int i;
|
|
|
309 int s __attribute__((aligned(16)));
|
|
|
310 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
311 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
|
|
|
312 vector unsigned char t1, t2, t3,t4, t5;
|
|
|
313 vector unsigned int sad;
|
|
|
314 vector signed int sumdiffs;
|
|
|
315
|
|
|
316 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
317
|
|
|
318 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
|
|
|
319
|
|
|
320 for(i=0;i<h;i++) {
|
|
|
321 /* Read potentially unaligned pixels into t1 and t2
|
|
|
322 Since we're reading 16 pixels, and actually only want 8,
|
|
|
323 mask out the last 8 pixels. The 0s don't change the sum. */
|
|
|
324 perm1 = vec_lvsl(0, pix1);
|
|
|
325 pix1v = (vector unsigned char *) pix1;
|
|
|
326 perm2 = vec_lvsl(0, pix2);
|
|
|
327 pix2v = (vector unsigned char *) pix2;
|
|
|
328 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
|
|
|
329 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
|
|
|
330
|
|
|
331 /* Calculate a sum of abs differences vector */
|
|
|
332 t3 = vec_max(t1, t2);
|
|
|
333 t4 = vec_min(t1, t2);
|
|
|
334 t5 = vec_sub(t3, t4);
|
|
|
335
|
|
|
336 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
337 sad = vec_sum4s(t5, sad);
|
|
|
338
|
|
|
339 pix1 += line_size;
|
|
|
340 pix2 += line_size;
|
|
|
341 }
|
|
|
342
|
|
|
343 /* Sum up the four partial sums, and put the result into s */
|
|
|
344 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
345 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
346 vec_ste(sumdiffs, 0, &s);
|
|
|
347
|
|
|
348 return s;
|
|
|
349 }
|
|
|
350
|
|
|
351 int pix_norm1_altivec(uint8_t *pix, int line_size)
|
|
|
352 {
|
|
|
353 int i;
|
|
|
354 int s __attribute__((aligned(16)));
|
|
|
355 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
356 vector unsigned char *tv;
|
|
|
357 vector unsigned char pixv;
|
|
|
358 vector unsigned int sv;
|
|
|
359 vector signed int sum;
|
|
|
360
|
|
|
361 sv = (vector unsigned int)vec_splat_u32(0);
|
|
|
362
|
|
|
363 s = 0;
|
|
|
364 for (i = 0; i < 16; i++) {
|
|
|
365 /* Read in the potentially unaligned pixels */
|
|
|
366 tv = (vector unsigned char *) pix;
|
|
|
367 pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
|
|
|
368
|
|
|
369 /* Square the values, and add them to our sum */
|
|
|
370 sv = vec_msum(pixv, pixv, sv);
|
|
|
371
|
|
|
372 pix += line_size;
|
|
|
373 }
|
|
|
374 /* Sum up the four partial sums, and put the result into s */
|
|
|
375 sum = vec_sums((vector signed int) sv, (vector signed int) zero);
|
|
|
376 sum = vec_splat(sum, 3);
|
|
|
377 vec_ste(sum, 0, &s);
|
|
|
378
|
|
|
379 return s;
|
|
|
380 }
|
|
|
381
|
|
|
382 /**
|
|
|
383 * Sum of Squared Errors for a 8x8 block.
|
|
|
384 * AltiVec-enhanced.
|
|
|
385 * It's the sad8_altivec code above w/ squaring added.
|
|
|
386 */
|
|
|
387 int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
388 {
|
|
|
389 int i;
|
|
|
390 int s __attribute__((aligned(16)));
|
|
|
391 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
392 vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
|
|
|
393 vector unsigned char t1, t2, t3,t4, t5;
|
|
|
394 vector unsigned int sum;
|
|
|
395 vector signed int sumsqr;
|
|
|
396
|
|
|
397 sum = (vector unsigned int)vec_splat_u32(0);
|
|
|
398
|
|
|
399 permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
|
|
|
400
|
|
|
401
|
|
|
402 for(i=0;i<h;i++) {
|
|
|
403 /* Read potentially unaligned pixels into t1 and t2
|
|
|
404 Since we're reading 16 pixels, and actually only want 8,
|
|
|
405 mask out the last 8 pixels. The 0s don't change the sum. */
|
|
|
406 perm1 = vec_lvsl(0, pix1);
|
|
|
407 pix1v = (vector unsigned char *) pix1;
|
|
|
408 perm2 = vec_lvsl(0, pix2);
|
|
|
409 pix2v = (vector unsigned char *) pix2;
|
|
|
410 t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
|
|
|
411 t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
|
|
|
412
|
|
|
413 /*
|
|
|
414 Since we want to use unsigned chars, we can take advantage
|
|
|
415 of the fact that abs(a-b)^2 = (a-b)^2.
|
|
|
416 */
|
|
|
417
|
|
|
418 /* Calculate abs differences vector */
|
|
|
419 t3 = vec_max(t1, t2);
|
|
|
420 t4 = vec_min(t1, t2);
|
|
|
421 t5 = vec_sub(t3, t4);
|
|
|
422
|
|
|
423 /* Square the values and add them to our sum */
|
|
|
424 sum = vec_msum(t5, t5, sum);
|
|
|
425
|
|
|
426 pix1 += line_size;
|
|
|
427 pix2 += line_size;
|
|
|
428 }
|
|
|
429
|
|
|
430 /* Sum up the four partial sums, and put the result into s */
|
|
|
431 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
|
|
|
432 sumsqr = vec_splat(sumsqr, 3);
|
|
|
433 vec_ste(sumsqr, 0, &s);
|
|
|
434
|
|
|
435 return s;
|
|
|
436 }
|
|
|
437
|
|
|
438 /**
|
|
|
439 * Sum of Squared Errors for a 16x16 block.
|
|
|
440 * AltiVec-enhanced.
|
|
|
441 * It's the sad16_altivec code above w/ squaring added.
|
|
|
442 */
|
|
|
443 int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
|
|
|
444 {
|
|
|
445 int i;
|
|
|
446 int s __attribute__((aligned(16)));
|
|
|
447 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
448 vector unsigned char perm1, perm2, *pix1v, *pix2v;
|
|
|
449 vector unsigned char t1, t2, t3,t4, t5;
|
|
|
450 vector unsigned int sum;
|
|
|
451 vector signed int sumsqr;
|
|
|
452
|
|
|
453 sum = (vector unsigned int)vec_splat_u32(0);
|
|
|
454
|
|
|
455 for(i=0;i<h;i++) {
|
|
|
456 /* Read potentially unaligned pixels into t1 and t2 */
|
|
|
457 perm1 = vec_lvsl(0, pix1);
|
|
|
458 pix1v = (vector unsigned char *) pix1;
|
|
|
459 perm2 = vec_lvsl(0, pix2);
|
|
|
460 pix2v = (vector unsigned char *) pix2;
|
|
|
461 t1 = vec_perm(pix1v[0], pix1v[1], perm1);
|
|
|
462 t2 = vec_perm(pix2v[0], pix2v[1], perm2);
|
|
|
463
|
|
|
464 /*
|
|
|
465 Since we want to use unsigned chars, we can take advantage
|
|
|
466 of the fact that abs(a-b)^2 = (a-b)^2.
|
|
|
467 */
|
|
|
468
|
|
|
469 /* Calculate abs differences vector */
|
|
|
470 t3 = vec_max(t1, t2);
|
|
|
471 t4 = vec_min(t1, t2);
|
|
|
472 t5 = vec_sub(t3, t4);
|
|
|
473
|
|
|
474 /* Square the values and add them to our sum */
|
|
|
475 sum = vec_msum(t5, t5, sum);
|
|
|
476
|
|
|
477 pix1 += line_size;
|
|
|
478 pix2 += line_size;
|
|
|
479 }
|
|
|
480
|
|
|
481 /* Sum up the four partial sums, and put the result into s */
|
|
|
482 sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
|
|
|
483 sumsqr = vec_splat(sumsqr, 3);
|
|
|
484 vec_ste(sumsqr, 0, &s);
|
|
|
485
|
|
|
486 return s;
|
|
|
487 }
|
|
|
488
|
|
|
489 int pix_sum_altivec(uint8_t * pix, int line_size)
|
|
|
490 {
|
|
|
491 const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
|
|
|
492 vector unsigned char perm, *pixv;
|
|
|
493 vector unsigned char t1;
|
|
|
494 vector unsigned int sad;
|
|
|
495 vector signed int sumdiffs;
|
|
|
496
|
|
|
497 int i;
|
|
|
498 int s __attribute__((aligned(16)));
|
|
|
499
|
|
|
500 sad = (vector unsigned int)vec_splat_u32(0);
|
|
|
501
|
|
|
502 for (i = 0; i < 16; i++) {
|
|
|
503 /* Read the potentially unaligned 16 pixels into t1 */
|
|
|
504 perm = vec_lvsl(0, pix);
|
|
|
505 pixv = (vector unsigned char *) pix;
|
|
|
506 t1 = vec_perm(pixv[0], pixv[1], perm);
|
|
|
507
|
|
|
508 /* Add each 4 pixel group together and put 4 results into sad */
|
|
|
509 sad = vec_sum4s(t1, sad);
|
|
|
510
|
|
|
511 pix += line_size;
|
|
|
512 }
|
|
|
513
|
|
|
514 /* Sum up the four partial sums, and put the result into s */
|
|
|
515 sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
|
|
|
516 sumdiffs = vec_splat(sumdiffs, 3);
|
|
|
517 vec_ste(sumdiffs, 0, &s);
|
|
|
518
|
|
|
519 return s;
|
|
|
520 }
|
|
|
521
|
|
|
522 void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
|
|
|
523 {
|
|
|
524 int i;
|
|
|
525 vector unsigned char perm, bytes, *pixv;
|
|
|
526 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
527 vector signed short shorts;
|
|
|
528
|
|
|
529 for(i=0;i<8;i++)
|
|
|
530 {
|
|
|
531 // Read potentially unaligned pixels.
|
|
|
532 // We're reading 16 pixels, and actually only want 8,
|
|
|
533 // but we simply ignore the extras.
|
|
|
534 perm = vec_lvsl(0, pixels);
|
|
|
535 pixv = (vector unsigned char *) pixels;
|
|
|
536 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
|
537
|
|
|
538 // convert the bytes into shorts
|
|
|
539 shorts = (vector signed short)vec_mergeh(zero, bytes);
|
|
|
540
|
|
|
541 // save the data to the block, we assume the block is 16-byte aligned
|
|
|
542 vec_st(shorts, i*16, (vector signed short*)block);
|
|
|
543
|
|
|
544 pixels += line_size;
|
|
|
545 }
|
|
|
546 }
|
|
|
547
|
|
|
548 void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
|
|
|
549 const uint8_t *s2, int stride)
|
|
|
550 {
|
|
|
551 int i;
|
|
|
552 vector unsigned char perm, bytes, *pixv;
|
|
|
553 const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
554 vector signed short shorts1, shorts2;
|
|
|
555
|
|
|
556 for(i=0;i<4;i++)
|
|
|
557 {
|
|
|
558 // Read potentially unaligned pixels
|
|
|
559 // We're reading 16 pixels, and actually only want 8,
|
|
|
560 // but we simply ignore the extras.
|
|
|
561 perm = vec_lvsl(0, s1);
|
|
|
562 pixv = (vector unsigned char *) s1;
|
|
|
563 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
|
564
|
|
|
565 // convert the bytes into shorts
|
|
|
566 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
|
|
|
567
|
|
|
568 // Do the same for the second block of pixels
|
|
|
569 perm = vec_lvsl(0, s2);
|
|
|
570 pixv = (vector unsigned char *) s2;
|
|
|
571 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
|
572
|
|
|
573 // convert the bytes into shorts
|
|
|
574 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
|
|
|
575
|
|
|
576 // Do the subtraction
|
|
|
577 shorts1 = vec_sub(shorts1, shorts2);
|
|
|
578
|
|
|
579 // save the data to the block, we assume the block is 16-byte aligned
|
|
|
580 vec_st(shorts1, 0, (vector signed short*)block);
|
|
|
581
|
|
|
582 s1 += stride;
|
|
|
583 s2 += stride;
|
|
|
584 block += 8;
|
|
|
585
|
|
|
586
|
|
|
587 // The code below is a copy of the code above... This is a manual
|
|
|
588 // unroll.
|
|
|
589
|
|
|
590 // Read potentially unaligned pixels
|
|
|
591 // We're reading 16 pixels, and actually only want 8,
|
|
|
592 // but we simply ignore the extras.
|
|
|
593 perm = vec_lvsl(0, s1);
|
|
|
594 pixv = (vector unsigned char *) s1;
|
|
|
595 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
|
596
|
|
|
597 // convert the bytes into shorts
|
|
|
598 shorts1 = (vector signed short)vec_mergeh(zero, bytes);
|
|
|
599
|
|
|
600 // Do the same for the second block of pixels
|
|
|
601 perm = vec_lvsl(0, s2);
|
|
|
602 pixv = (vector unsigned char *) s2;
|
|
|
603 bytes = vec_perm(pixv[0], pixv[1], perm);
|
|
|
604
|
|
|
605 // convert the bytes into shorts
|
|
|
606 shorts2 = (vector signed short)vec_mergeh(zero, bytes);
|
|
|
607
|
|
|
608 // Do the subtraction
|
|
|
609 shorts1 = vec_sub(shorts1, shorts2);
|
|
|
610
|
|
|
611 // save the data to the block, we assume the block is 16-byte aligned
|
|
|
612 vec_st(shorts1, 0, (vector signed short*)block);
|
|
|
613
|
|
|
614 s1 += stride;
|
|
|
615 s2 += stride;
|
|
|
616 block += 8;
|
|
|
617 }
|
|
|
618 }
|
|
|
619
|
|
|
620 void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
|
|
|
621 register int i;
|
|
|
622 register vector unsigned char vdst, vsrc;
|
|
|
623
|
|
|
624 /* dst and src are 16 bytes-aligned (guaranteed) */
|
|
|
625 for(i = 0 ; (i + 15) < w ; i+=16)
|
|
|
626 {
|
|
|
627 vdst = vec_ld(i, (unsigned char*)dst);
|
|
|
628 vsrc = vec_ld(i, (unsigned char*)src);
|
|
|
629 vdst = vec_add(vsrc, vdst);
|
|
|
630 vec_st(vdst, i, (unsigned char*)dst);
|
|
|
631 }
|
|
|
632 /* if w is not a multiple of 16 */
|
|
|
633 for (; (i < w) ; i++)
|
|
|
634 {
|
|
|
635 dst[i] = src[i];
|
|
|
636 }
|
|
|
637 }
|
|
|
638
|
|
|
639 /* next one assumes that ((line_size % 16) == 0) */
|
|
|
640 void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
|
641 {
|
|
|
642 POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
|
|
|
643 register vector unsigned char pixelsv1, pixelsv2;
|
|
|
644 register vector unsigned char pixelsv1B, pixelsv2B;
|
|
|
645 register vector unsigned char pixelsv1C, pixelsv2C;
|
|
|
646 register vector unsigned char pixelsv1D, pixelsv2D;
|
|
|
647
|
|
|
648 register vector unsigned char perm = vec_lvsl(0, pixels);
|
|
|
649 int i;
|
|
|
650 register int line_size_2 = line_size << 1;
|
|
|
651 register int line_size_3 = line_size + line_size_2;
|
|
|
652 register int line_size_4 = line_size << 2;
|
|
|
653
|
|
|
654 POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
|
|
|
655 // hand-unrolling the loop by 4 gains about 15%
|
|
|
656 // mininum execution time goes from 74 to 60 cycles
|
|
|
657 // it's faster than -funroll-loops, but using
|
|
|
658 // -funroll-loops w/ this is bad - 74 cycles again.
|
|
|
659 // all this is on a 7450, tuning for the 7450
|
|
|
660 #if 0
|
|
|
661 for(i=0; i<h; i++) {
|
|
|
662 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
|
|
|
663 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
|
|
|
664 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
|
|
|
665 0, (unsigned char*)block);
|
|
|
666 pixels+=line_size;
|
|
|
667 block +=line_size;
|
|
|
668 }
|
|
|
669 #else
|
|
|
670 for(i=0; i<h; i+=4) {
|
|
|
671 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
|
|
|
672 pixelsv2 = vec_ld(15, (unsigned char*)pixels);
|
|
|
673 pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
|
|
|
674 pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
|
|
|
675 pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
|
|
|
676 pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
|
|
|
677 pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
|
|
|
678 pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
|
|
|
679 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
|
|
|
680 0, (unsigned char*)block);
|
|
|
681 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
|
|
|
682 line_size, (unsigned char*)block);
|
|
|
683 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
|
|
|
684 line_size_2, (unsigned char*)block);
|
|
|
685 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
|
|
|
686 line_size_3, (unsigned char*)block);
|
|
|
687 pixels+=line_size_4;
|
|
|
688 block +=line_size_4;
|
|
|
689 }
|
|
|
690 #endif
|
|
|
691 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
|
|
|
692 }
|
|
|
693
|
|
|
694 /* next one assumes that ((line_size % 16) == 0) */
|
|
|
695 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
|
|
|
696 void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
|
697 {
|
|
|
698 POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
|
|
|
699 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
|
|
|
700 register vector unsigned char perm = vec_lvsl(0, pixels);
|
|
|
701 int i;
|
|
|
702
|
|
|
703 POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
|
|
|
704
|
|
|
705 for(i=0; i<h; i++) {
|
|
|
706 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
|
|
|
707 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
|
|
|
708 blockv = vec_ld(0, block);
|
|
|
709 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
|
|
|
710 blockv = vec_avg(blockv,pixelsv);
|
|
|
711 vec_st(blockv, 0, (unsigned char*)block);
|
|
|
712 pixels+=line_size;
|
|
|
713 block +=line_size;
|
|
|
714 }
|
|
|
715
|
|
|
716 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
|
|
|
717 }
|
|
|
718
|
|
|
719 /* next one assumes that ((line_size % 8) == 0) */
|
|
|
720 void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
|
721 {
|
|
|
722 POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
|
|
|
723 register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
|
|
|
724 int i;
|
|
|
725
|
|
|
726 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
|
|
|
727
|
|
|
728 for (i = 0; i < h; i++) {
|
|
|
729 /*
|
|
|
730 block is 8 bytes-aligned, so we're either in the
|
|
|
731 left block (16 bytes-aligned) or in the right block (not)
|
|
|
732 */
|
|
|
733 int rightside = ((unsigned long)block & 0x0000000F);
|
|
|
734
|
|
|
735 blockv = vec_ld(0, block);
|
|
|
736 pixelsv1 = vec_ld(0, (unsigned char*)pixels);
|
|
|
737 pixelsv2 = vec_ld(16, (unsigned char*)pixels);
|
|
|
738 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
|
|
|
739
|
|
|
740 if (rightside)
|
|
|
741 {
|
|
|
742 pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
|
|
|
743 }
|
|
|
744 else
|
|
|
745 {
|
|
|
746 pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
|
|
|
747 }
|
|
|
748
|
|
|
749 blockv = vec_avg(blockv, pixelsv);
|
|
|
750
|
|
|
751 vec_st(blockv, 0, block);
|
|
|
752
|
|
|
753 pixels += line_size;
|
|
|
754 block += line_size;
|
|
|
755 }
|
|
|
756
|
|
|
757 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
|
|
|
758 }
|
|
|
759
|
|
|
760 /* next one assumes that ((line_size % 8) == 0) */
|
|
|
761 void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
|
762 {
|
|
|
763 POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
|
|
|
764 register int i;
|
|
|
765 register vector unsigned char
|
|
|
766 pixelsv1, pixelsv2,
|
|
|
767 pixelsavg;
|
|
|
768 register vector unsigned char
|
|
|
769 blockv, temp1, temp2;
|
|
|
770 register vector unsigned short
|
|
|
771 pixelssum1, pixelssum2, temp3;
|
|
|
772 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
773 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
|
|
|
774
|
|
|
775 temp1 = vec_ld(0, pixels);
|
|
|
776 temp2 = vec_ld(16, pixels);
|
|
|
777 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
|
778 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
|
|
|
779 {
|
|
|
780 pixelsv2 = temp2;
|
|
|
781 }
|
|
|
782 else
|
|
|
783 {
|
|
|
784 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
|
785 }
|
|
|
786 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
787 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
788 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
|
789 (vector unsigned short)pixelsv2);
|
|
|
790 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
|
791
|
|
|
792 POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
|
|
|
793 for (i = 0; i < h ; i++) {
|
|
|
794 int rightside = ((unsigned long)block & 0x0000000F);
|
|
|
795 blockv = vec_ld(0, block);
|
|
|
796
|
|
|
797 temp1 = vec_ld(line_size, pixels);
|
|
|
798 temp2 = vec_ld(line_size + 16, pixels);
|
|
|
799 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
|
800 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
|
|
|
801 {
|
|
|
802 pixelsv2 = temp2;
|
|
|
803 }
|
|
|
804 else
|
|
|
805 {
|
|
|
806 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
|
807 }
|
|
|
808
|
|
|
809 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
810 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
811 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
|
812 (vector unsigned short)pixelsv2);
|
|
|
813 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
|
814 temp3 = vec_sra(temp3, vctwo);
|
|
|
815 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
|
816 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
|
817
|
|
|
818 if (rightside)
|
|
|
819 {
|
|
|
820 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
|
821 }
|
|
|
822 else
|
|
|
823 {
|
|
|
824 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
|
825 }
|
|
|
826
|
|
|
827 vec_st(blockv, 0, block);
|
|
|
828
|
|
|
829 block += line_size;
|
|
|
830 pixels += line_size;
|
|
|
831 }
|
|
|
832
|
|
|
833 POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
|
|
|
834 }
|
|
|
835
|
|
|
836 /* next one assumes that ((line_size % 8) == 0) */
|
|
|
837 void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
|
838 {
|
|
|
839 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
|
840 register int i;
|
|
|
841 register vector unsigned char
|
|
|
842 pixelsv1, pixelsv2,
|
|
|
843 pixelsavg;
|
|
|
844 register vector unsigned char
|
|
|
845 blockv, temp1, temp2;
|
|
|
846 register vector unsigned short
|
|
|
847 pixelssum1, pixelssum2, temp3;
|
|
|
848 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
849 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
|
|
|
850 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
|
|
|
851
|
|
|
852 temp1 = vec_ld(0, pixels);
|
|
|
853 temp2 = vec_ld(16, pixels);
|
|
|
854 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
|
855 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
|
|
|
856 {
|
|
|
857 pixelsv2 = temp2;
|
|
|
858 }
|
|
|
859 else
|
|
|
860 {
|
|
|
861 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
|
862 }
|
|
|
863 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
864 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
865 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
|
866 (vector unsigned short)pixelsv2);
|
|
|
867 pixelssum1 = vec_add(pixelssum1, vcone);
|
|
|
868
|
|
|
869 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
|
870 for (i = 0; i < h ; i++) {
|
|
|
871 int rightside = ((unsigned long)block & 0x0000000F);
|
|
|
872 blockv = vec_ld(0, block);
|
|
|
873
|
|
|
874 temp1 = vec_ld(line_size, pixels);
|
|
|
875 temp2 = vec_ld(line_size + 16, pixels);
|
|
|
876 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
|
877 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
|
|
|
878 {
|
|
|
879 pixelsv2 = temp2;
|
|
|
880 }
|
|
|
881 else
|
|
|
882 {
|
|
|
883 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
|
884 }
|
|
|
885
|
|
|
886 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
887 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
888 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
|
889 (vector unsigned short)pixelsv2);
|
|
|
890 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
|
891 temp3 = vec_sra(temp3, vctwo);
|
|
|
892 pixelssum1 = vec_add(pixelssum2, vcone);
|
|
|
893 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
|
894
|
|
|
895 if (rightside)
|
|
|
896 {
|
|
|
897 blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
|
898 }
|
|
|
899 else
|
|
|
900 {
|
|
|
901 blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
|
902 }
|
|
|
903
|
|
|
904 vec_st(blockv, 0, block);
|
|
|
905
|
|
|
906 block += line_size;
|
|
|
907 pixels += line_size;
|
|
|
908 }
|
|
|
909
|
|
|
910 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
|
|
|
911 }
|
|
|
912
|
|
|
913 /* next one assumes that ((line_size % 16) == 0) */
|
|
|
914 void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
|
915 {
|
|
|
916 POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
|
|
|
917 register int i;
|
|
|
918 register vector unsigned char
|
|
|
919 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
|
|
|
920 register vector unsigned char
|
|
|
921 blockv, temp1, temp2;
|
|
|
922 register vector unsigned short
|
|
|
923 pixelssum1, pixelssum2, temp3,
|
|
|
924 pixelssum3, pixelssum4, temp4;
|
|
|
925 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
926 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
|
|
|
927
|
|
|
928 POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
|
|
|
929
|
|
|
930 temp1 = vec_ld(0, pixels);
|
|
|
931 temp2 = vec_ld(16, pixels);
|
|
|
932 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
|
933 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
|
|
|
934 {
|
|
|
935 pixelsv2 = temp2;
|
|
|
936 }
|
|
|
937 else
|
|
|
938 {
|
|
|
939 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
|
940 }
|
|
|
941 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
|
942 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
|
943 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
944 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
945 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
|
|
|
946 (vector unsigned short)pixelsv4);
|
|
|
947 pixelssum3 = vec_add(pixelssum3, vctwo);
|
|
|
948 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
|
949 (vector unsigned short)pixelsv2);
|
|
|
950 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
|
951
|
|
|
952 for (i = 0; i < h ; i++) {
|
|
|
953 blockv = vec_ld(0, block);
|
|
|
954
|
|
|
955 temp1 = vec_ld(line_size, pixels);
|
|
|
956 temp2 = vec_ld(line_size + 16, pixels);
|
|
|
957 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
|
958 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
|
|
|
959 {
|
|
|
960 pixelsv2 = temp2;
|
|
|
961 }
|
|
|
962 else
|
|
|
963 {
|
|
|
964 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
|
965 }
|
|
|
966
|
|
|
967 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
|
968 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
|
969 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
970 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
971
|
|
|
972 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
|
|
|
973 (vector unsigned short)pixelsv4);
|
|
|
974 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
|
975 (vector unsigned short)pixelsv2);
|
|
|
976 temp4 = vec_add(pixelssum3, pixelssum4);
|
|
|
977 temp4 = vec_sra(temp4, vctwo);
|
|
|
978 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
|
979 temp3 = vec_sra(temp3, vctwo);
|
|
|
980
|
|
|
981 pixelssum3 = vec_add(pixelssum4, vctwo);
|
|
|
982 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
|
983
|
|
|
984 blockv = vec_packsu(temp3, temp4);
|
|
|
985
|
|
|
986 vec_st(blockv, 0, block);
|
|
|
987
|
|
|
988 block += line_size;
|
|
|
989 pixels += line_size;
|
|
|
990 }
|
|
|
991
|
|
|
992 POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
|
|
|
993 }
|
|
|
994
|
|
|
995 /* next one assumes that ((line_size % 16) == 0) */
|
|
|
996 void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
|
|
|
997 {
|
|
|
998 POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
|
999 register int i;
|
|
|
1000 register vector unsigned char
|
|
|
1001 pixelsv1, pixelsv2, pixelsv3, pixelsv4;
|
|
|
1002 register vector unsigned char
|
|
|
1003 blockv, temp1, temp2;
|
|
|
1004 register vector unsigned short
|
|
|
1005 pixelssum1, pixelssum2, temp3,
|
|
|
1006 pixelssum3, pixelssum4, temp4;
|
|
|
1007 register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
|
|
|
1008 register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
|
|
|
1009 register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
|
|
|
1010
|
|
|
1011 POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
|
1012
|
|
|
1013 temp1 = vec_ld(0, pixels);
|
|
|
1014 temp2 = vec_ld(16, pixels);
|
|
|
1015 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
|
1016 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
|
|
|
1017 {
|
|
|
1018 pixelsv2 = temp2;
|
|
|
1019 }
|
|
|
1020 else
|
|
|
1021 {
|
|
|
1022 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
|
1023 }
|
|
|
1024 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
|
1025 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
|
1026 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
1027 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
1028 pixelssum3 = vec_add((vector unsigned short)pixelsv3,
|
|
|
1029 (vector unsigned short)pixelsv4);
|
|
|
1030 pixelssum3 = vec_add(pixelssum3, vcone);
|
|
|
1031 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
|
1032 (vector unsigned short)pixelsv2);
|
|
|
1033 pixelssum1 = vec_add(pixelssum1, vcone);
|
|
|
1034
|
|
|
1035 for (i = 0; i < h ; i++) {
|
|
|
1036 blockv = vec_ld(0, block);
|
|
|
1037
|
|
|
1038 temp1 = vec_ld(line_size, pixels);
|
|
|
1039 temp2 = vec_ld(line_size + 16, pixels);
|
|
|
1040 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
|
1041 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
|
|
|
1042 {
|
|
|
1043 pixelsv2 = temp2;
|
|
|
1044 }
|
|
|
1045 else
|
|
|
1046 {
|
|
|
1047 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
|
1048 }
|
|
|
1049
|
|
|
1050 pixelsv3 = vec_mergel(vczero, pixelsv1);
|
|
|
1051 pixelsv4 = vec_mergel(vczero, pixelsv2);
|
|
|
1052 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
1053 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
1054
|
|
|
1055 pixelssum4 = vec_add((vector unsigned short)pixelsv3,
|
|
|
1056 (vector unsigned short)pixelsv4);
|
|
|
1057 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
|
1058 (vector unsigned short)pixelsv2);
|
|
|
1059 temp4 = vec_add(pixelssum3, pixelssum4);
|
|
|
1060 temp4 = vec_sra(temp4, vctwo);
|
|
|
1061 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
|
1062 temp3 = vec_sra(temp3, vctwo);
|
|
|
1063
|
|
|
1064 pixelssum3 = vec_add(pixelssum4, vcone);
|
|
|
1065 pixelssum1 = vec_add(pixelssum2, vcone);
|
|
|
1066
|
|
|
1067 blockv = vec_packsu(temp3, temp4);
|
|
|
1068
|
|
|
1069 vec_st(blockv, 0, block);
|
|
|
1070
|
|
|
1071 block += line_size;
|
|
|
1072 pixels += line_size;
|
|
|
1073 }
|
|
|
1074
|
|
|
1075 POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
|
|
|
1076 }
|
|
|
1077
|
|
|
1078 int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
|
|
|
1079 POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
|
|
|
1080 int sum;
|
|
|
1081 register const_vector unsigned char vzero =
|
|
|
1082 (const_vector unsigned char)vec_splat_u8(0);
|
|
|
1083 register vector signed short temp0, temp1, temp2, temp3, temp4,
|
|
|
1084 temp5, temp6, temp7;
|
|
|
1085 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
|
|
|
1086 {
|
|
|
1087 register const_vector signed short vprod1 =(const_vector signed short)
|
|
|
1088 AVV( 1,-1, 1,-1, 1,-1, 1,-1);
|
|
|
1089 register const_vector signed short vprod2 =(const_vector signed short)
|
|
|
1090 AVV( 1, 1,-1,-1, 1, 1,-1,-1);
|
|
|
1091 register const_vector signed short vprod3 =(const_vector signed short)
|
|
|
1092 AVV( 1, 1, 1, 1,-1,-1,-1,-1);
|
|
|
1093 register const_vector unsigned char perm1 = (const_vector unsigned char)
|
|
|
1094 AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
|
|
|
1095 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
|
|
|
1096 register const_vector unsigned char perm2 = (const_vector unsigned char)
|
|
|
1097 AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
|
|
|
1098 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
|
|
|
1099 register const_vector unsigned char perm3 = (const_vector unsigned char)
|
|
|
1100 AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
|
|
1101 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
|
|
|
1102
|
|
|
1103 #define ONEITERBUTTERFLY(i, res) \
|
|
|
1104 { \
|
|
|
1105 register vector unsigned char src1, src2, srcO; \
|
|
|
1106 register vector unsigned char dst1, dst2, dstO; \
|
|
|
1107 register vector signed short srcV, dstV; \
|
|
|
1108 register vector signed short but0, but1, but2, op1, op2, op3; \
|
|
|
1109 src1 = vec_ld(stride * i, src); \
|
|
|
1110 if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
|
|
|
1111 src2 = vec_ld((stride * i) + 16, src); \
|
|
|
1112 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
|
|
|
1113 dst1 = vec_ld(stride * i, dst); \
|
|
|
1114 if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
|
|
|
1115 dst2 = vec_ld((stride * i) + 16, dst); \
|
|
|
1116 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
|
|
|
1117 /* promote the unsigned chars to signed shorts */ \
|
|
|
1118 /* we're in the 8x8 function, we only care for the first 8 */ \
|
|
|
1119 srcV = \
|
|
|
1120 (vector signed short)vec_mergeh((vector signed char)vzero, \
|
|
|
1121 (vector signed char)srcO); \
|
|
|
1122 dstV = \
|
|
|
1123 (vector signed short)vec_mergeh((vector signed char)vzero, \
|
|
|
1124 (vector signed char)dstO); \
|
|
|
1125 /* substractions inside the first butterfly */ \
|
|
|
1126 but0 = vec_sub(srcV, dstV); \
|
|
|
1127 op1 = vec_perm(but0, but0, perm1); \
|
|
|
1128 but1 = vec_mladd(but0, vprod1, op1); \
|
|
|
1129 op2 = vec_perm(but1, but1, perm2); \
|
|
|
1130 but2 = vec_mladd(but1, vprod2, op2); \
|
|
|
1131 op3 = vec_perm(but2, but2, perm3); \
|
|
|
1132 res = vec_mladd(but2, vprod3, op3); \
|
|
|
1133 }
|
|
|
1134 ONEITERBUTTERFLY(0, temp0);
|
|
|
1135 ONEITERBUTTERFLY(1, temp1);
|
|
|
1136 ONEITERBUTTERFLY(2, temp2);
|
|
|
1137 ONEITERBUTTERFLY(3, temp3);
|
|
|
1138 ONEITERBUTTERFLY(4, temp4);
|
|
|
1139 ONEITERBUTTERFLY(5, temp5);
|
|
|
1140 ONEITERBUTTERFLY(6, temp6);
|
|
|
1141 ONEITERBUTTERFLY(7, temp7);
|
|
|
1142 }
|
|
|
1143 #undef ONEITERBUTTERFLY
|
|
|
1144 {
|
|
|
1145 register vector signed int vsum;
|
|
|
1146 register vector signed short line0 = vec_add(temp0, temp1);
|
|
|
1147 register vector signed short line1 = vec_sub(temp0, temp1);
|
|
|
1148 register vector signed short line2 = vec_add(temp2, temp3);
|
|
|
1149 register vector signed short line3 = vec_sub(temp2, temp3);
|
|
|
1150 register vector signed short line4 = vec_add(temp4, temp5);
|
|
|
1151 register vector signed short line5 = vec_sub(temp4, temp5);
|
|
|
1152 register vector signed short line6 = vec_add(temp6, temp7);
|
|
|
1153 register vector signed short line7 = vec_sub(temp6, temp7);
|
|
|
1154
|
|
|
1155 register vector signed short line0B = vec_add(line0, line2);
|
|
|
1156 register vector signed short line2B = vec_sub(line0, line2);
|
|
|
1157 register vector signed short line1B = vec_add(line1, line3);
|
|
|
1158 register vector signed short line3B = vec_sub(line1, line3);
|
|
|
1159 register vector signed short line4B = vec_add(line4, line6);
|
|
|
1160 register vector signed short line6B = vec_sub(line4, line6);
|
|
|
1161 register vector signed short line5B = vec_add(line5, line7);
|
|
|
1162 register vector signed short line7B = vec_sub(line5, line7);
|
|
|
1163
|
|
|
1164 register vector signed short line0C = vec_add(line0B, line4B);
|
|
|
1165 register vector signed short line4C = vec_sub(line0B, line4B);
|
|
|
1166 register vector signed short line1C = vec_add(line1B, line5B);
|
|
|
1167 register vector signed short line5C = vec_sub(line1B, line5B);
|
|
|
1168 register vector signed short line2C = vec_add(line2B, line6B);
|
|
|
1169 register vector signed short line6C = vec_sub(line2B, line6B);
|
|
|
1170 register vector signed short line3C = vec_add(line3B, line7B);
|
|
|
1171 register vector signed short line7C = vec_sub(line3B, line7B);
|
|
|
1172
|
|
|
1173 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
|
|
|
1174 vsum = vec_sum4s(vec_abs(line1C), vsum);
|
|
|
1175 vsum = vec_sum4s(vec_abs(line2C), vsum);
|
|
|
1176 vsum = vec_sum4s(vec_abs(line3C), vsum);
|
|
|
1177 vsum = vec_sum4s(vec_abs(line4C), vsum);
|
|
|
1178 vsum = vec_sum4s(vec_abs(line5C), vsum);
|
|
|
1179 vsum = vec_sum4s(vec_abs(line6C), vsum);
|
|
|
1180 vsum = vec_sum4s(vec_abs(line7C), vsum);
|
|
|
1181 vsum = vec_sums(vsum, (vector signed int)vzero);
|
|
|
1182 vsum = vec_splat(vsum, 3);
|
|
|
1183 vec_ste(vsum, 0, &sum);
|
|
|
1184 }
|
|
|
1185 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
|
|
|
1186 return sum;
|
|
|
1187 }
|
|
|
1188
|
|
|
1189 /*
|
|
|
1190 16x8 works with 16 elements ; it allows to avoid replicating
|
|
|
1191 loads, and give the compiler more rooms for scheduling.
|
|
|
1192 It's only used from inside hadamard8_diff16_altivec.
|
|
|
1193
|
|
|
1194 Unfortunately, it seems gcc-3.3 is a bit dumb, and
|
|
|
1195 the compiled code has a LOT of spill code, it seems
|
|
|
1196 gcc (unlike xlc) cannot keep everything in registers
|
|
|
1197 by itself. The following code include hand-made
|
|
|
1198 registers allocation. It's not clean, but on
|
|
|
1199 a 7450 the resulting code is much faster (best case
|
|
|
1200 fall from 700+ cycles to 550).
|
|
|
1201
|
|
|
1202 xlc doesn't add spill code, but it doesn't know how to
|
|
|
1203 schedule for the 7450, and its code isn't much faster than
|
|
|
1204 gcc-3.3 on the 7450 (but uses 25% less instructions...)
|
|
|
1205
|
|
|
1206 On the 970, the hand-made RA is still a win (arount 690
|
|
|
1207 vs. around 780), but xlc goes to around 660 on the
|
|
|
1208 regular C code...
|
|
|
1209 */
|
|
|
1210
|
|
|
1211 static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
|
|
|
1212 int sum;
|
|
|
1213 register vector signed short
|
|
|
1214 temp0 REG_v(v0),
|
|
|
1215 temp1 REG_v(v1),
|
|
|
1216 temp2 REG_v(v2),
|
|
|
1217 temp3 REG_v(v3),
|
|
|
1218 temp4 REG_v(v4),
|
|
|
1219 temp5 REG_v(v5),
|
|
|
1220 temp6 REG_v(v6),
|
|
|
1221 temp7 REG_v(v7);
|
|
|
1222 register vector signed short
|
|
|
1223 temp0S REG_v(v8),
|
|
|
1224 temp1S REG_v(v9),
|
|
|
1225 temp2S REG_v(v10),
|
|
|
1226 temp3S REG_v(v11),
|
|
|
1227 temp4S REG_v(v12),
|
|
|
1228 temp5S REG_v(v13),
|
|
|
1229 temp6S REG_v(v14),
|
|
|
1230 temp7S REG_v(v15);
|
|
|
1231 register const_vector unsigned char vzero REG_v(v31)=
|
|
|
1232 (const_vector unsigned char)vec_splat_u8(0);
|
|
|
1233 {
|
|
|
1234 register const_vector signed short vprod1 REG_v(v16)=
|
|
|
1235 (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
|
|
|
1236 register const_vector signed short vprod2 REG_v(v17)=
|
|
|
1237 (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
|
|
|
1238 register const_vector signed short vprod3 REG_v(v18)=
|
|
|
1239 (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
|
|
|
1240 register const_vector unsigned char perm1 REG_v(v19)=
|
|
|
1241 (const_vector unsigned char)
|
|
|
1242 AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
|
|
|
1243 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
|
|
|
1244 register const_vector unsigned char perm2 REG_v(v20)=
|
|
|
1245 (const_vector unsigned char)
|
|
|
1246 AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
|
|
|
1247 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
|
|
|
1248 register const_vector unsigned char perm3 REG_v(v21)=
|
|
|
1249 (const_vector unsigned char)
|
|
|
1250 AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
|
|
|
1251 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
|
|
|
1252
|
|
|
1253 #define ONEITERBUTTERFLY(i, res1, res2) \
|
|
|
1254 { \
|
|
|
1255 register vector unsigned char src1 REG_v(v22), \
|
|
|
1256 src2 REG_v(v23), \
|
|
|
1257 dst1 REG_v(v24), \
|
|
|
1258 dst2 REG_v(v25), \
|
|
|
1259 srcO REG_v(v22), \
|
|
|
1260 dstO REG_v(v23); \
|
|
|
1261 \
|
|
|
1262 register vector signed short srcV REG_v(v24), \
|
|
|
1263 dstV REG_v(v25), \
|
|
|
1264 srcW REG_v(v26), \
|
|
|
1265 dstW REG_v(v27), \
|
|
|
1266 but0 REG_v(v28), \
|
|
|
1267 but0S REG_v(v29), \
|
|
|
1268 op1 REG_v(v30), \
|
|
|
1269 but1 REG_v(v22), \
|
|
|
1270 op1S REG_v(v23), \
|
|
|
1271 but1S REG_v(v24), \
|
|
|
1272 op2 REG_v(v25), \
|
|
|
1273 but2 REG_v(v26), \
|
|
|
1274 op2S REG_v(v27), \
|
|
|
1275 but2S REG_v(v28), \
|
|
|
1276 op3 REG_v(v29), \
|
|
|
1277 op3S REG_v(v30); \
|
|
|
1278 \
|
|
|
1279 src1 = vec_ld(stride * i, src); \
|
|
|
1280 src2 = vec_ld((stride * i) + 16, src); \
|
|
|
1281 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
|
|
|
1282 dst1 = vec_ld(stride * i, dst); \
|
|
|
1283 dst2 = vec_ld((stride * i) + 16, dst); \
|
|
|
1284 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
|
|
|
1285 /* promote the unsigned chars to signed shorts */ \
|
|
|
1286 srcV = \
|
|
|
1287 (vector signed short)vec_mergeh((vector signed char)vzero, \
|
|
|
1288 (vector signed char)srcO); \
|
|
|
1289 dstV = \
|
|
|
1290 (vector signed short)vec_mergeh((vector signed char)vzero, \
|
|
|
1291 (vector signed char)dstO); \
|
|
|
1292 srcW = \
|
|
|
1293 (vector signed short)vec_mergel((vector signed char)vzero, \
|
|
|
1294 (vector signed char)srcO); \
|
|
|
1295 dstW = \
|
|
|
1296 (vector signed short)vec_mergel((vector signed char)vzero, \
|
|
|
1297 (vector signed char)dstO); \
|
|
|
1298 /* substractions inside the first butterfly */ \
|
|
|
1299 but0 = vec_sub(srcV, dstV); \
|
|
|
1300 but0S = vec_sub(srcW, dstW); \
|
|
|
1301 op1 = vec_perm(but0, but0, perm1); \
|
|
|
1302 but1 = vec_mladd(but0, vprod1, op1); \
|
|
|
1303 op1S = vec_perm(but0S, but0S, perm1); \
|
|
|
1304 but1S = vec_mladd(but0S, vprod1, op1S); \
|
|
|
1305 op2 = vec_perm(but1, but1, perm2); \
|
|
|
1306 but2 = vec_mladd(but1, vprod2, op2); \
|
|
|
1307 op2S = vec_perm(but1S, but1S, perm2); \
|
|
|
1308 but2S = vec_mladd(but1S, vprod2, op2S); \
|
|
|
1309 op3 = vec_perm(but2, but2, perm3); \
|
|
|
1310 res1 = vec_mladd(but2, vprod3, op3); \
|
|
|
1311 op3S = vec_perm(but2S, but2S, perm3); \
|
|
|
1312 res2 = vec_mladd(but2S, vprod3, op3S); \
|
|
|
1313 }
|
|
|
1314 ONEITERBUTTERFLY(0, temp0, temp0S);
|
|
|
1315 ONEITERBUTTERFLY(1, temp1, temp1S);
|
|
|
1316 ONEITERBUTTERFLY(2, temp2, temp2S);
|
|
|
1317 ONEITERBUTTERFLY(3, temp3, temp3S);
|
|
|
1318 ONEITERBUTTERFLY(4, temp4, temp4S);
|
|
|
1319 ONEITERBUTTERFLY(5, temp5, temp5S);
|
|
|
1320 ONEITERBUTTERFLY(6, temp6, temp6S);
|
|
|
1321 ONEITERBUTTERFLY(7, temp7, temp7S);
|
|
|
1322 }
|
|
|
1323 #undef ONEITERBUTTERFLY
|
|
|
1324 {
|
|
|
1325 register vector signed int vsum;
|
|
|
1326 register vector signed short line0S, line1S, line2S, line3S, line4S,
|
|
|
1327 line5S, line6S, line7S, line0BS,line2BS,
|
|
|
1328 line1BS,line3BS,line4BS,line6BS,line5BS,
|
|
|
1329 line7BS,line0CS,line4CS,line1CS,line5CS,
|
|
|
1330 line2CS,line6CS,line3CS,line7CS;
|
|
|
1331
|
|
|
1332 register vector signed short line0 = vec_add(temp0, temp1);
|
|
|
1333 register vector signed short line1 = vec_sub(temp0, temp1);
|
|
|
1334 register vector signed short line2 = vec_add(temp2, temp3);
|
|
|
1335 register vector signed short line3 = vec_sub(temp2, temp3);
|
|
|
1336 register vector signed short line4 = vec_add(temp4, temp5);
|
|
|
1337 register vector signed short line5 = vec_sub(temp4, temp5);
|
|
|
1338 register vector signed short line6 = vec_add(temp6, temp7);
|
|
|
1339 register vector signed short line7 = vec_sub(temp6, temp7);
|
|
|
1340
|
|
|
1341 register vector signed short line0B = vec_add(line0, line2);
|
|
|
1342 register vector signed short line2B = vec_sub(line0, line2);
|
|
|
1343 register vector signed short line1B = vec_add(line1, line3);
|
|
|
1344 register vector signed short line3B = vec_sub(line1, line3);
|
|
|
1345 register vector signed short line4B = vec_add(line4, line6);
|
|
|
1346 register vector signed short line6B = vec_sub(line4, line6);
|
|
|
1347 register vector signed short line5B = vec_add(line5, line7);
|
|
|
1348 register vector signed short line7B = vec_sub(line5, line7);
|
|
|
1349
|
|
|
1350 register vector signed short line0C = vec_add(line0B, line4B);
|
|
|
1351 register vector signed short line4C = vec_sub(line0B, line4B);
|
|
|
1352 register vector signed short line1C = vec_add(line1B, line5B);
|
|
|
1353 register vector signed short line5C = vec_sub(line1B, line5B);
|
|
|
1354 register vector signed short line2C = vec_add(line2B, line6B);
|
|
|
1355 register vector signed short line6C = vec_sub(line2B, line6B);
|
|
|
1356 register vector signed short line3C = vec_add(line3B, line7B);
|
|
|
1357 register vector signed short line7C = vec_sub(line3B, line7B);
|
|
|
1358
|
|
|
1359 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
|
|
|
1360 vsum = vec_sum4s(vec_abs(line1C), vsum);
|
|
|
1361 vsum = vec_sum4s(vec_abs(line2C), vsum);
|
|
|
1362 vsum = vec_sum4s(vec_abs(line3C), vsum);
|
|
|
1363 vsum = vec_sum4s(vec_abs(line4C), vsum);
|
|
|
1364 vsum = vec_sum4s(vec_abs(line5C), vsum);
|
|
|
1365 vsum = vec_sum4s(vec_abs(line6C), vsum);
|
|
|
1366 vsum = vec_sum4s(vec_abs(line7C), vsum);
|
|
|
1367
|
|
|
1368 line0S = vec_add(temp0S, temp1S);
|
|
|
1369 line1S = vec_sub(temp0S, temp1S);
|
|
|
1370 line2S = vec_add(temp2S, temp3S);
|
|
|
1371 line3S = vec_sub(temp2S, temp3S);
|
|
|
1372 line4S = vec_add(temp4S, temp5S);
|
|
|
1373 line5S = vec_sub(temp4S, temp5S);
|
|
|
1374 line6S = vec_add(temp6S, temp7S);
|
|
|
1375 line7S = vec_sub(temp6S, temp7S);
|
|
|
1376
|
|
|
1377 line0BS = vec_add(line0S, line2S);
|
|
|
1378 line2BS = vec_sub(line0S, line2S);
|
|
|
1379 line1BS = vec_add(line1S, line3S);
|
|
|
1380 line3BS = vec_sub(line1S, line3S);
|
|
|
1381 line4BS = vec_add(line4S, line6S);
|
|
|
1382 line6BS = vec_sub(line4S, line6S);
|
|
|
1383 line5BS = vec_add(line5S, line7S);
|
|
|
1384 line7BS = vec_sub(line5S, line7S);
|
|
|
1385
|
|
|
1386 line0CS = vec_add(line0BS, line4BS);
|
|
|
1387 line4CS = vec_sub(line0BS, line4BS);
|
|
|
1388 line1CS = vec_add(line1BS, line5BS);
|
|
|
1389 line5CS = vec_sub(line1BS, line5BS);
|
|
|
1390 line2CS = vec_add(line2BS, line6BS);
|
|
|
1391 line6CS = vec_sub(line2BS, line6BS);
|
|
|
1392 line3CS = vec_add(line3BS, line7BS);
|
|
|
1393 line7CS = vec_sub(line3BS, line7BS);
|
|
|
1394
|
|
|
1395 vsum = vec_sum4s(vec_abs(line0CS), vsum);
|
|
|
1396 vsum = vec_sum4s(vec_abs(line1CS), vsum);
|
|
|
1397 vsum = vec_sum4s(vec_abs(line2CS), vsum);
|
|
|
1398 vsum = vec_sum4s(vec_abs(line3CS), vsum);
|
|
|
1399 vsum = vec_sum4s(vec_abs(line4CS), vsum);
|
|
|
1400 vsum = vec_sum4s(vec_abs(line5CS), vsum);
|
|
|
1401 vsum = vec_sum4s(vec_abs(line6CS), vsum);
|
|
|
1402 vsum = vec_sum4s(vec_abs(line7CS), vsum);
|
|
|
1403 vsum = vec_sums(vsum, (vector signed int)vzero);
|
|
|
1404 vsum = vec_splat(vsum, 3);
|
|
|
1405 vec_ste(vsum, 0, &sum);
|
|
|
1406 }
|
|
|
1407 return sum;
|
|
|
1408 }
|
|
|
1409
|
|
|
1410 int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
|
|
|
1411 POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
|
|
|
1412 int score;
|
|
|
1413 POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
|
|
|
1414 score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
|
|
|
1415 if (h==16) {
|
|
|
1416 dst += 8*stride;
|
|
|
1417 src += 8*stride;
|
|
|
1418 score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
|
|
|
1419 }
|
|
|
1420 POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
|
|
|
1421 return score;
|
|
|
1422 }
|
|
|
1423
|
|
|
1424 int has_altivec(void)
|
|
|
1425 {
|
|
|
1426 #ifdef __AMIGAOS4__
|
|
|
1427 ULONG result = 0;
|
|
|
1428 extern struct ExecIFace *IExec;
|
|
|
1429
|
|
|
1430 IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
|
|
|
1431 if (result == VECTORTYPE_ALTIVEC) return 1;
|
|
|
1432 return 0;
|
|
|
1433 #else /* __AMIGAOS4__ */
|
|
|
1434
|
|
|
1435 #ifdef CONFIG_DARWIN
|
|
|
1436 int sels[2] = {CTL_HW, HW_VECTORUNIT};
|
|
|
1437 int has_vu = 0;
|
|
|
1438 size_t len = sizeof(has_vu);
|
|
|
1439 int err;
|
|
|
1440
|
|
|
1441 err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
|
|
|
1442
|
|
|
1443 if (err == 0) return (has_vu != 0);
|
|
|
1444 #else /* CONFIG_DARWIN */
|
|
|
1445 /* no Darwin, do it the brute-force way */
|
|
|
1446 /* this is borrowed from the libmpeg2 library */
|
|
|
1447 {
|
|
|
1448 signal (SIGILL, sigill_handler);
|
|
|
1449 if (sigsetjmp (jmpbuf, 1)) {
|
|
|
1450 signal (SIGILL, SIG_DFL);
|
|
|
1451 } else {
|
|
|
1452 canjump = 1;
|
|
|
1453
|
|
|
1454 asm volatile ("mtspr 256, %0\n\t"
|
|
|
1455 "vand %%v0, %%v0, %%v0"
|
|
|
1456 :
|
|
|
1457 : "r" (-1));
|
|
|
1458
|
|
|
1459 signal (SIGILL, SIG_DFL);
|
|
|
1460 return 1;
|
|
|
1461 }
|
|
|
1462 }
|
|
|
1463 #endif /* CONFIG_DARWIN */
|
|
|
1464 return 0;
|
|
|
1465 #endif /* __AMIGAOS4__ */
|
|
|
1466 }
|
|
|
1467
|
|
|
1468 static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
|
|
|
1469 int blocksize)
|
|
|
1470 {
|
|
|
1471 int i;
|
|
|
1472 vector float m, a;
|
|
|
1473 vector bool int t0, t1;
|
|
|
1474 const vector unsigned int v_31 = //XXX
|
|
|
1475 vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
|
|
|
1476 for(i=0; i<blocksize; i+=4) {
|
|
|
1477 m = vec_ld(0, mag+i);
|
|
|
1478 a = vec_ld(0, ang+i);
|
|
|
1479 t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
|
|
|
1480 t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
|
|
|
1481 a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
|
|
|
1482 t0 = (vector bool int)vec_and(a, t1);
|
|
|
1483 t1 = (vector bool int)vec_andc(a, t1);
|
|
|
1484 a = vec_sub(m, (vector float)t1);
|
|
|
1485 m = vec_add(m, (vector float)t0);
|
|
|
1486 vec_stl(a, 0, ang+i);
|
|
|
1487 vec_stl(m, 0, mag+i);
|
|
|
1488 }
|
|
|
1489 }
|
|
|
1490
|
|
|
1491 /* next one assumes that ((line_size % 8) == 0) */
|
|
|
1492 void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
|
|
|
1493 {
|
|
|
1494 POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
|
|
|
1495 register int i;
|
|
|
1496 register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
|
|
|
1497 register vector unsigned char blockv, temp1, temp2, blocktemp;
|
|
|
1498 register vector unsigned short pixelssum1, pixelssum2, temp3;
|
|
|
1499
|
|
|
1500 register const_vector unsigned char vczero = (const_vector unsigned char)
|
|
|
1501 vec_splat_u8(0);
|
|
|
1502 register const_vector unsigned short vctwo = (const_vector unsigned short)
|
|
|
1503 vec_splat_u16(2);
|
|
|
1504
|
|
|
1505 temp1 = vec_ld(0, pixels);
|
|
|
1506 temp2 = vec_ld(16, pixels);
|
|
|
1507 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
|
|
|
1508 if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
|
|
|
1509 pixelsv2 = temp2;
|
|
|
1510 } else {
|
|
|
1511 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
|
|
|
1512 }
|
|
|
1513 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
1514 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
1515 pixelssum1 = vec_add((vector unsigned short)pixelsv1,
|
|
|
1516 (vector unsigned short)pixelsv2);
|
|
|
1517 pixelssum1 = vec_add(pixelssum1, vctwo);
|
|
|
1518
|
|
|
1519 POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
|
|
|
1520 for (i = 0; i < h ; i++) {
|
|
|
1521 int rightside = ((unsigned long)block & 0x0000000F);
|
|
|
1522 blockv = vec_ld(0, block);
|
|
|
1523
|
|
|
1524 temp1 = vec_ld(line_size, pixels);
|
|
|
1525 temp2 = vec_ld(line_size + 16, pixels);
|
|
|
1526 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
|
|
|
1527 if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
|
|
|
1528 {
|
|
|
1529 pixelsv2 = temp2;
|
|
|
1530 } else {
|
|
|
1531 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
|
|
|
1532 }
|
|
|
1533
|
|
|
1534 pixelsv1 = vec_mergeh(vczero, pixelsv1);
|
|
|
1535 pixelsv2 = vec_mergeh(vczero, pixelsv2);
|
|
|
1536 pixelssum2 = vec_add((vector unsigned short)pixelsv1,
|
|
|
1537 (vector unsigned short)pixelsv2);
|
|
|
1538 temp3 = vec_add(pixelssum1, pixelssum2);
|
|
|
1539 temp3 = vec_sra(temp3, vctwo);
|
|
|
1540 pixelssum1 = vec_add(pixelssum2, vctwo);
|
|
|
1541 pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
|
|
|
1542
|
|
|
1543 if (rightside) {
|
|
|
1544 blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
|
|
|
1545 } else {
|
|
|
1546 blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
|
|
|
1547 }
|
|
|
1548
|
|
|
1549 blockv = vec_avg(blocktemp, blockv);
|
|
|
1550 vec_st(blockv, 0, block);
|
|
|
1551
|
|
|
1552 block += line_size;
|
|
|
1553 pixels += line_size;
|
|
|
1554 }
|
|
|
1555
|
|
|
1556 POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
|
|
|
1557 }
|
|
|
1558
|
|
|
1559 void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
|
|
|
1560 {
|
|
|
1561 c->pix_abs[0][1] = sad16_x2_altivec;
|
|
|
1562 c->pix_abs[0][2] = sad16_y2_altivec;
|
|
|
1563 c->pix_abs[0][3] = sad16_xy2_altivec;
|
|
|
1564 c->pix_abs[0][0] = sad16_altivec;
|
|
|
1565 c->pix_abs[1][0] = sad8_altivec;
|
|
|
1566 c->sad[0]= sad16_altivec;
|
|
|
1567 c->sad[1]= sad8_altivec;
|
|
|
1568 c->pix_norm1 = pix_norm1_altivec;
|
|
|
1569 c->sse[1]= sse8_altivec;
|
|
|
1570 c->sse[0]= sse16_altivec;
|
|
|
1571 c->pix_sum = pix_sum_altivec;
|
|
|
1572 c->diff_pixels = diff_pixels_altivec;
|
|
|
1573 c->get_pixels = get_pixels_altivec;
|
|
|
1574 c->add_bytes= add_bytes_altivec;
|
|
|
1575 c->put_pixels_tab[0][0] = put_pixels16_altivec;
|
|
|
1576 /* the two functions do the same thing, so use the same code */
|
|
|
1577 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
|
|
|
1578 c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
|
|
|
1579 c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
|
|
|
1580 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
|
|
|
1581 c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
|
|
|
1582 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
|
|
|
1583 c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
|
|
|
1584 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
|
|
|
1585
|
|
|
1586 c->hadamard8_diff[0] = hadamard8_diff16_altivec;
|
|
|
1587 c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
|
|
|
1588 #ifdef CONFIG_VORBIS_DECODER
|
|
|
1589 c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
|
|
|
1590 #endif
|
|
|
1591 }
|