/*
 * dsputil_vis.c
 * Copyright (C) 2003 David S. Miller <davem@redhat.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The *no_round* functions have been added by James A. Morrison, 2003,2004.
   The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison.
 */

#include "config.h"

#ifdef ARCH_SPARC

#include <inttypes.h>
#include <signal.h>
#include <setjmp.h>

#include "../dsputil.h"

#include "vis.h"

/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y+1)>>1 == (x|y)-((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16. So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 *      fxor      f0,  f2,  f10
 *      fand      f10, f4,  f10
 *      fmul8x16  f8,  f10, f10
 *      fand      f10, f6,  f10
 *      for       f0,  f2,  f12
 *      fpsub16   f12, f10, f10
 */
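
/* A quick stand-alone way to convince yourself of the identity above is to
 * test it over all byte pairs. The sketch below is illustrative only and is
 * not compiled as part of this file; it mirrors the mask-before-shift step
 * used by the VIS sequence so no carry can leak between byte lanes:
 *
 *     #include <stdio.h>
 *     #include <stdint.h>
 *
 *     int main(void)
 *     {
 *         for (int x = 0; x < 256; x++) {
 *             for (int y = 0; y < 256; y++) {
 *                 uint8_t avg   = (x + y + 1) >> 1;
 *                 uint8_t trick = (x | y) - (((x ^ y) & 0xfe) >> 1);
 *                 if (avg != trick) {
 *                     printf("mismatch at x=%d y=%d\n", x, y);
 *                     return 1;
 *                 }
 *             }
 *         }
 *         printf("identity holds for all byte pairs\n");
 *         return 0;
 *     }
 */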

#define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd)))

#define DUP4(x) {x, x, x, x}
#define DUP8(x) {x, x, x, x, x, x, x, x}
static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1);
static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2);
static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3);
static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6);
static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe);
static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f);
static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128);
static const int16_t constants256_512[] ATTR_ALIGN(8) =
        {256, 512, 256, 512};
static const int16_t constants256_1024[] ATTR_ALIGN(8) =
        {256, 1024, 256, 1024};

#define REF_0           0
#define REF_0_1         1
#define REF_2           2
#define REF_2_1         3
#define REF_4           4
#define REF_4_1         5
#define REF_6           6
#define REF_6_1         7
#define REF_S0          8
#define REF_S0_1        9
#define REF_S2          10
#define REF_S2_1        11
#define REF_S4          12
#define REF_S4_1        13
#define REF_S6          14
#define REF_S6_1        15
#define DST_0           16
#define DST_1           17
#define DST_2           18
#define DST_3           19
#define CONST_1         20
#define CONST_2         20
#define CONST_3         20
#define CONST_6         20
#define MASK_fe         20
#define CONST_128       22
#define CONST_256       22
#define CONST_512       22
#define CONST_1024      22
#define TMP0            24
#define TMP1            25
#define TMP2            26
#define TMP3            27
#define TMP4            28
#define TMP5            29
#define ZERO            30
#define MASK_7f         30

#define TMP6            32
#define TMP8            34
#define TMP10           36
#define TMP12           38
#define TMP14           40
#define TMP16           42
#define TMP18           44
#define TMP20           46
#define TMP22           48
#define TMP24           50
#define TMP26           52
#define TMP28           54
#define TMP30           56
#define TMP32           58

static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do { /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do { /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}


static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do { /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_or(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_or(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_or(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_or(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_psub16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_psub16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_or(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_or(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_or(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_or(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_psub16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_psub16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}

static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do { /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_psub16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_or(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_psub16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}

static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do { /* 34 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}

static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do { /* 20 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_or(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_or(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}

static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do { /* 26 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(dest[8], DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_mul8x16al(DST_0, CONST_512, TMP4);
        vis_padd16(TMP2, TMP6, TMP2);

        vis_mul8x16al(DST_1, CONST_512, TMP6);

        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4, CONST_256, TMP16);

        vis_padd16(TMP0, CONST_3, TMP8);
        vis_mul8x16au(REF_4_1, CONST_256, TMP18);

        vis_padd16(TMP2, CONST_3, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_padd16(TMP16, TMP12, TMP0);

        vis_st64(DST_0, dest[0]);
        vis_mul8x16al(DST_2, CONST_512, TMP4);
        vis_padd16(TMP18, TMP14, TMP2);

        vis_mul8x16al(DST_3, CONST_512, TMP6);
        vis_padd16(TMP0, CONST_3, TMP0);

        vis_padd16(TMP2, CONST_3, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[8]);

        ref += stride;
        dest += stride;
    } while (--height);
}

static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_times_2 = stride << 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    height >>= 2;
    do { /* 47 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[0], TMP4);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 8, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP8);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP4, TMP6, REF_4);

        vis_ld64(ref[0], TMP12);

        vis_ld64_2(ref, 8, TMP14);
        ref += stride;
        vis_faligndata(TMP8, TMP10, REF_S0);

        vis_faligndata(TMP12, TMP14, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);

            vis_ld64(dest[0], DST_0);
            vis_faligndata(TMP0, TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_faligndata(TMP4, TMP6, REF_6);

            vis_faligndata(TMP8, TMP10, REF_S2);

            vis_faligndata(TMP12, TMP14, REF_S6);
        } else {
            vis_ld64(dest[0], DST_0);
            vis_src1(TMP2, REF_2);

            vis_ld64_2(dest, stride, DST_2);
            vis_src1(TMP6, REF_6);

            vis_src1(TMP10, REF_S2);

            vis_src1(TMP14, REF_S6);
        }

        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP8);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP16, TMP0);
        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_2, CONST_512, TMP16);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(DST_3, CONST_512, TMP18);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP10, CONST_3, TMP10);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP8, TMP16, TMP8);

        vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
        vis_padd16(TMP10, TMP18, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_pmerge(ZERO, REF_S0, TMP0);

        vis_pmerge(ZERO, REF_S2, TMP24);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16au(REF_S4, CONST_256, TMP8);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16au(REF_S4_1, CONST_256, TMP10);

        vis_padd16(TMP0, TMP24, TMP0);
        vis_mul8x16au(REF_S6, CONST_256, TMP12);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP14);

        vis_padd16(TMP8, CONST_3, TMP8);
        vis_mul8x16al(DST_0, CONST_512, TMP16);

        vis_padd16(TMP10, CONST_3, TMP10);
        vis_mul8x16al(DST_1, CONST_512, TMP18);

        vis_padd16(TMP8, TMP12, TMP8);
        vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);

        vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
        vis_padd16(TMP0, TMP16, TMP0);

        vis_padd16(TMP2, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_padd16(TMP8, TMP20, TMP8);

        vis_padd16(TMP10, TMP22, TMP10);
        vis_pack16(TMP8, DST_2);

        vis_pack16(TMP10, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);
    ref += stride;

    vis_ld64(ref[0], TMP6);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64_2(ref, 8, TMP8);
    vis_faligndata(TMP2, TMP4, REF_4);

    vis_ld64_2(ref, 16, TMP10);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP6, TMP8, REF_2);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP8, TMP10, REF_6);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do { /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP12);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP16);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_or(REF_0, REF_2, TMP14);

        vis_ld64(ref[0], TMP6);
        vis_or(REF_4, REF_6, TMP18);

        vis_ld64_2(ref, 8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_and(TMP16, MASK_fe, TMP16);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_mul8x16(CONST_128, TMP16, TMP16);
        vis_xor(REF_0, REF_2, TMP0);

        vis_xor(REF_4, REF_6, TMP2);

        vis_or(REF_0, REF_2, TMP20);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_and(TMP16, MASK_7f, TMP16);

        vis_psub16(TMP14, TMP12, TMP12);
        vis_st64(TMP12, dest[0]);

        vis_psub16(TMP18, TMP16, TMP16);
        vis_st64_2(TMP16, dest, 8);
        dest += stride;

        vis_or(REF_4, REF_6, TMP18);

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP2, MASK_fe, TMP2);
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16(CONST_128, TMP2, TMP2);

        vis_faligndata(TMP8, TMP10, REF_6);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_and(TMP2, MASK_7f, TMP2);

        vis_psub16(TMP20, TMP0, TMP0);
        vis_st64(TMP0, dest[0]);

        vis_psub16(TMP18, TMP2, TMP2);
        vis_st64_2(TMP2, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP12);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP16);

    vis_ld64_2(ref, 16, TMP4);
    vis_or(REF_0, REF_2, TMP14);

    vis_or(REF_4, REF_6, TMP18);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_and(TMP16, MASK_fe, TMP16);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_mul8x16(CONST_128, TMP16, TMP16);
    vis_xor(REF_0, REF_2, TMP0);

    vis_xor(REF_4, REF_6, TMP2);

    vis_or(REF_0, REF_2, TMP20);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_and(TMP16, MASK_7f, TMP16);

    vis_psub16(TMP14, TMP12, TMP12);
    vis_st64(TMP12, dest[0]);

    vis_psub16(TMP18, TMP16, TMP16);
    vis_st64_2(TMP16, dest, 8);
    dest += stride;

    vis_or(REF_4, REF_6, TMP18);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP2, MASK_fe, TMP2);
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_mul8x16(CONST_128, TMP2, TMP2);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_and(TMP2, MASK_7f, TMP2);

    vis_psub16(TMP20, TMP0, TMP0);
    vis_st64(TMP0, dest[0]);

    vis_psub16(TMP18, TMP2, TMP2);
    vis_st64_2(TMP2, dest, 8);
}

static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);
    ref += stride;

    vis_ld64(ref[0], TMP4);

    vis_ld64_2(ref, 8, TMP6);
    ref += stride;

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP4, TMP6, REF_2);

    vis_ld64(constants128[0], CONST_128);
    height = (height >> 1) - 1;
    do { /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);
        ref += stride;
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_mul8x16(CONST_128, TMP12, TMP12);
        vis_or(REF_0, REF_2, TMP14);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_faligndata(TMP0, TMP2, REF_2);

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_mul8x16(CONST_128, TMP12, TMP12);
    vis_or(REF_0, REF_2, TMP14);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
}

static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_6);
    height >>= 1;

    do { /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP12);
        vis_mul8x16au(REF_2_1, CONST_256, TMP14);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_pmerge(ZERO, REF_6, TMP16);
        vis_mul8x16au(REF_6_1, CONST_256, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64_2(ref, stride, TMP6);
        vis_pmerge(ZERO, REF_0, TMP0);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_pmerge(ZERO, REF_4, TMP4);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;

        vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
        vis_faligndata(TMP6, TMP8, REF_2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
        vis_faligndata(TMP8, TMP10, REF_6);
        vis_mul8x16al(DST_0, CONST_512, TMP20);

        vis_padd16(TMP0, CONST_3, TMP0);
        vis_mul8x16al(DST_1, CONST_512, TMP22);

        vis_padd16(TMP2, CONST_3, TMP2);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP4, CONST_3, TMP4);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_padd16(TMP6, CONST_3, TMP6);

        vis_padd16(TMP12, TMP20, TMP12);
        vis_mul8x16al(REF_S0, CONST_512, TMP20);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_mul8x16al(REF_S0_1, CONST_512, TMP22);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_mul8x16al(REF_S2, CONST_512, TMP24);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_mul8x16al(REF_S2_1, CONST_512, TMP26);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_2, CONST_256, TMP28);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_2_1, CONST_256, TMP30);

        vis_padd16(TMP16, TMP4, TMP16);
        vis_mul8x16au(REF_6, CONST_256, REF_S4);

        vis_padd16(TMP18, TMP6, TMP18);
        vis_mul8x16au(REF_6_1, CONST_256, REF_S6);

        vis_pack16(TMP12, DST_0);
        vis_padd16(TMP28, TMP0, TMP12);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP30, TMP2, TMP14);

        vis_pack16(TMP16, DST_2);
        vis_padd16(REF_S4, TMP4, TMP16);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(REF_S6, TMP6, TMP18);

        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_padd16(TMP16, TMP24, TMP16);
        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(TMP18, TMP26, TMP18);
        vis_pack16(TMP16, DST_2);

        vis_pack16(TMP18, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);

    height >>= 1;
    do { /* 20 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP8);
        vis_mul8x16au(REF_2_1, CONST_256, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;

        vis_ld64(dest[0], DST_0);

        vis_ld64_2(dest, stride, DST_2);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride, TMP4);
        vis_mul8x16al(DST_0, CONST_512, TMP16);
        vis_pmerge(ZERO, REF_0, TMP12);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_mul8x16al(DST_1, CONST_512, TMP18);
        vis_pmerge(ZERO, REF_0_1, TMP14);

        vis_padd16(TMP12, CONST_3, TMP12);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP14, CONST_3, TMP14);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_faligndata(TMP4, TMP6, REF_2);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_mul8x16au(REF_2, CONST_256, TMP20);

        vis_padd16(TMP8, TMP16, TMP0);
        vis_mul8x16au(REF_2_1, CONST_256, TMP22);

        vis_padd16(TMP10, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);

        vis_padd16(TMP12, TMP24, TMP0);

        vis_padd16(TMP14, TMP26, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants2[0], CONST_2);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do {
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, CONST_2, TMP8);
        vis_mul8x16au(REF_4, CONST_256, TMP0);

        vis_padd16(TMP2, CONST_2, TMP10);
        vis_mul8x16au(REF_4_1, CONST_256, TMP2);

        vis_padd16(TMP8, TMP4, TMP8);
        vis_mul8x16au(REF_6, CONST_256, TMP4);

        vis_padd16(TMP10, TMP6, TMP10);
        vis_mul8x16au(REF_6_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP8, TMP12);

        vis_padd16(TMP14, TMP10, TMP14);

        vis_padd16(TMP12, TMP16, TMP12);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP0, CONST_2, TMP12);

        vis_mul8x16au(REF_S0, CONST_256, TMP0);
        vis_padd16(TMP2, CONST_2, TMP14);

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_padd16(TMP12, TMP4, TMP12);

        vis_mul8x16au(REF_S2, CONST_256, TMP4);
        vis_padd16(TMP14, TMP6, TMP14);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
        vis_padd16(TMP20, TMP12, TMP20);

        vis_padd16(TMP22, TMP14, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(TMP0, TMP4, TMP24);

        vis_mul8x16au(REF_S4, CONST_256, TMP0);
        vis_padd16(TMP2, TMP6, TMP26);

        vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
        vis_padd16(TMP24, TMP8, TMP24);

        vis_padd16(TMP26, TMP10, TMP26);
        vis_pack16(TMP24, DST_0);

        vis_pack16(TMP26, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_pmerge(ZERO, REF_S6, TMP4);

        vis_pmerge(ZERO, REF_S6_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);

        vis_padd16(TMP0, TMP12, TMP0);

        vis_padd16(TMP2, TMP14, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants2[0], CONST_2);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;
    do { /* 26 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S2, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_ld64_2(ref, stride, TMP4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_pmerge(ZERO, REF_S4, TMP18);

        vis_pmerge(ZERO, REF_S4_1, TMP20);

        vis_faligndata(TMP4, TMP6, REF_S0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_padd16(TMP18, CONST_2, TMP18);
        vis_mul8x16au(REF_S6, CONST_256, TMP22);

        vis_padd16(TMP20, CONST_2, TMP20);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

        vis_mul8x16au(REF_S0, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S0_1, TMP28);

        vis_mul8x16au(REF_S2, CONST_256, TMP30);
        vis_padd16(TMP18, TMP22, TMP18);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP8, TMP18, TMP8);

        vis_padd16(TMP10, TMP20, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP18, TMP26, TMP18);

        vis_padd16(TMP20, TMP28, TMP20);

        vis_padd16(TMP18, TMP30, TMP18);

        vis_padd16(TMP20, TMP32, TMP20);
        vis_pack16(TMP18, DST_2);

        vis_pack16(TMP20, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}

static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;
    do { /* 55 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_0, TMP0);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP0, CONST_6, TMP0);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP2, CONST_6, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP4);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_6, CONST_256, TMP8);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_6_1, CONST_256, TMP10);

        vis_padd16(TMP12, TMP16, TMP12);
        vis_mul8x16au(REF_S0, CONST_256, REF_4);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

        vis_padd16(TMP12, TMP30, TMP12);

        vis_padd16(TMP14, TMP32, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP4, CONST_6, TMP4);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP6, CONST_6, TMP6);
        vis_mul8x16au(REF_S2, CONST_256, TMP12);

        vis_padd16(TMP4, TMP8, TMP4);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

        vis_padd16(TMP6, TMP10, TMP6);

        vis_padd16(TMP20, TMP4, TMP20);

        vis_padd16(TMP22, TMP6, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);

        vis_padd16(TMP20, REF_0, TMP20);
        vis_mul8x16au(REF_S4, CONST_256, REF_0);

        vis_padd16(TMP22, REF_2, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;

        vis_ld64_2(dest, 8, DST_2);
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4_1, REF_2);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_padd16(REF_4, TMP0, TMP8);

        vis_mul8x16au(REF_S6, CONST_256, REF_4);
        vis_padd16(REF_6, TMP2, TMP10);

        vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(REF_0, TMP4, REF_0);

        vis_mul8x16al(DST_2, CONST_1024, TMP30);
        vis_padd16(REF_2, TMP6, REF_2);

        vis_mul8x16al(DST_3, CONST_1024, TMP32);
        vis_padd16(REF_0, REF_4, REF_0);

        vis_padd16(REF_2, REF_6, REF_2);

        vis_padd16(REF_0, TMP30, REF_0);

        /* stall */

        vis_padd16(REF_2, TMP32, REF_2);
        vis_pack16(REF_0, DST_2);

        vis_pack16(REF_2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
|
|
|
1923
|
|
|
1924 static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
1925 const int stride, int height)
|
|
|
1926 {
|
|
|
1927 uint8_t *ref = (uint8_t *) _ref;
|
|
|
1928 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
1929 unsigned long off_plus_1 = off + 1;
|
|
|
1930 int stride_8 = stride + 8;
|
|
|
1931
|
|
|
1932 vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
1933
|
|
|
1934 ref = vis_alignaddr(ref);
|
|
|
1935
|
|
|
1936 vis_ld64(ref[0], TMP0);
|
|
|
1937 vis_fzero(ZERO);
|
|
|
1938
|
|
|
1939 vis_ld64_2(ref, 8, TMP2);
|
|
|
1940
|
|
|
1941 vis_ld64(constants6[0], CONST_6);
|
|
|
1942
|
|
|
1943 vis_ld64(constants256_1024[0], CONST_256);
|
|
|
1944 vis_faligndata(TMP0, TMP2, REF_S0);
|
|
|
1945
|
|
|
1946 if (off != 0x7) {
|
|
|
1947 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
1948 vis_faligndata(TMP0, TMP2, REF_S2);
|
|
|
1949 } else {
|
|
|
1950 vis_src1(TMP2, REF_S2);
|
|
|
1951 }
|
|
|
1952
|
|
|
1953 height >>= 1;
|
|
|
1954 do { /* 31 cycles */
|
|
|
1955 vis_ld64_2(ref, stride, TMP0);
|
|
|
1956 vis_mul8x16au(REF_S0, CONST_256, TMP8);
|
|
|
1957 vis_pmerge(ZERO, REF_S0_1, TMP10);
|
|
|
1958
|
|
|
1959 vis_ld64_2(ref, stride_8, TMP2);
|
|
|
1960 ref += stride;
|
|
|
1961 vis_mul8x16au(REF_S2, CONST_256, TMP12);
|
|
|
1962 vis_pmerge(ZERO, REF_S2_1, TMP14);
|
|
|
1963
|
|
|
1964 vis_alignaddr_g0((void *)off);
|
|
|
1965
|
|
|
1966 vis_ld64_2(ref, stride, TMP4);
|
|
|
1967 vis_faligndata(TMP0, TMP2, REF_S4);
|
|
|
1968
|
|
|
1969 vis_ld64_2(ref, stride_8, TMP6);
|
|
|
1970 ref += stride;
|
|
|
1971
|
|
|
1972 vis_ld64(dest[0], DST_0);
|
|
|
1973 vis_faligndata(TMP4, TMP6, REF_S0);
|
|
|
1974
|
|
|
1975 vis_ld64_2(dest, stride, DST_2);
|
|
|
1976
|
|
|
1977 if (off != 0x7) {
|
|
|
1978 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
1979 vis_faligndata(TMP0, TMP2, REF_S6);
|
|
|
1980 vis_faligndata(TMP4, TMP6, REF_S2);
|
|
|
1981 } else {
|
|
|
1982 vis_src1(TMP2, REF_S6);
|
|
|
1983 vis_src1(TMP6, REF_S2);
|
|
|
1984 }
|
|
|
1985
|
|
|
1986 vis_mul8x16al(DST_0, CONST_1024, TMP30);
|
|
|
1987 vis_pmerge(ZERO, REF_S4, TMP22);
|
|
|
1988
|
|
|
1989 vis_mul8x16al(DST_1, CONST_1024, TMP32);
|
|
|
1990 vis_pmerge(ZERO, REF_S4_1, TMP24);
|
|
|
1991
|
|
|
1992 vis_mul8x16au(REF_S6, CONST_256, TMP26);
|
|
|
1993 vis_pmerge(ZERO, REF_S6_1, TMP28);
|
|
|
1994
|
|
|
1995 vis_mul8x16au(REF_S0, CONST_256, REF_S4);
|
|
|
1996 vis_padd16(TMP22, CONST_6, TMP22);
|
|
|
1997
|
|
|
1998 vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
|
|
|
1999 vis_padd16(TMP24, CONST_6, TMP24);
|
|
|
2000
|
|
|
2001 vis_mul8x16al(DST_2, CONST_1024, REF_0);
|
|
|
2002 vis_padd16(TMP22, TMP26, TMP22);
|
|
|
2003
|
|
|
2004 vis_mul8x16al(DST_3, CONST_1024, REF_2);
|
|
|
2005 vis_padd16(TMP24, TMP28, TMP24);
|
|
|
2006
|
|
|
2007 vis_mul8x16au(REF_S2, CONST_256, TMP26);
|
|
|
2008 vis_padd16(TMP8, TMP22, TMP8);
|
|
|
2009
|
|
|
2010 vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
|
|
|
2011 vis_padd16(TMP10, TMP24, TMP10);
|
|
|
2012
|
|
|
2013 vis_padd16(TMP8, TMP12, TMP8);
|
|
|
2014
|
|
|
2015 vis_padd16(TMP10, TMP14, TMP10);
|
|
|
2016
|
|
|
2017 vis_padd16(TMP8, TMP30, TMP8);
|
|
|
2018
|
|
|
2019 vis_padd16(TMP10, TMP32, TMP10);
|
|
|
2020 vis_pack16(TMP8, DST_0);
|
|
|
2021
|
|
|
2022 vis_pack16(TMP10, DST_1);
|
|
|
2023 vis_st64(DST_0, dest[0]);
|
|
|
2024 dest += stride;
|
|
|
2025
|
|
|
2026 vis_padd16(REF_S4, TMP22, TMP12);
|
|
|
2027
|
|
|
2028 vis_padd16(REF_S6, TMP24, TMP14);
|
|
|
2029
|
|
|
2030 vis_padd16(TMP12, TMP26, TMP12);
|
|
|
2031
|
|
|
2032 vis_padd16(TMP14, TMP28, TMP14);
|
|
|
2033
|
|
|
2034 vis_padd16(TMP12, REF_0, TMP12);
|
|
|
2035
|
|
|
2036 vis_padd16(TMP14, REF_2, TMP14);
|
|
|
2037 vis_pack16(TMP12, DST_2);
|
|
|
2038
|
|
|
2039 vis_pack16(TMP14, DST_3);
|
|
|
2040 vis_st64(DST_2, dest[0]);
|
|
|
2041 dest += stride;
|
|
|
2042 } while (--height);
|
|
|
2043 }
|
|
|

/* End of rounding code */

/* Start of no rounding code */
/* The trick used in some of this file is the formula from the MMX
 * motion comp code, which is:
 *
 * (x+y)>>1 == (x&y)+((x^y)>>1)
 *
 * This allows us to average 8 bytes at a time in a 64-bit FPU reg.
 * We avoid overflows by masking before we do the shift, and we
 * implement the shift by multiplying by 1/2 using mul8x16. So in
 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask
 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and
 * the value 0x80808080 is in f8):
 *
 * fxor f0, f2, f10
 * fand f10, f4, f10
 * fmul8x16 f8, f10, f10
 * fand f10, f6, f10
 * fand f0, f2, f12
 * fpadd16 f12, f10, f10
 */

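/* The MC_{put,avg}_no_round_{o,x,y,xy}_{8,16}_vis functions below follow the
 * usual motion-compensation naming: "put" stores the prediction, "avg"
 * averages it with the bytes already in dest, o/x/y/xy select no, horizontal,
 * vertical or diagonal half-pel interpolation, and 8/16 is the block width.
 *
 * A plain-C sketch of the identity above, for a single 8-byte block.  This is
 * only a reference model of what the VIS sequence computes (the helper name
 * and signature are illustrative, nothing else uses it); the 0xfe mask mirrors
 * the overflow-avoidance masking described in the comment above and is
 * redundant in scalar C. */
#if 0
static void no_round_avg_8bytes_ref(uint8_t *dst,
                                    const uint8_t *x, const uint8_t *y)
{
    int i;
    for (i = 0; i < 8; i++) {
        /* per byte: (x+y)>>1 == (x&y) + ((x^y)>>1), truncating (no rounding) */
        dst[i] = (x[i] & y[i]) + (((x[i] ^ y[i]) & 0xfe) >> 1);
    }
}
#endif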
static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}

static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}


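/* Same o (no half-pel offset) case as MC_put_no_round_o_16_vis above, but the
 * result is averaged with the bytes already in dest (DST_0/DST_2), using the
 * xor/and/mul8x16 sequence from the comment at the start of this section. */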
static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}

static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);

    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_and(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_padd16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}

2367 static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
2368 const int stride, int height)
|
|
|
2369 {
|
|
|
2370 uint8_t *ref = (uint8_t *) _ref;
|
|
|
2371 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
2372 unsigned long off_plus_1 = off + 1;
|
|
|
2373
|
|
|
2374 ref = vis_alignaddr(ref);
|
|
|
2375
|
|
|
2376 vis_ld64(ref[0], TMP0);
|
|
|
2377
|
|
|
2378 vis_ld64_2(ref, 8, TMP2);
|
|
|
2379
|
|
|
2380 vis_ld64_2(ref, 16, TMP4);
|
|
|
2381
|
|
|
2382 vis_ld64(constants_fe[0], MASK_fe);
|
|
|
2383
|
|
|
2384 vis_ld64(constants_7f[0], MASK_7f);
|
|
|
2385 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2386
|
|
|
2387 vis_ld64(constants128[0], CONST_128);
|
|
|
2388 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
2389
|
|
|
2390 if (off != 0x7) {
|
|
|
2391 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2392 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2393 vis_faligndata(TMP2, TMP4, REF_6);
|
|
|
2394 } else {
|
|
|
2395 vis_src1(TMP2, REF_2);
|
|
|
2396 vis_src1(TMP4, REF_6);
|
|
|
2397 }
|
|
|
2398
|
|
|
2399 ref += stride;
|
|
|
2400 height = (height >> 1) - 1;
|
|
|
2401
|
|
|
2402 do { /* 34 cycles */
|
|
|
2403 vis_ld64(ref[0], TMP0);
|
|
|
2404 vis_xor(REF_0, REF_2, TMP6);
|
|
|
2405
|
|
|
2406 vis_ld64_2(ref, 8, TMP2);
|
|
|
2407 vis_xor(REF_4, REF_6, TMP8);
|
|
|
2408
|
|
|
2409 vis_ld64_2(ref, 16, TMP4);
|
|
|
2410 vis_and(TMP6, MASK_fe, TMP6);
|
|
|
2411 ref += stride;
|
|
|
2412
|
|
|
2413 vis_ld64(ref[0], TMP14);
|
|
|
2414 vis_mul8x16(CONST_128, TMP6, TMP6);
|
|
|
2415 vis_and(TMP8, MASK_fe, TMP8);
|
|
|
2416
|
|
|
2417 vis_ld64_2(ref, 8, TMP16);
|
|
|
2418 vis_mul8x16(CONST_128, TMP8, TMP8);
|
|
|
2419 vis_and(REF_0, REF_2, TMP10);
|
|
|
2420
|
|
|
2421 vis_ld64_2(ref, 16, TMP18);
|
|
|
2422 ref += stride;
|
|
|
2423 vis_and(REF_4, REF_6, TMP12);
|
|
|
2424
|
|
|
2425 vis_alignaddr_g0((void *)off);
|
|
|
2426
|
|
|
2427 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2428
|
|
|
2429 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
2430
|
|
|
2431 if (off != 0x7) {
|
|
|
2432 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2433 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2434 vis_faligndata(TMP2, TMP4, REF_6);
|
|
|
2435 } else {
|
|
|
2436 vis_src1(TMP2, REF_2);
|
|
|
2437 vis_src1(TMP4, REF_6);
|
|
|
2438 }
|
|
|
2439
|
|
|
2440 vis_and(TMP6, MASK_7f, TMP6);
|
|
|
2441
|
|
|
2442 vis_and(TMP8, MASK_7f, TMP8);
|
|
|
2443
|
|
|
2444 vis_padd16(TMP10, TMP6, TMP6);
|
|
|
2445 vis_st64(TMP6, dest[0]);
|
|
|
2446
|
|
|
2447 vis_padd16(TMP12, TMP8, TMP8);
|
|
|
2448 vis_st64_2(TMP8, dest, 8);
|
|
|
2449 dest += stride;
|
|
|
2450
|
|
|
2451 vis_xor(REF_0, REF_2, TMP6);
|
|
|
2452
|
|
|
2453 vis_xor(REF_4, REF_6, TMP8);
|
|
|
2454
|
|
|
2455 vis_and(TMP6, MASK_fe, TMP6);
|
|
|
2456
|
|
|
2457 vis_mul8x16(CONST_128, TMP6, TMP6);
|
|
|
2458 vis_and(TMP8, MASK_fe, TMP8);
|
|
|
2459
|
|
|
2460 vis_mul8x16(CONST_128, TMP8, TMP8);
|
|
|
2461 vis_and(REF_0, REF_2, TMP10);
|
|
|
2462
|
|
|
2463 vis_and(REF_4, REF_6, TMP12);
|
|
|
2464
|
|
|
2465 vis_alignaddr_g0((void *)off);
|
|
|
2466
|
|
|
2467 vis_faligndata(TMP14, TMP16, REF_0);
|
|
|
2468
|
|
|
2469 vis_faligndata(TMP16, TMP18, REF_4);
|
|
|
2470
|
|
|
2471 if (off != 0x7) {
|
|
|
2472 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2473 vis_faligndata(TMP14, TMP16, REF_2);
|
|
|
2474 vis_faligndata(TMP16, TMP18, REF_6);
|
|
|
2475 } else {
|
|
|
2476 vis_src1(TMP16, REF_2);
|
|
|
2477 vis_src1(TMP18, REF_6);
|
|
|
2478 }
|
|
|
2479
|
|
|
2480 vis_and(TMP6, MASK_7f, TMP6);
|
|
|
2481
|
|
|
2482 vis_and(TMP8, MASK_7f, TMP8);
|
|
|
2483
|
|
|
2484 vis_padd16(TMP10, TMP6, TMP6);
|
|
|
2485 vis_st64(TMP6, dest[0]);
|
|
|
2486
|
|
|
2487 vis_padd16(TMP12, TMP8, TMP8);
|
|
|
2488 vis_st64_2(TMP8, dest, 8);
|
|
|
2489 dest += stride;
|
|
|
2490 } while (--height);
|
|
|
2491
|
|
|
2492 vis_ld64(ref[0], TMP0);
|
|
|
2493 vis_xor(REF_0, REF_2, TMP6);
|
|
|
2494
|
|
|
2495 vis_ld64_2(ref, 8, TMP2);
|
|
|
2496 vis_xor(REF_4, REF_6, TMP8);
|
|
|
2497
|
|
|
2498 vis_ld64_2(ref, 16, TMP4);
|
|
|
2499 vis_and(TMP6, MASK_fe, TMP6);
|
|
|
2500
|
|
|
2501 vis_mul8x16(CONST_128, TMP6, TMP6);
|
|
|
2502 vis_and(TMP8, MASK_fe, TMP8);
|
|
|
2503
|
|
|
2504 vis_mul8x16(CONST_128, TMP8, TMP8);
|
|
|
2505 vis_and(REF_0, REF_2, TMP10);
|
|
|
2506
|
|
|
2507 vis_and(REF_4, REF_6, TMP12);
|
|
|
2508
|
|
|
2509 vis_alignaddr_g0((void *)off);
|
|
|
2510
|
|
|
2511 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2512
|
|
|
2513 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
2514
|
|
|
2515 if (off != 0x7) {
|
|
|
2516 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2517 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2518 vis_faligndata(TMP2, TMP4, REF_6);
|
|
|
2519 } else {
|
|
|
2520 vis_src1(TMP2, REF_2);
|
|
|
2521 vis_src1(TMP4, REF_6);
|
|
|
2522 }
|
|
|
2523
|
|
|
2524 vis_and(TMP6, MASK_7f, TMP6);
|
|
|
2525
|
|
|
2526 vis_and(TMP8, MASK_7f, TMP8);
|
|
|
2527
|
|
|
2528 vis_padd16(TMP10, TMP6, TMP6);
|
|
|
2529 vis_st64(TMP6, dest[0]);
|
|
|
2530
|
|
|
2531 vis_padd16(TMP12, TMP8, TMP8);
|
|
|
2532 vis_st64_2(TMP8, dest, 8);
|
|
|
2533 dest += stride;
|
|
|
2534
|
|
|
2535 vis_xor(REF_0, REF_2, TMP6);
|
|
|
2536
|
|
|
2537 vis_xor(REF_4, REF_6, TMP8);
|
|
|
2538
|
|
|
2539 vis_and(TMP6, MASK_fe, TMP6);
|
|
|
2540
|
|
|
2541 vis_mul8x16(CONST_128, TMP6, TMP6);
|
|
|
2542 vis_and(TMP8, MASK_fe, TMP8);
|
|
|
2543
|
|
|
2544 vis_mul8x16(CONST_128, TMP8, TMP8);
|
|
|
2545 vis_and(REF_0, REF_2, TMP10);
|
|
|
2546
|
|
|
2547 vis_and(REF_4, REF_6, TMP12);
|
|
|
2548
|
|
|
2549 vis_and(TMP6, MASK_7f, TMP6);
|
|
|
2550
|
|
|
2551 vis_and(TMP8, MASK_7f, TMP8);
|
|
|
2552
|
|
|
2553 vis_padd16(TMP10, TMP6, TMP6);
|
|
|
2554 vis_st64(TMP6, dest[0]);
|
|
|
2555
|
|
|
2556 vis_padd16(TMP12, TMP8, TMP8);
|
|
|
2557 vis_st64_2(TMP8, dest, 8);
|
|
|
2558 }
|
|
|
2559
|
|
|
2560 static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
2561 const int stride, int height)
|
|
|
2562 {
|
|
|
2563 uint8_t *ref = (uint8_t *) _ref;
|
|
|
2564 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
2565 unsigned long off_plus_1 = off + 1;
|
|
|
2566
|
|
|
2567 ref = vis_alignaddr(ref);
|
|
|
2568
|
|
|
2569 vis_ld64(ref[0], TMP0);
|
|
|
2570
|
|
|
2571 vis_ld64(ref[8], TMP2);
|
|
|
2572
|
|
|
2573 vis_ld64(constants_fe[0], MASK_fe);
|
|
|
2574
|
|
|
2575 vis_ld64(constants_7f[0], MASK_7f);
|
|
|
2576
|
|
|
2577 vis_ld64(constants128[0], CONST_128);
|
|
|
2578 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2579
|
|
|
2580 if (off != 0x7) {
|
|
|
2581 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2582 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2583 } else {
|
|
|
2584 vis_src1(TMP2, REF_2);
|
|
|
2585 }
|
|
|
2586
|
|
|
2587 ref += stride;
|
|
|
2588 height = (height >> 1) - 1;
|
|
|
2589
|
|
|
2590 do { /* 20 cycles */
|
|
|
2591 vis_ld64(ref[0], TMP0);
|
|
|
2592 vis_xor(REF_0, REF_2, TMP4);
|
|
|
2593
|
|
|
2594 vis_ld64_2(ref, 8, TMP2);
|
|
|
2595 vis_and(TMP4, MASK_fe, TMP4);
|
|
|
2596 ref += stride;
|
|
|
2597
|
|
|
2598 vis_ld64(ref[0], TMP8);
|
|
|
2599 vis_and(REF_0, REF_2, TMP6);
|
|
|
2600 vis_mul8x16(CONST_128, TMP4, TMP4);
|
|
|
2601
|
|
|
2602 vis_alignaddr_g0((void *)off);
|
|
|
2603
|
|
|
2604 vis_ld64_2(ref, 8, TMP10);
|
|
|
2605 ref += stride;
|
|
|
2606 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2607
|
|
|
2608 if (off != 0x7) {
|
|
|
2609 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2610 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2611 } else {
|
|
|
2612 vis_src1(TMP2, REF_2);
|
|
|
2613 }
|
|
|
2614
|
|
|
2615 vis_and(TMP4, MASK_7f, TMP4);
|
|
|
2616
|
|
|
2617 vis_padd16(TMP6, TMP4, DST_0);
|
|
|
2618 vis_st64(DST_0, dest[0]);
|
|
|
2619 dest += stride;
|
|
|
2620
|
|
|
2621 vis_xor(REF_0, REF_2, TMP12);
|
|
|
2622
|
|
|
2623 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
2624
|
|
|
2625 vis_and(REF_0, REF_2, TMP14);
|
|
|
2626 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
2627
|
|
|
2628 vis_alignaddr_g0((void *)off);
|
|
|
2629 vis_faligndata(TMP8, TMP10, REF_0);
|
|
|
2630 if (off != 0x7) {
|
|
|
2631 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2632 vis_faligndata(TMP8, TMP10, REF_2);
|
|
|
2633 } else {
|
|
|
2634 vis_src1(TMP10, REF_2);
|
|
|
2635 }
|
|
|
2636
|
|
|
2637 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
2638
|
|
|
2639 vis_padd16(TMP14, TMP12, DST_0);
|
|
|
2640 vis_st64(DST_0, dest[0]);
|
|
|
2641 dest += stride;
|
|
|
2642 } while (--height);
|
|
|
2643
|
|
|
2644 vis_ld64(ref[0], TMP0);
|
|
|
2645 vis_xor(REF_0, REF_2, TMP4);
|
|
|
2646
|
|
|
2647 vis_ld64_2(ref, 8, TMP2);
|
|
|
2648 vis_and(TMP4, MASK_fe, TMP4);
|
|
|
2649
|
|
|
2650 vis_and(REF_0, REF_2, TMP6);
|
|
|
2651 vis_mul8x16(CONST_128, TMP4, TMP4);
|
|
|
2652
|
|
|
2653 vis_alignaddr_g0((void *)off);
|
|
|
2654
|
|
|
2655 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2656
|
|
|
2657 if (off != 0x7) {
|
|
|
2658 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2659 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2660 } else {
|
|
|
2661 vis_src1(TMP2, REF_2);
|
|
|
2662 }
|
|
|
2663
|
|
|
2664 vis_and(TMP4, MASK_7f, TMP4);
|
|
|
2665
|
|
|
2666 vis_padd16(TMP6, TMP4, DST_0);
|
|
|
2667 vis_st64(DST_0, dest[0]);
|
|
|
2668 dest += stride;
|
|
|
2669
|
|
|
2670 vis_xor(REF_0, REF_2, TMP12);
|
|
|
2671
|
|
|
2672 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
2673
|
|
|
2674 vis_and(REF_0, REF_2, TMP14);
|
|
|
2675 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
2676
|
|
|
2677 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
2678
|
|
|
2679 vis_padd16(TMP14, TMP12, DST_0);
|
|
|
2680 vis_st64(DST_0, dest[0]);
|
|
|
2681 dest += stride;
|
|
|
2682 }
|
|
|
2683
|
|
|
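/* Horizontal half-pel case averaged with dest: ref and ref+1 are expanded to
 * 16-bit lanes with mul8x16au, the existing destination is folded in with
 * mul8x16al, CONST_3 is added as bias, and pack16 (GSR scale factor 5)
 * narrows the sums back to bytes. */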
static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    vis_ld64(constants3[0], CONST_3);
    vis_fzero(ZERO);
    vis_ld64(constants256_512[0], CONST_256);

    ref = vis_alignaddr(ref);
    do {    /* 26 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);

        vis_alignaddr_g0((void *)off);

        vis_ld64(ref[16], TMP4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(dest[8], DST_2);
        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);

        vis_pmerge(ZERO, REF_2, TMP4);
        vis_mul8x16au(REF_0_1, CONST_256, TMP2);

        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_mul8x16al(DST_0, CONST_512, TMP4);
        vis_padd16(TMP2, TMP6, TMP2);

        vis_mul8x16al(DST_1, CONST_512, TMP6);

        vis_mul8x16au(REF_6, CONST_256, TMP12);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_6_1, CONST_256, TMP14);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4, CONST_256, TMP16);

        vis_padd16(TMP0, CONST_3, TMP8);
        vis_mul8x16au(REF_4_1, CONST_256, TMP18);

        vis_padd16(TMP2, CONST_3, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_padd16(TMP16, TMP12, TMP0);

        vis_st64(DST_0, dest[0]);
        vis_mul8x16al(DST_2, CONST_512, TMP4);
        vis_padd16(TMP18, TMP14, TMP2);

        vis_mul8x16al(DST_3, CONST_512, TMP6);
        vis_padd16(TMP0, CONST_3, TMP0);

        vis_padd16(TMP2, CONST_3, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[8]);

        ref += stride;
        dest += stride;
    } while (--height);
}

2775 static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
2776 const int stride, int height)
|
|
|
2777 {
|
|
|
2778 uint8_t *ref = (uint8_t *) _ref;
|
|
|
2779 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
2780 unsigned long off_plus_1 = off + 1;
|
|
|
2781 int stride_times_2 = stride << 1;
|
|
|
2782
|
|
|
2783 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
2784
|
|
|
2785 vis_ld64(constants3[0], CONST_3);
|
|
|
2786 vis_fzero(ZERO);
|
|
|
2787 vis_ld64(constants256_512[0], CONST_256);
|
|
|
2788
|
|
|
2789 ref = vis_alignaddr(ref);
|
|
|
2790 height >>= 2;
|
|
|
2791 do { /* 47 cycles */
|
|
|
2792 vis_ld64(ref[0], TMP0);
|
|
|
2793
|
|
|
2794 vis_ld64_2(ref, 8, TMP2);
|
|
|
2795 ref += stride;
|
|
|
2796
|
|
|
2797 vis_alignaddr_g0((void *)off);
|
|
|
2798
|
|
|
2799 vis_ld64(ref[0], TMP4);
|
|
|
2800 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2801
|
|
|
2802 vis_ld64_2(ref, 8, TMP6);
|
|
|
2803 ref += stride;
|
|
|
2804
|
|
|
2805 vis_ld64(ref[0], TMP8);
|
|
|
2806
|
|
|
2807 vis_ld64_2(ref, 8, TMP10);
|
|
|
2808 ref += stride;
|
|
|
2809 vis_faligndata(TMP4, TMP6, REF_4);
|
|
|
2810
|
|
|
2811 vis_ld64(ref[0], TMP12);
|
|
|
2812
|
|
|
2813 vis_ld64_2(ref, 8, TMP14);
|
|
|
2814 ref += stride;
|
|
|
2815 vis_faligndata(TMP8, TMP10, REF_S0);
|
|
|
2816
|
|
|
2817 vis_faligndata(TMP12, TMP14, REF_S4);
|
|
|
2818
|
|
|
2819 if (off != 0x7) {
|
|
|
2820 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
2821
|
|
|
2822 vis_ld64(dest[0], DST_0);
|
|
|
2823 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
2824
|
|
|
2825 vis_ld64_2(dest, stride, DST_2);
|
|
|
2826 vis_faligndata(TMP4, TMP6, REF_6);
|
|
|
2827
|
|
|
2828 vis_faligndata(TMP8, TMP10, REF_S2);
|
|
|
2829
|
|
|
2830 vis_faligndata(TMP12, TMP14, REF_S6);
|
|
|
2831 } else {
|
|
|
2832 vis_ld64(dest[0], DST_0);
|
|
|
2833 vis_src1(TMP2, REF_2);
|
|
|
2834
|
|
|
2835 vis_ld64_2(dest, stride, DST_2);
|
|
|
2836 vis_src1(TMP6, REF_6);
|
|
|
2837
|
|
|
2838 vis_src1(TMP10, REF_S2);
|
|
|
2839
|
|
|
2840 vis_src1(TMP14, REF_S6);
|
|
|
2841 }
|
|
|
2842
|
|
|
2843 vis_pmerge(ZERO, REF_0, TMP0);
|
|
|
2844 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
|
|
|
2845
|
|
|
2846 vis_pmerge(ZERO, REF_2, TMP4);
|
|
|
2847 vis_mul8x16au(REF_2_1, CONST_256, TMP6);
|
|
|
2848
|
|
|
2849 vis_padd16(TMP0, CONST_3, TMP0);
|
|
|
2850 vis_mul8x16al(DST_0, CONST_512, TMP16);
|
|
|
2851
|
|
|
2852 vis_padd16(TMP2, CONST_3, TMP2);
|
|
|
2853 vis_mul8x16al(DST_1, CONST_512, TMP18);
|
|
|
2854
|
|
|
2855 vis_padd16(TMP0, TMP4, TMP0);
|
|
|
2856 vis_mul8x16au(REF_4, CONST_256, TMP8);
|
|
|
2857
|
|
|
2858 vis_padd16(TMP2, TMP6, TMP2);
|
|
|
2859 vis_mul8x16au(REF_4_1, CONST_256, TMP10);
|
|
|
2860
|
|
|
2861 vis_padd16(TMP0, TMP16, TMP0);
|
|
|
2862 vis_mul8x16au(REF_6, CONST_256, TMP12);
|
|
|
2863
|
|
|
2864 vis_padd16(TMP2, TMP18, TMP2);
|
|
|
2865 vis_mul8x16au(REF_6_1, CONST_256, TMP14);
|
|
|
2866
|
|
|
2867 vis_padd16(TMP8, CONST_3, TMP8);
|
|
|
2868 vis_mul8x16al(DST_2, CONST_512, TMP16);
|
|
|
2869
|
|
|
2870 vis_padd16(TMP8, TMP12, TMP8);
|
|
|
2871 vis_mul8x16al(DST_3, CONST_512, TMP18);
|
|
|
2872
|
|
|
2873 vis_padd16(TMP10, TMP14, TMP10);
|
|
|
2874 vis_pack16(TMP0, DST_0);
|
|
|
2875
|
|
|
2876 vis_pack16(TMP2, DST_1);
|
|
|
2877 vis_st64(DST_0, dest[0]);
|
|
|
2878 dest += stride;
|
|
|
2879 vis_padd16(TMP10, CONST_3, TMP10);
|
|
|
2880
|
|
|
2881 vis_ld64_2(dest, stride, DST_0);
|
|
|
2882 vis_padd16(TMP8, TMP16, TMP8);
|
|
|
2883
|
|
|
2884 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/);
|
|
|
2885 vis_padd16(TMP10, TMP18, TMP10);
|
|
|
2886 vis_pack16(TMP8, DST_2);
|
|
|
2887
|
|
|
2888 vis_pack16(TMP10, DST_3);
|
|
|
2889 vis_st64(DST_2, dest[0]);
|
|
|
2890 dest += stride;
|
|
|
2891
|
|
|
2892 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
|
|
|
2893 vis_pmerge(ZERO, REF_S0, TMP0);
|
|
|
2894
|
|
|
2895 vis_pmerge(ZERO, REF_S2, TMP24);
|
|
|
2896 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
|
|
|
2897
|
|
|
2898 vis_padd16(TMP0, CONST_3, TMP0);
|
|
|
2899 vis_mul8x16au(REF_S4, CONST_256, TMP8);
|
|
|
2900
|
|
|
2901 vis_padd16(TMP2, CONST_3, TMP2);
|
|
|
2902 vis_mul8x16au(REF_S4_1, CONST_256, TMP10);
|
|
|
2903
|
|
|
2904 vis_padd16(TMP0, TMP24, TMP0);
|
|
|
2905 vis_mul8x16au(REF_S6, CONST_256, TMP12);
|
|
|
2906
|
|
|
2907 vis_padd16(TMP2, TMP6, TMP2);
|
|
|
2908 vis_mul8x16au(REF_S6_1, CONST_256, TMP14);
|
|
|
2909
|
|
|
2910 vis_padd16(TMP8, CONST_3, TMP8);
|
|
|
2911 vis_mul8x16al(DST_0, CONST_512, TMP16);
|
|
|
2912
|
|
|
2913 vis_padd16(TMP10, CONST_3, TMP10);
|
|
|
2914 vis_mul8x16al(DST_1, CONST_512, TMP18);
|
|
|
2915
|
|
|
2916 vis_padd16(TMP8, TMP12, TMP8);
|
|
|
2917 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20);
|
|
|
2918
|
|
|
2919 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22);
|
|
|
2920 vis_padd16(TMP0, TMP16, TMP0);
|
|
|
2921
|
|
|
2922 vis_padd16(TMP2, TMP18, TMP2);
|
|
|
2923 vis_pack16(TMP0, DST_0);
|
|
|
2924
|
|
|
2925 vis_padd16(TMP10, TMP14, TMP10);
|
|
|
2926 vis_pack16(TMP2, DST_1);
|
|
|
2927 vis_st64(DST_0, dest[0]);
|
|
|
2928 dest += stride;
|
|
|
2929
|
|
|
2930 vis_padd16(TMP8, TMP20, TMP8);
|
|
|
2931
|
|
|
2932 vis_padd16(TMP10, TMP22, TMP10);
|
|
|
2933 vis_pack16(TMP8, DST_2);
|
|
|
2934
|
|
|
2935 vis_pack16(TMP10, DST_3);
|
|
|
2936 vis_st64(DST_2, dest[0]);
|
|
|
2937 dest += stride;
|
|
|
2938 } while (--height);
|
|
|
2939 }
|
|
|
2940
|
|
|
2941 static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
2942 const int stride, int height)
|
|
|
2943 {
|
|
|
2944 uint8_t *ref = (uint8_t *) _ref;
|
|
|
2945
|
|
|
2946 ref = vis_alignaddr(ref);
|
|
|
2947 vis_ld64(ref[0], TMP0);
|
|
|
2948
|
|
|
2949 vis_ld64_2(ref, 8, TMP2);
|
|
|
2950
|
|
|
2951 vis_ld64_2(ref, 16, TMP4);
|
|
|
2952 ref += stride;
|
|
|
2953
|
|
|
2954 vis_ld64(ref[0], TMP6);
|
|
|
2955 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2956
|
|
|
2957 vis_ld64_2(ref, 8, TMP8);
|
|
|
2958 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
2959
|
|
|
2960 vis_ld64_2(ref, 16, TMP10);
|
|
|
2961 ref += stride;
|
|
|
2962
|
|
|
2963 vis_ld64(constants_fe[0], MASK_fe);
|
|
|
2964 vis_faligndata(TMP6, TMP8, REF_2);
|
|
|
2965
|
|
|
2966 vis_ld64(constants_7f[0], MASK_7f);
|
|
|
2967 vis_faligndata(TMP8, TMP10, REF_6);
|
|
|
2968
|
|
|
2969 vis_ld64(constants128[0], CONST_128);
|
|
|
2970 height = (height >> 1) - 1;
|
|
|
2971 do { /* 24 cycles */
|
|
|
2972 vis_ld64(ref[0], TMP0);
|
|
|
2973 vis_xor(REF_0, REF_2, TMP12);
|
|
|
2974
|
|
|
2975 vis_ld64_2(ref, 8, TMP2);
|
|
|
2976 vis_xor(REF_4, REF_6, TMP16);
|
|
|
2977
|
|
|
2978 vis_ld64_2(ref, 16, TMP4);
|
|
|
2979 ref += stride;
|
|
|
2980 vis_and(REF_0, REF_2, TMP14);
|
|
|
2981
|
|
|
2982 vis_ld64(ref[0], TMP6);
|
|
|
2983 vis_and(REF_4, REF_6, TMP18);
|
|
|
2984
|
|
|
2985 vis_ld64_2(ref, 8, TMP8);
|
|
|
2986 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
2987
|
|
|
2988 vis_ld64_2(ref, 16, TMP10);
|
|
|
2989 ref += stride;
|
|
|
2990 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
2991
|
|
|
2992 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
2993
|
|
|
2994 vis_and(TMP16, MASK_fe, TMP16);
|
|
|
2995 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
2996
|
|
|
2997 vis_mul8x16(CONST_128, TMP16, TMP16);
|
|
|
2998 vis_xor(REF_0, REF_2, TMP0);
|
|
|
2999
|
|
|
3000 vis_xor(REF_4, REF_6, TMP2);
|
|
|
3001
|
|
|
3002 vis_and(REF_0, REF_2, TMP20);
|
|
|
3003
|
|
|
3004 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
3005
|
|
|
3006 vis_and(TMP16, MASK_7f, TMP16);
|
|
|
3007
|
|
|
3008 vis_padd16(TMP14, TMP12, TMP12);
|
|
|
3009 vis_st64(TMP12, dest[0]);
|
|
|
3010
|
|
|
3011 vis_padd16(TMP18, TMP16, TMP16);
|
|
|
3012 vis_st64_2(TMP16, dest, 8);
|
|
|
3013 dest += stride;
|
|
|
3014
|
|
|
3015 vis_and(REF_4, REF_6, TMP18);
|
|
|
3016
|
|
|
3017 vis_and(TMP0, MASK_fe, TMP0);
|
|
|
3018
|
|
|
3019 vis_and(TMP2, MASK_fe, TMP2);
|
|
|
3020 vis_mul8x16(CONST_128, TMP0, TMP0);
|
|
|
3021
|
|
|
3022 vis_faligndata(TMP6, TMP8, REF_2);
|
|
|
3023 vis_mul8x16(CONST_128, TMP2, TMP2);
|
|
|
3024
|
|
|
3025 vis_faligndata(TMP8, TMP10, REF_6);
|
|
|
3026
|
|
|
3027 vis_and(TMP0, MASK_7f, TMP0);
|
|
|
3028
|
|
|
3029 vis_and(TMP2, MASK_7f, TMP2);
|
|
|
3030
|
|
|
3031 vis_padd16(TMP20, TMP0, TMP0);
|
|
|
3032 vis_st64(TMP0, dest[0]);
|
|
|
3033
|
|
|
3034 vis_padd16(TMP18, TMP2, TMP2);
|
|
|
3035 vis_st64_2(TMP2, dest, 8);
|
|
|
3036 dest += stride;
|
|
|
3037 } while (--height);
|
|
|
3038
|
|
|
3039 vis_ld64(ref[0], TMP0);
|
|
|
3040 vis_xor(REF_0, REF_2, TMP12);
|
|
|
3041
|
|
|
3042 vis_ld64_2(ref, 8, TMP2);
|
|
|
3043 vis_xor(REF_4, REF_6, TMP16);
|
|
|
3044
|
|
|
3045 vis_ld64_2(ref, 16, TMP4);
|
|
|
3046 vis_and(REF_0, REF_2, TMP14);
|
|
|
3047
|
|
|
3048 vis_and(REF_4, REF_6, TMP18);
|
|
|
3049
|
|
|
3050 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3051
|
|
|
3052 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
3053
|
|
|
3054 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
3055
|
|
|
3056 vis_and(TMP16, MASK_fe, TMP16);
|
|
|
3057 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
3058
|
|
|
3059 vis_mul8x16(CONST_128, TMP16, TMP16);
|
|
|
3060 vis_xor(REF_0, REF_2, TMP0);
|
|
|
3061
|
|
|
3062 vis_xor(REF_4, REF_6, TMP2);
|
|
|
3063
|
|
|
3064 vis_and(REF_0, REF_2, TMP20);
|
|
|
3065
|
|
|
3066 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
3067
|
|
|
3068 vis_and(TMP16, MASK_7f, TMP16);
|
|
|
3069
|
|
|
3070 vis_padd16(TMP14, TMP12, TMP12);
|
|
|
3071 vis_st64(TMP12, dest[0]);
|
|
|
3072
|
|
|
3073 vis_padd16(TMP18, TMP16, TMP16);
|
|
|
3074 vis_st64_2(TMP16, dest, 8);
|
|
|
3075 dest += stride;
|
|
|
3076
|
|
|
3077 vis_and(REF_4, REF_6, TMP18);
|
|
|
3078
|
|
|
3079 vis_and(TMP0, MASK_fe, TMP0);
|
|
|
3080
|
|
|
3081 vis_and(TMP2, MASK_fe, TMP2);
|
|
|
3082 vis_mul8x16(CONST_128, TMP0, TMP0);
|
|
|
3083
|
|
|
3084 vis_mul8x16(CONST_128, TMP2, TMP2);
|
|
|
3085
|
|
|
3086 vis_and(TMP0, MASK_7f, TMP0);
|
|
|
3087
|
|
|
3088 vis_and(TMP2, MASK_7f, TMP2);
|
|
|
3089
|
|
|
3090 vis_padd16(TMP20, TMP0, TMP0);
|
|
|
3091 vis_st64(TMP0, dest[0]);
|
|
|
3092
|
|
|
3093 vis_padd16(TMP18, TMP2, TMP2);
|
|
|
3094 vis_st64_2(TMP2, dest, 8);
|
|
|
3095 }
|
|
|
3096
|
|
|
3097 static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
3098 const int stride, int height)
|
|
|
3099 {
|
|
|
3100 uint8_t *ref = (uint8_t *) _ref;
|
|
|
3101
|
|
|
3102 ref = vis_alignaddr(ref);
|
|
|
3103 vis_ld64(ref[0], TMP0);
|
|
|
3104
|
|
|
3105 vis_ld64_2(ref, 8, TMP2);
|
|
|
3106 ref += stride;
|
|
|
3107
|
|
|
3108 vis_ld64(ref[0], TMP4);
|
|
|
3109
|
|
|
3110 vis_ld64_2(ref, 8, TMP6);
|
|
|
3111 ref += stride;
|
|
|
3112
|
|
|
3113 vis_ld64(constants_fe[0], MASK_fe);
|
|
|
3114 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3115
|
|
|
3116 vis_ld64(constants_7f[0], MASK_7f);
|
|
|
3117 vis_faligndata(TMP4, TMP6, REF_2);
|
|
|
3118
|
|
|
3119 vis_ld64(constants128[0], CONST_128);
|
|
|
3120 height = (height >> 1) - 1;
|
|
|
3121 do { /* 12 cycles */
|
|
|
3122 vis_ld64(ref[0], TMP0);
|
|
|
3123 vis_xor(REF_0, REF_2, TMP4);
|
|
|
3124
|
|
|
3125 vis_ld64_2(ref, 8, TMP2);
|
|
|
3126 ref += stride;
|
|
|
3127 vis_and(TMP4, MASK_fe, TMP4);
|
|
|
3128
|
|
|
3129 vis_and(REF_0, REF_2, TMP6);
|
|
|
3130 vis_mul8x16(CONST_128, TMP4, TMP4);
|
|
|
3131
|
|
|
3132 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3133 vis_ld64(ref[0], TMP0);
|
|
|
3134
|
|
|
3135 vis_ld64_2(ref, 8, TMP2);
|
|
|
3136 ref += stride;
|
|
|
3137 vis_xor(REF_0, REF_2, TMP12);
|
|
|
3138
|
|
|
3139 vis_and(TMP4, MASK_7f, TMP4);
|
|
|
3140
|
|
|
3141 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
3142
|
|
|
3143 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
3144 vis_and(REF_0, REF_2, TMP14);
|
|
|
3145
|
|
|
3146 vis_padd16(TMP6, TMP4, DST_0);
|
|
|
3147 vis_st64(DST_0, dest[0]);
|
|
|
3148 dest += stride;
|
|
|
3149
|
|
|
3150 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
3151
|
|
|
3152 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
3153
|
|
|
3154 vis_padd16(TMP14, TMP12, DST_0);
|
|
|
3155 vis_st64(DST_0, dest[0]);
|
|
|
3156 dest += stride;
|
|
|
3157 } while (--height);
|
|
|
3158
|
|
|
3159 vis_ld64(ref[0], TMP0);
|
|
|
3160 vis_xor(REF_0, REF_2, TMP4);
|
|
|
3161
|
|
|
3162 vis_ld64_2(ref, 8, TMP2);
|
|
|
3163 vis_and(TMP4, MASK_fe, TMP4);
|
|
|
3164
|
|
|
3165 vis_and(REF_0, REF_2, TMP6);
|
|
|
3166 vis_mul8x16(CONST_128, TMP4, TMP4);
|
|
|
3167
|
|
|
3168 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3169
|
|
|
3170 vis_xor(REF_0, REF_2, TMP12);
|
|
|
3171
|
|
|
3172 vis_and(TMP4, MASK_7f, TMP4);
|
|
|
3173
|
|
|
3174 vis_and(TMP12, MASK_fe, TMP12);
|
|
|
3175
|
|
|
3176 vis_mul8x16(CONST_128, TMP12, TMP12);
|
|
|
3177 vis_and(REF_0, REF_2, TMP14);
|
|
|
3178
|
|
|
3179 vis_padd16(TMP6, TMP4, DST_0);
|
|
|
3180 vis_st64(DST_0, dest[0]);
|
|
|
3181 dest += stride;
|
|
|
3182
|
|
|
3183 vis_and(TMP12, MASK_7f, TMP12);
|
|
|
3184
|
|
|
3185 vis_padd16(TMP14, TMP12, DST_0);
|
|
|
3186 vis_st64(DST_0, dest[0]);
|
|
|
3187 }
|
|
|
3188
|
|
|
3189 static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
3190 const int stride, int height)
|
|
|
3191 {
|
|
|
3192 uint8_t *ref = (uint8_t *) _ref;
|
|
|
3193 int stride_8 = stride + 8;
|
|
|
3194 int stride_16 = stride + 16;
|
|
|
3195
|
|
|
3196 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
3197
|
|
|
3198 ref = vis_alignaddr(ref);
|
|
|
3199
|
|
|
3200 vis_ld64(ref[ 0], TMP0);
|
|
|
3201 vis_fzero(ZERO);
|
|
|
3202
|
|
|
3203 vis_ld64(ref[ 8], TMP2);
|
|
|
3204
|
|
|
3205 vis_ld64(ref[16], TMP4);
|
|
|
3206
|
|
|
3207 vis_ld64(constants3[0], CONST_3);
|
|
|
3208 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
3209
|
|
|
3210 vis_ld64(constants256_512[0], CONST_256);
|
|
|
3211 vis_faligndata(TMP2, TMP4, REF_6);
|
|
|
3212 height >>= 1;
|
|
|
3213
|
|
|
3214 do { /* 31 cycles */
|
|
|
3215 vis_ld64_2(ref, stride, TMP0);
|
|
|
3216 vis_pmerge(ZERO, REF_2, TMP12);
|
|
|
3217 vis_mul8x16au(REF_2_1, CONST_256, TMP14);
|
|
|
3218
|
|
|
3219 vis_ld64_2(ref, stride_8, TMP2);
|
|
|
3220 vis_pmerge(ZERO, REF_6, TMP16);
|
|
|
3221 vis_mul8x16au(REF_6_1, CONST_256, TMP18);
|
|
|
3222
|
|
|
3223 vis_ld64_2(ref, stride_16, TMP4);
|
|
|
3224 ref += stride;
|
|
|
3225
|
|
|
3226 vis_ld64(dest[0], DST_0);
|
|
|
3227 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3228
|
|
|
3229 vis_ld64_2(dest, 8, DST_2);
|
|
|
3230 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
3231
|
|
|
3232 vis_ld64_2(ref, stride, TMP6);
|
|
|
3233 vis_pmerge(ZERO, REF_0, TMP0);
|
|
|
3234 vis_mul8x16au(REF_0_1, CONST_256, TMP2);
|
|
|
3235
|
|
|
3236 vis_ld64_2(ref, stride_8, TMP8);
|
|
|
3237 vis_pmerge(ZERO, REF_4, TMP4);
|
|
|
3238
|
|
|
3239 vis_ld64_2(ref, stride_16, TMP10);
|
|
|
3240 ref += stride;
|
|
|
3241
|
|
|
3242 vis_ld64_2(dest, stride, REF_S0/*DST_4*/);
|
|
|
3243 vis_faligndata(TMP6, TMP8, REF_2);
|
|
|
3244 vis_mul8x16au(REF_4_1, CONST_256, TMP6);
|
|
|
3245
|
|
|
3246 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/);
|
|
|
3247 vis_faligndata(TMP8, TMP10, REF_6);
|
|
|
3248 vis_mul8x16al(DST_0, CONST_512, TMP20);
|
|
|
3249
|
|
|
3250 vis_padd16(TMP0, CONST_3, TMP0);
|
|
|
3251 vis_mul8x16al(DST_1, CONST_512, TMP22);
|
|
|
3252
|
|
|
3253 vis_padd16(TMP2, CONST_3, TMP2);
|
|
|
3254 vis_mul8x16al(DST_2, CONST_512, TMP24);
|
|
|
3255
|
|
|
3256 vis_padd16(TMP4, CONST_3, TMP4);
|
|
|
3257 vis_mul8x16al(DST_3, CONST_512, TMP26);
|
|
|
3258
|
|
|
3259 vis_padd16(TMP6, CONST_3, TMP6);
|
|
|
3260
|
|
|
3261 vis_padd16(TMP12, TMP20, TMP12);
|
|
|
3262 vis_mul8x16al(REF_S0, CONST_512, TMP20);
|
|
|
3263
|
|
|
3264 vis_padd16(TMP14, TMP22, TMP14);
|
|
|
3265 vis_mul8x16al(REF_S0_1, CONST_512, TMP22);
|
|
|
3266
|
|
|
3267 vis_padd16(TMP16, TMP24, TMP16);
|
|
|
3268 vis_mul8x16al(REF_S2, CONST_512, TMP24);
|
|
|
3269
|
|
|
3270 vis_padd16(TMP18, TMP26, TMP18);
|
|
|
3271 vis_mul8x16al(REF_S2_1, CONST_512, TMP26);
|
|
|
3272
|
|
|
3273 vis_padd16(TMP12, TMP0, TMP12);
|
|
|
3274 vis_mul8x16au(REF_2, CONST_256, TMP28);
|
|
|
3275
|
|
|
3276 vis_padd16(TMP14, TMP2, TMP14);
|
|
|
3277 vis_mul8x16au(REF_2_1, CONST_256, TMP30);
|
|
|
3278
|
|
|
3279 vis_padd16(TMP16, TMP4, TMP16);
|
|
|
3280 vis_mul8x16au(REF_6, CONST_256, REF_S4);
|
|
|
3281
|
|
|
3282 vis_padd16(TMP18, TMP6, TMP18);
|
|
|
3283 vis_mul8x16au(REF_6_1, CONST_256, REF_S6);
|
|
|
3284
|
|
|
3285 vis_pack16(TMP12, DST_0);
|
|
|
3286 vis_padd16(TMP28, TMP0, TMP12);
|
|
|
3287
|
|
|
3288 vis_pack16(TMP14, DST_1);
|
|
|
3289 vis_st64(DST_0, dest[0]);
|
|
|
3290 vis_padd16(TMP30, TMP2, TMP14);
|
|
|
3291
|
|
|
3292 vis_pack16(TMP16, DST_2);
|
|
|
3293 vis_padd16(REF_S4, TMP4, TMP16);
|
|
|
3294
|
|
|
3295 vis_pack16(TMP18, DST_3);
|
|
|
3296 vis_st64_2(DST_2, dest, 8);
|
|
|
3297 dest += stride;
|
|
|
3298 vis_padd16(REF_S6, TMP6, TMP18);
|
|
|
3299
|
|
|
3300 vis_padd16(TMP12, TMP20, TMP12);
|
|
|
3301
|
|
|
3302 vis_padd16(TMP14, TMP22, TMP14);
|
|
|
3303 vis_pack16(TMP12, DST_0);
|
|
|
3304
|
|
|
3305 vis_padd16(TMP16, TMP24, TMP16);
|
|
|
3306 vis_pack16(TMP14, DST_1);
|
|
|
3307 vis_st64(DST_0, dest[0]);
|
|
|
3308
|
|
|
3309 vis_padd16(TMP18, TMP26, TMP18);
|
|
|
3310 vis_pack16(TMP16, DST_2);
|
|
|
3311
|
|
|
3312 vis_pack16(TMP18, DST_3);
|
|
|
3313 vis_st64_2(DST_2, dest, 8);
|
|
|
3314 dest += stride;
|
|
|
3315 } while (--height);
|
|
|
3316 }
|
|
|
3317
|
|
|
3318 static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
3319 const int stride, int height)
|
|
|
3320 {
|
|
|
3321 uint8_t *ref = (uint8_t *) _ref;
|
|
|
3322 int stride_8 = stride + 8;
|
|
|
3323
|
|
|
3324 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
3325
|
|
|
3326 ref = vis_alignaddr(ref);
|
|
|
3327
|
|
|
3328 vis_ld64(ref[ 0], TMP0);
|
|
|
3329 vis_fzero(ZERO);
|
|
|
3330
|
|
|
3331 vis_ld64(ref[ 8], TMP2);
|
|
|
3332
|
|
|
3333 vis_ld64(constants3[0], CONST_3);
|
|
|
3334 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
3335
|
|
|
3336 vis_ld64(constants256_512[0], CONST_256);
|
|
|
3337
|
|
|
3338 height >>= 1;
|
|
|
3339 do { /* 20 cycles */
|
|
|
3340 vis_ld64_2(ref, stride, TMP0);
|
|
|
3341 vis_pmerge(ZERO, REF_2, TMP8);
|
|
|
3342 vis_mul8x16au(REF_2_1, CONST_256, TMP10);
|
|
|
3343
|
|
|
3344 vis_ld64_2(ref, stride_8, TMP2);
|
|
|
3345 ref += stride;
|
|
|
3346
|
|
|
3347 vis_ld64(dest[0], DST_0);
|
|
|
3348
|
|
|
3349 vis_ld64_2(dest, stride, DST_2);
|
|
|
3350 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3351
|
|
|
3352 vis_ld64_2(ref, stride, TMP4);
|
|
|
3353 vis_mul8x16al(DST_0, CONST_512, TMP16);
|
|
|
3354 vis_pmerge(ZERO, REF_0, TMP12);
|
|
|
3355
|
|
|
3356 vis_ld64_2(ref, stride_8, TMP6);
|
|
|
3357 ref += stride;
|
|
|
3358 vis_mul8x16al(DST_1, CONST_512, TMP18);
|
|
|
3359 vis_pmerge(ZERO, REF_0_1, TMP14);
|
|
|
3360
|
|
|
3361 vis_padd16(TMP12, CONST_3, TMP12);
|
|
|
3362 vis_mul8x16al(DST_2, CONST_512, TMP24);
|
|
|
3363
|
|
|
3364 vis_padd16(TMP14, CONST_3, TMP14);
|
|
|
3365 vis_mul8x16al(DST_3, CONST_512, TMP26);
|
|
|
3366
|
|
|
3367 vis_faligndata(TMP4, TMP6, REF_2);
|
|
|
3368
|
|
|
3369 vis_padd16(TMP8, TMP12, TMP8);
|
|
|
3370
|
|
|
3371 vis_padd16(TMP10, TMP14, TMP10);
|
|
|
3372 vis_mul8x16au(REF_2, CONST_256, TMP20);
|
|
|
3373
|
|
|
3374 vis_padd16(TMP8, TMP16, TMP0);
|
|
|
3375 vis_mul8x16au(REF_2_1, CONST_256, TMP22);
|
|
|
3376
|
|
|
3377 vis_padd16(TMP10, TMP18, TMP2);
|
|
|
3378 vis_pack16(TMP0, DST_0);
|
|
|
3379
|
|
|
3380 vis_pack16(TMP2, DST_1);
|
|
|
3381 vis_st64(DST_0, dest[0]);
|
|
|
3382 dest += stride;
|
|
|
3383 vis_padd16(TMP12, TMP20, TMP12);
|
|
|
3384
|
|
|
3385 vis_padd16(TMP14, TMP22, TMP14);
|
|
|
3386
|
|
|
3387 vis_padd16(TMP12, TMP24, TMP0);
|
|
|
3388
|
|
|
3389 vis_padd16(TMP14, TMP26, TMP2);
|
|
|
3390 vis_pack16(TMP0, DST_2);
|
|
|
3391
|
|
|
3392 vis_pack16(TMP2, DST_3);
|
|
|
3393 vis_st64(DST_2, dest[0]);
|
|
|
3394 dest += stride;
|
|
|
3395 } while (--height);
|
|
|
3396 }
|
|
|
3397
|
|
|
3398 static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
3399 const int stride, int height)
|
|
|
3400 {
|
|
|
3401 uint8_t *ref = (uint8_t *) _ref;
|
|
|
3402 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
3403 unsigned long off_plus_1 = off + 1;
|
|
|
3404 int stride_8 = stride + 8;
|
|
|
3405 int stride_16 = stride + 16;
|
|
|
3406
|
|
|
3407 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
3408
|
|
|
3409 ref = vis_alignaddr(ref);
|
|
|
3410
|
|
|
3411 vis_ld64(ref[ 0], TMP0);
|
|
|
3412 vis_fzero(ZERO);
|
|
|
3413
|
|
|
3414 vis_ld64(ref[ 8], TMP2);
|
|
|
3415
|
|
|
3416 vis_ld64(ref[16], TMP4);
|
|
|
3417
|
|
|
3418 vis_ld64(constants1[0], CONST_1);
|
|
|
3419 vis_faligndata(TMP0, TMP2, REF_S0);
|
|
|
3420
|
|
|
3421 vis_ld64(constants256_512[0], CONST_256);
|
|
|
3422 vis_faligndata(TMP2, TMP4, REF_S4);
|
|
|
3423
|
|
|
3424 if (off != 0x7) {
|
|
|
3425 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
3426 vis_faligndata(TMP0, TMP2, REF_S2);
|
|
|
3427 vis_faligndata(TMP2, TMP4, REF_S6);
|
|
|
3428 } else {
|
|
|
3429 vis_src1(TMP2, REF_S2);
|
|
|
3430 vis_src1(TMP4, REF_S6);
|
|
|
3431 }
|
|
|
3432
|
|
|
3433 height >>= 1;
|
|
|
3434 do {
|
|
|
3435 vis_ld64_2(ref, stride, TMP0);
|
|
|
3436 vis_mul8x16au(REF_S0, CONST_256, TMP12);
|
|
|
3437 vis_pmerge(ZERO, REF_S0_1, TMP14);
|
|
|
3438
|
|
|
3439 vis_alignaddr_g0((void *)off);
|
|
|
3440
|
|
|
3441 vis_ld64_2(ref, stride_8, TMP2);
|
|
|
3442 vis_mul8x16au(REF_S2, CONST_256, TMP16);
|
|
|
3443 vis_pmerge(ZERO, REF_S2_1, TMP18);
|
|
|
3444
|
|
|
3445 vis_ld64_2(ref, stride_16, TMP4);
|
|
|
3446 ref += stride;
|
|
|
3447 vis_mul8x16au(REF_S4, CONST_256, TMP20);
|
|
|
3448 vis_pmerge(ZERO, REF_S4_1, TMP22);
|
|
|
3449
|
|
|
3450 vis_ld64_2(ref, stride, TMP6);
|
|
|
3451 vis_mul8x16au(REF_S6, CONST_256, TMP24);
|
|
|
3452 vis_pmerge(ZERO, REF_S6_1, TMP26);
|
|
|
3453
|
|
|
3454 vis_ld64_2(ref, stride_8, TMP8);
|
|
|
3455 vis_faligndata(TMP0, TMP2, REF_0);
|
|
|
3456
|
|
|
3457 vis_ld64_2(ref, stride_16, TMP10);
|
|
|
3458 ref += stride;
|
|
|
3459 vis_faligndata(TMP2, TMP4, REF_4);
|
|
|
3460
|
|
|
3461 vis_faligndata(TMP6, TMP8, REF_S0);
|
|
|
3462
|
|
|
3463 vis_faligndata(TMP8, TMP10, REF_S4);
|
|
|
3464
|
|
|
3465 if (off != 0x7) {
|
|
|
3466 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
3467 vis_faligndata(TMP0, TMP2, REF_2);
|
|
|
3468 vis_faligndata(TMP2, TMP4, REF_6);
|
|
|
3469 vis_faligndata(TMP6, TMP8, REF_S2);
|
|
|
3470 vis_faligndata(TMP8, TMP10, REF_S6);
|
|
|
3471 } else {
|
|
|
3472 vis_src1(TMP2, REF_2);
|
|
|
3473 vis_src1(TMP4, REF_6);
|
|
|
3474 vis_src1(TMP8, REF_S2);
|
|
|
3475 vis_src1(TMP10, REF_S6);
|
|
|
3476 }
|
|
|
3477
|
|
|
3478 vis_mul8x16au(REF_0, CONST_256, TMP0);
|
|
|
3479 vis_pmerge(ZERO, REF_0_1, TMP2);
|
|
|
3480
|
|
|
3481 vis_mul8x16au(REF_2, CONST_256, TMP4);
|
|
|
3482 vis_pmerge(ZERO, REF_2_1, TMP6);
|
|
|
3483
|
|
|
3484 vis_padd16(TMP0, CONST_2, TMP8);
|
|
|
3485 vis_mul8x16au(REF_4, CONST_256, TMP0);
|
|
|
3486
|
|
|
3487 vis_padd16(TMP2, CONST_1, TMP10);
|
|
|
3488 vis_mul8x16au(REF_4_1, CONST_256, TMP2);
|
|
|
3489
|
|
|
3490 vis_padd16(TMP8, TMP4, TMP8);
|
|
|
3491 vis_mul8x16au(REF_6, CONST_256, TMP4);
|
|
|
3492
|
|
|
3493 vis_padd16(TMP10, TMP6, TMP10);
|
|
|
3494 vis_mul8x16au(REF_6_1, CONST_256, TMP6);
|
|
|
3495
|
|
|
3496 vis_padd16(TMP12, TMP8, TMP12);
|
|
|
3497
|
|
|
3498 vis_padd16(TMP14, TMP10, TMP14);
|
|
|
3499
|
|
|
3500 vis_padd16(TMP12, TMP16, TMP12);
|
|
|
3501
|
|
|
3502 vis_padd16(TMP14, TMP18, TMP14);
|
|
|
3503 vis_pack16(TMP12, DST_0);
|
|
|
3504
|
|
|
3505 vis_pack16(TMP14, DST_1);
|
|
|
3506 vis_st64(DST_0, dest[0]);
|
|
|
3507 vis_padd16(TMP0, CONST_1, TMP12);
|
|
|
3508
|
|
|
3509 vis_mul8x16au(REF_S0, CONST_256, TMP0);
|
|
|
3510 vis_padd16(TMP2, CONST_1, TMP14);
|
|
|
3511
|
|
|
3512 vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
|
|
|
3513 vis_padd16(TMP12, TMP4, TMP12);
|
|
|
3514
|
|
|
3515 vis_mul8x16au(REF_S2, CONST_256, TMP4);
|
|
|
3516 vis_padd16(TMP14, TMP6, TMP14);
|
|
|
3517
|
|
|
3518 vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
|
|
|
3519 vis_padd16(TMP20, TMP12, TMP20);
|
|
|
3520
|
|
|
3521 vis_padd16(TMP22, TMP14, TMP22);
|
|
|
3522
|
|
|
3523 vis_padd16(TMP20, TMP24, TMP20);
|
|
|
3524
|
|
|
3525 vis_padd16(TMP22, TMP26, TMP22);
|
|
|
3526 vis_pack16(TMP20, DST_2);
|
|
|
3527
|
|
|
3528 vis_pack16(TMP22, DST_3);
|
|
|
3529 vis_st64_2(DST_2, dest, 8);
|
|
|
3530 dest += stride;
|
|
|
3531 vis_padd16(TMP0, TMP4, TMP24);
|
|
|
3532
|
|
|
3533 vis_mul8x16au(REF_S4, CONST_256, TMP0);
|
|
|
3534 vis_padd16(TMP2, TMP6, TMP26);
|
|
|
3535
|
|
|
3536 vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
|
|
|
3537 vis_padd16(TMP24, TMP8, TMP24);
|
|
|
3538
|
|
|
3539 vis_padd16(TMP26, TMP10, TMP26);
|
|
|
3540 vis_pack16(TMP24, DST_0);
|
|
|
3541
|
|
|
3542 vis_pack16(TMP26, DST_1);
|
|
|
3543 vis_st64(DST_0, dest[0]);
|
|
|
3544 vis_pmerge(ZERO, REF_S6, TMP4);
|
|
|
3545
|
|
|
3546 vis_pmerge(ZERO, REF_S6_1, TMP6);
|
|
|
3547
|
|
|
3548 vis_padd16(TMP0, TMP4, TMP0);
|
|
|
3549
|
|
|
3550 vis_padd16(TMP2, TMP6, TMP2);
|
|
|
3551
|
|
|
3552 vis_padd16(TMP0, TMP12, TMP0);
|
|
|
3553
|
|
|
3554 vis_padd16(TMP2, TMP14, TMP2);
|
|
|
3555 vis_pack16(TMP0, DST_2);
|
|
|
3556
|
|
|
3557 vis_pack16(TMP2, DST_3);
|
|
|
3558 vis_st64_2(DST_2, dest, 8);
|
|
|
3559 dest += stride;
|
|
|
3560 } while (--height);
|
|
|
3561 }
|
|
|
3562
|
|
|
3563 static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
|
|
|
3564 const int stride, int height)
|
|
|
3565 {
|
|
|
3566 uint8_t *ref = (uint8_t *) _ref;
|
|
|
3567 unsigned long off = (unsigned long) ref & 0x7;
|
|
|
3568 unsigned long off_plus_1 = off + 1;
|
|
|
3569 int stride_8 = stride + 8;
|
|
|
3570
|
|
|
3571 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);
|
|
|
3572
|
|
|
3573 ref = vis_alignaddr(ref);
|
|
|
3574
|
|
|
3575 vis_ld64(ref[ 0], TMP0);
|
|
|
3576 vis_fzero(ZERO);
|
|
|
3577
|
|
|
3578 vis_ld64(ref[ 8], TMP2);
|
|
|
3579
|
|
|
3580 vis_ld64(constants1[0], CONST_1);
|
|
|
3581
|
|
|
3582 vis_ld64(constants256_512[0], CONST_256);
|
|
|
3583 vis_faligndata(TMP0, TMP2, REF_S0);
|
|
|
3584
|
|
|
3585 if (off != 0x7) {
|
|
|
3586 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
3587 vis_faligndata(TMP0, TMP2, REF_S2);
|
|
|
3588 } else {
|
|
|
3589 vis_src1(TMP2, REF_S2);
|
|
|
3590 }
|
|
|
3591
|
|
|
3592 height >>= 1;
|
|
|
3593 do { /* 26 cycles */
|
|
|
3594 vis_ld64_2(ref, stride, TMP0);
|
|
|
3595 vis_mul8x16au(REF_S0, CONST_256, TMP8);
|
|
|
3596 vis_pmerge(ZERO, REF_S2, TMP12);
|
|
|
3597
|
|
|
3598 vis_alignaddr_g0((void *)off);
|
|
|
3599
|
|
|
3600 vis_ld64_2(ref, stride_8, TMP2);
|
|
|
3601 ref += stride;
|
|
|
3602 vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
|
|
|
3603 vis_pmerge(ZERO, REF_S2_1, TMP14);
|
|
|
3604
|
|
|
3605 vis_ld64_2(ref, stride, TMP4);
|
|
|
3606
|
|
|
3607 vis_ld64_2(ref, stride_8, TMP6);
|
|
|
3608 ref += stride;
|
|
|
3609 vis_faligndata(TMP0, TMP2, REF_S4);
|
|
|
3610
|
|
|
3611 vis_pmerge(ZERO, REF_S4, TMP18);
|
|
|
3612
|
|
|
3613 vis_pmerge(ZERO, REF_S4_1, TMP20);
|
|
|
3614
|
|
|
3615 vis_faligndata(TMP4, TMP6, REF_S0);
|
|
|
3616
|
|
|
3617 if (off != 0x7) {
|
|
|
3618 vis_alignaddr_g0((void *)off_plus_1);
|
|
|
3619 vis_faligndata(TMP0, TMP2, REF_S6);
|
|
|
3620 vis_faligndata(TMP4, TMP6, REF_S2);
|
|
|
3621 } else {
|
|
|
3622 vis_src1(TMP2, REF_S6);
|
|
|
3623 vis_src1(TMP6, REF_S2);
|
|
|
3624 }
|
|
|
3625
|
|
|
3626 vis_padd16(TMP18, CONST_1, TMP18);
|
|
|
3627 vis_mul8x16au(REF_S6, CONST_256, TMP22);
|
|
|
3628
|
|
|
3629 vis_padd16(TMP20, CONST_1, TMP20);
|
|
|
3630 vis_mul8x16au(REF_S6_1, CONST_256, TMP24);
|
|
|
3631
|
|
|
3632 vis_mul8x16au(REF_S0, CONST_256, TMP26);
|
|
|
3633 vis_pmerge(ZERO, REF_S0_1, TMP28);
|
|
|
3634
|
|
|
3635 vis_mul8x16au(REF_S2, CONST_256, TMP30);
|
|
|
3636 vis_padd16(TMP18, TMP22, TMP18);
|
|
|
3637
|
|
|
3638 vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
|
|
|
3639 vis_padd16(TMP20, TMP24, TMP20);
|
|
|
3640
|
|
|
3641 vis_padd16(TMP8, TMP18, TMP8);
|
|
|
3642
|
|
|
3643 vis_padd16(TMP10, TMP20, TMP10);
|
|
|
3644
|
|
|
3645 vis_padd16(TMP8, TMP12, TMP8);
|
|
|
3646
|
|
|
3647 vis_padd16(TMP10, TMP14, TMP10);
|
|
|
3648 vis_pack16(TMP8, DST_0);
|
|
|
3649
|
|
|
3650 vis_pack16(TMP10, DST_1);
|
|
|
3651 vis_st64(DST_0, dest[0]);
|
|
|
3652 dest += stride;
|
|
|
3653 vis_padd16(TMP18, TMP26, TMP18);
|
|
|
3654
|
|
|
3655 vis_padd16(TMP20, TMP28, TMP20);
|
|
|
3656
|
|
|
3657 vis_padd16(TMP18, TMP30, TMP18);
|
|
|
3658
|
|
|
3659 vis_padd16(TMP20, TMP32, TMP20);
|
|
|
3660 vis_pack16(TMP18, DST_2);
|
|
|
3661
|
|
|
3662 vis_pack16(TMP20, DST_3);
|
|
|
3663 vis_st64(DST_2, dest[0]);
|
|
|
3664 dest += stride;
|
|
|
3665 } while (--height);
|
|
|
3666 }
|
|
|
3667
|
|
|
static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                                       const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;
        int stride_16 = stride + 16;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[ 0], TMP0);
        vis_fzero(ZERO);

        vis_ld64(ref[ 8], TMP2);

        vis_ld64(ref[16], TMP4);

        vis_ld64(constants6[0], CONST_6);
        vis_faligndata(TMP0, TMP2, REF_S0);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP2, TMP4, REF_S4);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
                vis_faligndata(TMP2, TMP4, REF_S6);
        } else {
                vis_src1(TMP2, REF_S2);
                vis_src1(TMP4, REF_S6);
        }

        height >>= 1;
        do {    /* 55 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S0_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride_8, TMP2);
                vis_mul8x16au(REF_S2, CONST_256, TMP16);
                vis_pmerge(ZERO, REF_S2_1, TMP18);

                vis_ld64_2(ref, stride_16, TMP4);
                ref += stride;
                vis_mul8x16au(REF_S4, CONST_256, TMP20);
                vis_pmerge(ZERO, REF_S4_1, TMP22);

                vis_ld64_2(ref, stride, TMP6);
                vis_mul8x16au(REF_S6, CONST_256, TMP24);
                vis_pmerge(ZERO, REF_S6_1, TMP26);

                vis_ld64_2(ref, stride_8, TMP8);
                vis_faligndata(TMP0, TMP2, REF_0);

                vis_ld64_2(ref, stride_16, TMP10);
                ref += stride;
                vis_faligndata(TMP2, TMP4, REF_4);

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP6, TMP8, REF_S0);

                vis_ld64_2(dest, 8, DST_2);
                vis_faligndata(TMP8, TMP10, REF_S4);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_2);
                        vis_faligndata(TMP2, TMP4, REF_6);
                        vis_faligndata(TMP6, TMP8, REF_S2);
                        vis_faligndata(TMP8, TMP10, REF_S6);
                } else {
                        vis_src1(TMP2, REF_2);
                        vis_src1(TMP4, REF_6);
                        vis_src1(TMP8, REF_S2);
                        vis_src1(TMP10, REF_S6);
                }

                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_0, TMP0);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_pmerge(ZERO, REF_0_1, TMP2);

                vis_mul8x16au(REF_2, CONST_256, TMP4);
                vis_pmerge(ZERO, REF_2_1, TMP6);

                vis_mul8x16al(DST_2, CONST_1024, REF_0);
                vis_padd16(TMP0, CONST_6, TMP0);

                vis_mul8x16al(DST_3, CONST_1024, REF_2);
                vis_padd16(TMP2, CONST_6, TMP2);

                vis_padd16(TMP0, TMP4, TMP0);
                vis_mul8x16au(REF_4, CONST_256, TMP4);

                vis_padd16(TMP2, TMP6, TMP2);
                vis_mul8x16au(REF_4_1, CONST_256, TMP6);

                vis_padd16(TMP12, TMP0, TMP12);
                vis_mul8x16au(REF_6, CONST_256, TMP8);

                vis_padd16(TMP14, TMP2, TMP14);
                vis_mul8x16au(REF_6_1, CONST_256, TMP10);

                vis_padd16(TMP12, TMP16, TMP12);
                vis_mul8x16au(REF_S0, CONST_256, REF_4);

                vis_padd16(TMP14, TMP18, TMP14);
                vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

                vis_padd16(TMP12, TMP30, TMP12);

                vis_padd16(TMP14, TMP32, TMP14);
                vis_pack16(TMP12, DST_0);

                vis_pack16(TMP14, DST_1);
                vis_st64(DST_0, dest[0]);
                vis_padd16(TMP4, CONST_6, TMP4);

                vis_ld64_2(dest, stride, DST_0);
                vis_padd16(TMP6, CONST_6, TMP6);
                vis_mul8x16au(REF_S2, CONST_256, TMP12);

                vis_padd16(TMP4, TMP8, TMP4);
                vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

                vis_padd16(TMP6, TMP10, TMP6);

                vis_padd16(TMP20, TMP4, TMP20);

                vis_padd16(TMP22, TMP6, TMP22);

                vis_padd16(TMP20, TMP24, TMP20);

                vis_padd16(TMP22, TMP26, TMP22);

                vis_padd16(TMP20, REF_0, TMP20);
                vis_mul8x16au(REF_S4, CONST_256, REF_0);

                vis_padd16(TMP22, REF_2, TMP22);
                vis_pack16(TMP20, DST_2);

                vis_pack16(TMP22, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;

                vis_ld64_2(dest, 8, DST_2);
                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4_1, REF_2);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_padd16(REF_4, TMP0, TMP8);

                vis_mul8x16au(REF_S6, CONST_256, REF_4);
                vis_padd16(REF_6, TMP2, TMP10);

                vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);

                vis_padd16(REF_0, TMP4, REF_0);

                vis_mul8x16al(DST_2, CONST_1024, TMP30);
                vis_padd16(REF_2, TMP6, REF_2);

                vis_mul8x16al(DST_3, CONST_1024, TMP32);
                vis_padd16(REF_0, REF_4, REF_0);

                vis_padd16(REF_2, REF_6, REF_2);

                vis_padd16(REF_0, TMP30, REF_0);

                /* stall */

                vis_padd16(REF_2, TMP32, REF_2);
                vis_pack16(REF_0, DST_2);

                vis_pack16(REF_2, DST_3);
                vis_st64_2(DST_2, dest, 8);
                dest += stride;
        } while (--height);
}

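/* 8-pixel-wide variant of the averaging routine above: same interpolation
 * and destination blend, two output rows per loop iteration. */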
static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
        uint8_t *ref = (uint8_t *) _ref;
        unsigned long off = (unsigned long) ref & 0x7;
        unsigned long off_plus_1 = off + 1;
        int stride_8 = stride + 8;

        vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

        ref = vis_alignaddr(ref);

        vis_ld64(ref[0], TMP0);
        vis_fzero(ZERO);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64(constants6[0], CONST_6);

        vis_ld64(constants256_1024[0], CONST_256);
        vis_faligndata(TMP0, TMP2, REF_S0);

        if (off != 0x7) {
                vis_alignaddr_g0((void *)off_plus_1);
                vis_faligndata(TMP0, TMP2, REF_S2);
        } else {
                vis_src1(TMP2, REF_S2);
        }

        height >>= 1;
        do {    /* 31 cycles */
                vis_ld64_2(ref, stride, TMP0);
                vis_mul8x16au(REF_S0, CONST_256, TMP8);
                vis_pmerge(ZERO, REF_S0_1, TMP10);

                vis_ld64_2(ref, stride_8, TMP2);
                ref += stride;
                vis_mul8x16au(REF_S2, CONST_256, TMP12);
                vis_pmerge(ZERO, REF_S2_1, TMP14);

                vis_alignaddr_g0((void *)off);

                vis_ld64_2(ref, stride, TMP4);
                vis_faligndata(TMP0, TMP2, REF_S4);

                vis_ld64_2(ref, stride_8, TMP6);
                ref += stride;

                vis_ld64(dest[0], DST_0);
                vis_faligndata(TMP4, TMP6, REF_S0);

                vis_ld64_2(dest, stride, DST_2);

                if (off != 0x7) {
                        vis_alignaddr_g0((void *)off_plus_1);
                        vis_faligndata(TMP0, TMP2, REF_S6);
                        vis_faligndata(TMP4, TMP6, REF_S2);
                } else {
                        vis_src1(TMP2, REF_S6);
                        vis_src1(TMP6, REF_S2);
                }

                vis_mul8x16al(DST_0, CONST_1024, TMP30);
                vis_pmerge(ZERO, REF_S4, TMP22);

                vis_mul8x16al(DST_1, CONST_1024, TMP32);
                vis_pmerge(ZERO, REF_S4_1, TMP24);

                vis_mul8x16au(REF_S6, CONST_256, TMP26);
                vis_pmerge(ZERO, REF_S6_1, TMP28);

                vis_mul8x16au(REF_S0, CONST_256, REF_S4);
                vis_padd16(TMP22, CONST_6, TMP22);

                vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
                vis_padd16(TMP24, CONST_6, TMP24);

                vis_mul8x16al(DST_2, CONST_1024, REF_0);
                vis_padd16(TMP22, TMP26, TMP22);

                vis_mul8x16al(DST_3, CONST_1024, REF_2);
                vis_padd16(TMP24, TMP28, TMP24);

                vis_mul8x16au(REF_S2, CONST_256, TMP26);
                vis_padd16(TMP8, TMP22, TMP8);

                vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
                vis_padd16(TMP10, TMP24, TMP10);

                vis_padd16(TMP8, TMP12, TMP8);

                vis_padd16(TMP10, TMP14, TMP10);

                vis_padd16(TMP8, TMP30, TMP8);

                vis_padd16(TMP10, TMP32, TMP10);
                vis_pack16(TMP8, DST_0);

                vis_pack16(TMP10, DST_1);
                vis_st64(DST_0, dest[0]);
                dest += stride;

                vis_padd16(REF_S4, TMP22, TMP12);

                vis_padd16(REF_S6, TMP24, TMP14);

                vis_padd16(TMP12, TMP26, TMP12);

                vis_padd16(TMP14, TMP28, TMP14);

                vis_padd16(TMP12, REF_0, TMP12);

                vis_padd16(TMP14, REF_2, TMP14);
                vis_pack16(TMP12, DST_2);

                vis_pack16(TMP14, DST_3);
                vis_st64(DST_2, dest[0]);
                dest += stride;
        } while (--height);
}

/* End of no rounding code */

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

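/* Runtime CPU detection: vis_level() below installs this SIGILL handler,
 * then executes a VIS opcode (and a VIS2 opcode) emitted as a raw .word.
 * If the instruction is unsupported, the trap lands here and we siglongjmp
 * back, leaving the corresponding ACCEL_SPARC_VIS* bit unset. */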
static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}

#define ACCEL_SPARC_VIS 1
#define ACCEL_SPARC_VIS2 2

static int vis_level (void)
{
    int accel = 0;

    signal (SIGILL, sigill_handler);
    if (sigsetjmp (jmpbuf, 1)) {
        signal (SIGILL, SIG_DFL);
        return accel;
    }

    canjump = 1;

    /* pdist %f0, %f0, %f0 */
    __asm__ __volatile__(".word\t0x81b007c0");

    canjump = 0;
    accel |= ACCEL_SPARC_VIS;

    if (sigsetjmp (jmpbuf, 1)) {
        signal (SIGILL, SIG_DFL);
        return accel;
    }

    canjump = 1;

    /* edge8n %g0, %g0, %g0 */
    __asm__ __volatile__(".word\t0x81b00020");

    canjump = 0;
    accel |= ACCEL_SPARC_VIS2;

    signal (SIGILL, SIG_DFL);

    return accel;
}

/* libavcodec initialization code */
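/* The *_pixels_tab entries are filled so that the first index selects the
 * block width (0: 16 pixels, 1: 8 pixels) and the second the half-pel case,
 * matching the o/x/y/xy suffixes of the routines assigned below. */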
void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx)
{
    /* VIS specific optimisations */
    int accel = vis_level ();

    if (accel & ACCEL_SPARC_VIS) {
        c->put_pixels_tab[0][0] = MC_put_o_16_vis;
        c->put_pixels_tab[0][1] = MC_put_x_16_vis;
        c->put_pixels_tab[0][2] = MC_put_y_16_vis;
        c->put_pixels_tab[0][3] = MC_put_xy_16_vis;

        c->put_pixels_tab[1][0] = MC_put_o_8_vis;
        c->put_pixels_tab[1][1] = MC_put_x_8_vis;
        c->put_pixels_tab[1][2] = MC_put_y_8_vis;
        c->put_pixels_tab[1][3] = MC_put_xy_8_vis;

        c->avg_pixels_tab[0][0] = MC_avg_o_16_vis;
        c->avg_pixels_tab[0][1] = MC_avg_x_16_vis;
        c->avg_pixels_tab[0][2] = MC_avg_y_16_vis;
        c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis;

        c->avg_pixels_tab[1][0] = MC_avg_o_8_vis;
        c->avg_pixels_tab[1][1] = MC_avg_x_8_vis;
        c->avg_pixels_tab[1][2] = MC_avg_y_8_vis;
        c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis;

        c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis;
        c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis;
        c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis;
        c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis;

        c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis;
        c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis;
        c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis;
        c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis;

        c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis;
        c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis;
        c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis;
        c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis;

        c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis;
        c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis;
        c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis;
        c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis;
    }
}

#endif /* ARCH_SPARC */