Mercurial > libavcodec.hg
annotate sparc/dsputil_vis.c @ 11032:01bd040f8607 libavcodec
Unroll main loop so the edge==0 case is separate.
This allows many things to be simplified away.
h264 decoder is overall 1% faster with a mbaff sample and
0.1% slower with the cathedral sample, probably because the slow loop
filter code must be loaded into the code cache for each first MB of each
row but isn't used for the following MBs.
| author | michael |
|---|---|
| date | Thu, 28 Jan 2010 01:24:25 +0000 |
| parents | f9c847fb4839 |
| children | 766ca433df3b |
| rev | line source |
|---|---|
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1 /* |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2 * Copyright (C) 2003 David S. Miller <davem@redhat.com> |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3 * |
|
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
4 * This file is part of FFmpeg. |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
5 * |
|
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
6 * FFmpeg is free software; you can redistribute it and/or |
|
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
7 * modify it under the terms of the GNU Lesser General Public |
|
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
8 * License as published by the Free Software Foundation; either |
|
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
9 * version 2.1 of the License, or (at your option) any later version. |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
10 * |
|
3947
c8c591fe26f8
Change license headers to say 'FFmpeg' instead of 'this program/this library'
diego
parents:
3036
diff
changeset
|
11 * FFmpeg is distributed in the hope that it will be useful, |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
14 * Lesser General Public License for more details. |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
15 * |
|
3987
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
16 * You should have received a copy of the GNU Lesser General Public |
|
2c54309fef91
Switch to the LGPL as agreed to by the author according to the
diego
parents:
3947
diff
changeset
|
17 * License along with FFmpeg; if not, write to the Free Software |
|
3036
0b546eab515d
Update licensing information: The FSF changed postal address.
diego
parents:
2979
diff
changeset
|
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
19 */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
20 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
21 /* The *no_round* functions have been added by James A. Morrison, 2003,2004. |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
22 The vis code from libmpeg2 was adapted for ffmpeg by James A. Morrison. |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
23 */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
24 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
25 #include "config.h" |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
26 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
27 #include <inttypes.h> |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
28 |
| 6763 | 29 #include "libavcodec/dsputil.h" |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
30 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
31 #include "vis.h" |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
32 |
| 8250 | 33 void ff_simple_idct_put_vis(uint8_t *dest, int line_size, DCTELEM *data); |
| 34 void ff_simple_idct_add_vis(uint8_t *dest, int line_size, DCTELEM *data); | |
| 35 void ff_simple_idct_vis(DCTELEM *data); | |
| 5618 | 36 |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
37 /* The trick used in some of this file is the formula from the MMX |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
38 * motion comp code, which is: |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
39 * |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
40 * (x+y+1)>>1 == (x|y)-((x^y)>>1) |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
41 * |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
42 * This allows us to average 8 bytes at a time in a 64-bit FPU reg. |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
43 * We avoid overflows by masking before we do the shift, and we |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
44 * implement the shift by multiplying by 1/2 using mul8x16. So in |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
45 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
46 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
47 * the value 0x80808080 is in f8): |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
48 * |
| 2979 | 49 * fxor f0, f2, f10 |
| 50 * fand f10, f4, f10 | |
| 51 * fmul8x16 f8, f10, f10 | |
| 52 * fand f10, f6, f10 | |
| 53 * for f0, f2, f12 | |
| 54 * fpsub16 f12, f10, f10 | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
55 */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
56 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
57 #define ATTR_ALIGN(alignd) __attribute__ ((aligned(alignd))) |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
58 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
59 #define DUP4(x) {x, x, x, x} |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
60 #define DUP8(x) {x, x, x, x, x, x, x, x} |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
61 static const int16_t constants1[] ATTR_ALIGN(8) = DUP4 (1); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
62 static const int16_t constants2[] ATTR_ALIGN(8) = DUP4 (2); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
63 static const int16_t constants3[] ATTR_ALIGN(8) = DUP4 (3); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
64 static const int16_t constants6[] ATTR_ALIGN(8) = DUP4 (6); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
65 static const int8_t constants_fe[] ATTR_ALIGN(8) = DUP8 (0xfe); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
66 static const int8_t constants_7f[] ATTR_ALIGN(8) = DUP8 (0x7f); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
67 static const int8_t constants128[] ATTR_ALIGN(8) = DUP8 (128); |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
68 static const int16_t constants256_512[] ATTR_ALIGN(8) = |
| 2979 | 69 {256, 512, 256, 512}; |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
70 static const int16_t constants256_1024[] ATTR_ALIGN(8) = |
| 2979 | 71 {256, 1024, 256, 1024}; |
| 72 | |
| 73 #define REF_0 0 | |
| 74 #define REF_0_1 1 | |
| 75 #define REF_2 2 | |
| 76 #define REF_2_1 3 | |
| 77 #define REF_4 4 | |
| 78 #define REF_4_1 5 | |
| 79 #define REF_6 6 | |
| 80 #define REF_6_1 7 | |
| 81 #define REF_S0 8 | |
| 82 #define REF_S0_1 9 | |
| 83 #define REF_S2 10 | |
| 84 #define REF_S2_1 11 | |
| 85 #define REF_S4 12 | |
| 86 #define REF_S4_1 13 | |
| 87 #define REF_S6 14 | |
| 88 #define REF_S6_1 15 | |
| 89 #define DST_0 16 | |
| 90 #define DST_1 17 | |
| 91 #define DST_2 18 | |
| 92 #define DST_3 19 | |
| 93 #define CONST_1 20 | |
| 94 #define CONST_2 20 | |
| 95 #define CONST_3 20 | |
| 96 #define CONST_6 20 | |
| 97 #define MASK_fe 20 | |
| 98 #define CONST_128 22 | |
| 99 #define CONST_256 22 | |
| 100 #define CONST_512 22 | |
| 101 #define CONST_1024 22 | |
| 102 #define TMP0 24 | |
| 103 #define TMP1 25 | |
| 104 #define TMP2 26 | |
| 105 #define TMP3 27 | |
| 106 #define TMP4 28 | |
| 107 #define TMP5 29 | |
| 108 #define ZERO 30 | |
| 109 #define MASK_7f 30 | |
| 110 | |
| 111 #define TMP6 32 | |
| 112 #define TMP8 34 | |
| 113 #define TMP10 36 | |
| 114 #define TMP12 38 | |
| 115 #define TMP14 40 | |
| 116 #define TMP16 42 | |
| 117 #define TMP18 44 | |
| 118 #define TMP20 46 | |
| 119 #define TMP22 48 | |
| 120 #define TMP24 50 | |
| 121 #define TMP26 52 | |
| 122 #define TMP28 54 | |
| 123 #define TMP30 56 | |
| 124 #define TMP32 58 | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
125 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Copy ("put", no interpolation) a 16-pixel-wide, height-row block from
 * an arbitrarily aligned source to the 8-byte aligned dest.
 * vis_alignaddr rounds ref down to 8 bytes and records the byte shift
 * (in the GSR, per the VIS spec); each row then loads three 8-byte
 * words and extracts the two aligned output words with faligndata. */
static void MC_put_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    /* Cast away const: the vis.h load/alignaddr macros take a
     * non-const pointer. */
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do {    /* 5 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64_2(ref, 8, TMP2);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);

        vis_faligndata(TMP2, TMP4, REF_2);
        vis_st64_2(REF_2, dest, 8);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
148 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Copy ("put", no interpolation) an 8-pixel-wide, height-row block from
 * an arbitrarily aligned source to the 8-byte aligned dest.  Same
 * scheme as MC_put_o_16_vis but only one output word per row. */
static void MC_put_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    /* Cast away const for the vis.h macros. */
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);
    do {    /* 4 cycles */
        vis_ld64(ref[0], TMP0);

        vis_ld64(ref[8], TMP2);
        ref += stride;

        /* stall */

        vis_faligndata(TMP0, TMP2, REF_0);
        vis_st64(REF_0, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
168 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
169 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average ("avg") a 16-pixel-wide block into dest:
 *   dest = rounded_avg(dest, ref)
 * using the (x|y) - (((x^y) & 0xfe) >> 1) formula from the header
 * comment of this file (the >>1 is done as mul8x16 by 0x80, then the
 * 0x7f mask).  The loop is software-pipelined and unrolled two rows per
 * iteration, with the final two rows peeled off after the loop, so
 * height is assumed even and >= 4 (height = (height>>1) - 1 loop
 * trips + 2 peeled rows). */
static void MC_avg_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    /* Cast away const for the vis.h macros. */
    uint8_t *ref = (uint8_t *) _ref;
    /* Offset of the second 8-byte word of the next dest row; used to
     * prefetch DST_2 with a single vis_ld64_2. */
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_0/REF_2 (aligned ref row) and DST_0/DST_2
     * (current dest row), and load the three constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 24 cycles */
        /* Row A: average REF_0/REF_2 with DST_0/DST_2 while loading
         * the next ref row (TMP0..TMP4) and the next dest row. */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_or(DST_0, REF_0, TMP10);
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_or(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        /* Row B: same computation on the ref row loaded during row A
         * (TMP14..TMP18), overlapped with loading the row after it. */
        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_or(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_or(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_psub16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_psub16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, identical computation but without
     * prefetching beyond the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_or(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_or(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_or(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_or(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_psub16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_psub16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
330 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average ("avg") an 8-pixel-wide block into dest:
 *   dest = rounded_avg(dest, ref)
 * 8-wide counterpart of MC_avg_o_16_vis: same (x|y)-(((x^y)&0xfe)>>1)
 * formula, software-pipelined two rows per iteration with the last two
 * rows peeled after the loop (height assumed even and >= 4). */
static void MC_avg_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    /* Cast away const for the vis.h macros. */
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_0 (aligned ref row), DST_0 (dest row) and
     * the constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 12 cycles */
        /* Row A uses TMP4/TMP6, row B reuses TMP0 as scratch once the
         * next ref row has been consumed by faligndata. */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_or(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_psub16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows without prefetching past the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_or(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_psub16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
424 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Horizontal ("x") half-pel put for a 16-pixel-wide block:
 *   dest = rounded_avg(ref, ref + 1)
 * REF_0/REF_4 hold the ref row aligned at byte offset `off`,
 * REF_2/REF_6 the same row shifted one byte further (off_plus_1), and
 * the two are averaged with the (x|y)-(((x^y)&0xfe)>>1) trick from the
 * file header.  When off == 7, off+1 is a whole-word shift that
 * alignaddr cannot express, so vis_src1 supplies the next source word
 * directly instead of faligndata.  Two rows per loop iteration, last
 * two rows peeled (height assumed even and >= 4). */
static void MC_put_x_16_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    /* Cast away const for the vis.h macros. */
    uint8_t *ref = (uint8_t *) _ref;
    /* Byte misalignment of ref within its 8-byte word, and the shift
     * for the one-byte-right neighbour. */
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_0..REF_6 for the first row and load the
     * constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64_2(ref, 16, TMP4);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 34 cycles */
        /* Row A: average the REF registers primed for this row while
         * loading the next two ref rows (TMP0..TMP4 and TMP14..TMP18). */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP6);

        vis_ld64_2(ref, 8, TMP2);
        vis_xor(REF_4, REF_6, TMP8);

        vis_ld64_2(ref, 16, TMP4);
        vis_and(TMP6, MASK_fe, TMP6);
        ref += stride;

        vis_ld64(ref[0], TMP14);
        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_ld64_2(ref, 8, TMP16);
        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_ld64_2(ref, 16, TMP18);
        ref += stride;
        vis_or(REF_4, REF_6, TMP12);

        /* Re-prime the REF registers for row B: GSR shift must be
         * reset to `off` before each faligndata pair. */
        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP0, TMP2, REF_0);

        vis_faligndata(TMP2, TMP4, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;

        /* Row B: same computation on the registers just re-primed,
         * then prime from TMP14..TMP18 for the next iteration. */
        vis_xor(REF_0, REF_2, TMP6);

        vis_xor(REF_4, REF_6, TMP8);

        vis_and(TMP6, MASK_fe, TMP6);

        vis_mul8x16(CONST_128, TMP6, TMP6);
        vis_and(TMP8, MASK_fe, TMP8);

        vis_mul8x16(CONST_128, TMP8, TMP8);
        vis_or(REF_0, REF_2, TMP10);

        vis_or(REF_4, REF_6, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_faligndata(TMP14, TMP16, REF_0);

        vis_faligndata(TMP16, TMP18, REF_4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP14, TMP16, REF_2);
            vis_faligndata(TMP16, TMP18, REF_6);
        } else {
            vis_src1(TMP16, REF_2);
            vis_src1(TMP18, REF_6);
        }

        vis_and(TMP6, MASK_7f, TMP6);

        vis_and(TMP8, MASK_7f, TMP8);

        vis_psub16(TMP10, TMP6, TMP6);
        vis_st64(TMP6, dest[0]);

        vis_psub16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, no prefetch beyond the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_xor(REF_4, REF_6, TMP8);

    vis_ld64_2(ref, 16, TMP4);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
        vis_faligndata(TMP2, TMP4, REF_6);
    } else {
        vis_src1(TMP2, REF_2);
        vis_src1(TMP4, REF_6);
    }

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP6);

    vis_xor(REF_4, REF_6, TMP8);

    vis_and(TMP6, MASK_fe, TMP6);

    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_and(TMP8, MASK_fe, TMP8);

    vis_mul8x16(CONST_128, TMP8, TMP8);
    vis_or(REF_0, REF_2, TMP10);

    vis_or(REF_4, REF_6, TMP12);

    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_psub16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_psub16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
617 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Horizontal ("x") half-pel put for an 8-pixel-wide block:
 *   dest = rounded_avg(ref, ref + 1)
 * 8-wide counterpart of MC_put_x_16_vis: REF_0 is the ref row aligned
 * at `off`, REF_2 the same row shifted one byte (off_plus_1), averaged
 * with the (x|y)-(((x^y)&0xfe)>>1) trick.  off == 7 means a whole-word
 * shift, handled with vis_src1 instead of faligndata.  Two rows per
 * loop iteration, final two rows peeled (height assumed even, >= 4). */
static void MC_put_x_8_vis (uint8_t * dest, const uint8_t * _ref,
                            const int stride, int height)
{
    /* Cast away const for the vis.h macros. */
    uint8_t *ref = (uint8_t *) _ref;
    /* Byte misalignment of ref, and the shift for the +1 neighbour. */
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_0/REF_2 for the first row, load constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);

    vis_ld64(constants128[0], CONST_128);
    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    ref += stride;
    height = (height >> 1) - 1;

    do {    /* 20 cycles */
        /* Row A: average current REF_0/REF_2 while loading the next
         * two ref rows (TMP0/TMP2 and TMP8/TMP10). */
        vis_ld64(ref[0], TMP0);
        vis_xor(REF_0, REF_2, TMP4);

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP4, MASK_fe, TMP4);
        ref += stride;

        vis_ld64(ref[0], TMP8);
        vis_or(REF_0, REF_2, TMP6);
        vis_mul8x16(CONST_128, TMP4, TMP4);

        /* GSR shift must be restored to `off` before each
         * faligndata. */
        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, 8, TMP10);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
        } else {
            vis_src1(TMP2, REF_2);
        }

        vis_and(TMP4, MASK_7f, TMP4);

        vis_psub16(TMP6, TMP4, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        /* Row B: same on the re-primed registers, then prime from
         * TMP8/TMP10 for the next iteration. */
        vis_xor(REF_0, REF_2, TMP12);

        vis_and(TMP12, MASK_fe, TMP12);

        vis_or(REF_0, REF_2, TMP14);
        vis_mul8x16(CONST_128, TMP12, TMP12);

        vis_alignaddr_g0((void *)off);
        vis_faligndata(TMP8, TMP10, REF_0);
        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP8, TMP10, REF_2);
        } else {
            vis_src1(TMP10, REF_2);
        }

        vis_and(TMP12, MASK_7f, TMP12);

        vis_psub16(TMP14, TMP12, DST_0);
        vis_st64(DST_0, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, no prefetch past the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(REF_0, REF_2, TMP4);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_or(REF_0, REF_2, TMP6);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_alignaddr_g0((void *)off);

    vis_faligndata(TMP0, TMP2, REF_0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_2);
    } else {
        vis_src1(TMP2, REF_2);
    }

    vis_and(TMP4, MASK_7f, TMP4);

    vis_psub16(TMP6, TMP4, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;

    vis_xor(REF_0, REF_2, TMP12);

    vis_and(TMP12, MASK_fe, TMP12);

    vis_or(REF_0, REF_2, TMP14);
    vis_mul8x16(CONST_128, TMP12, TMP12);

    vis_and(TMP12, MASK_7f, TMP12);

    vis_psub16(TMP14, TMP12, DST_0);
    vis_st64(DST_0, dest[0]);
    dest += stride;
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
741 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
742 static void MC_avg_x_16_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 743 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
744 { |
| 2979 | 745 uint8_t *ref = (uint8_t *) _ref; |
| 746 unsigned long off = (unsigned long) ref & 0x7; | |
| 747 unsigned long off_plus_1 = off + 1; | |
| 748 | |
| 749 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 750 | |
| 751 vis_ld64(constants3[0], CONST_3); | |
| 752 vis_fzero(ZERO); | |
| 753 vis_ld64(constants256_512[0], CONST_256); | |
| 754 | |
| 755 ref = vis_alignaddr(ref); | |
| 756 do { /* 26 cycles */ | |
| 757 vis_ld64(ref[0], TMP0); | |
| 758 | |
| 759 vis_ld64(ref[8], TMP2); | |
| 760 | |
| 761 vis_alignaddr_g0((void *)off); | |
| 762 | |
| 763 vis_ld64(ref[16], TMP4); | |
| 764 | |
| 765 vis_ld64(dest[0], DST_0); | |
| 766 vis_faligndata(TMP0, TMP2, REF_0); | |
| 767 | |
| 768 vis_ld64(dest[8], DST_2); | |
| 769 vis_faligndata(TMP2, TMP4, REF_4); | |
| 770 | |
| 771 if (off != 0x7) { | |
| 772 vis_alignaddr_g0((void *)off_plus_1); | |
| 773 vis_faligndata(TMP0, TMP2, REF_2); | |
| 774 vis_faligndata(TMP2, TMP4, REF_6); | |
| 775 } else { | |
| 776 vis_src1(TMP2, REF_2); | |
| 777 vis_src1(TMP4, REF_6); | |
| 778 } | |
| 779 | |
| 780 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
| 781 | |
| 782 vis_pmerge(ZERO, REF_2, TMP4); | |
| 783 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 784 | |
| 785 vis_pmerge(ZERO, REF_2_1, TMP6); | |
| 786 | |
| 787 vis_padd16(TMP0, TMP4, TMP0); | |
| 788 | |
| 789 vis_mul8x16al(DST_0, CONST_512, TMP4); | |
| 790 vis_padd16(TMP2, TMP6, TMP2); | |
| 791 | |
| 792 vis_mul8x16al(DST_1, CONST_512, TMP6); | |
| 793 | |
| 794 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
| 795 | |
| 796 vis_padd16(TMP0, TMP4, TMP0); | |
| 797 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
| 798 | |
| 799 vis_padd16(TMP2, TMP6, TMP2); | |
| 800 vis_mul8x16au(REF_4, CONST_256, TMP16); | |
| 801 | |
| 802 vis_padd16(TMP0, CONST_3, TMP8); | |
| 803 vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
| 804 | |
| 805 vis_padd16(TMP2, CONST_3, TMP10); | |
| 806 vis_pack16(TMP8, DST_0); | |
| 807 | |
| 808 vis_pack16(TMP10, DST_1); | |
| 809 vis_padd16(TMP16, TMP12, TMP0); | |
| 810 | |
| 811 vis_st64(DST_0, dest[0]); | |
| 812 vis_mul8x16al(DST_2, CONST_512, TMP4); | |
| 813 vis_padd16(TMP18, TMP14, TMP2); | |
| 814 | |
| 815 vis_mul8x16al(DST_3, CONST_512, TMP6); | |
| 816 vis_padd16(TMP0, CONST_3, TMP0); | |
| 817 | |
| 818 vis_padd16(TMP2, CONST_3, TMP2); | |
| 819 | |
| 820 vis_padd16(TMP0, TMP4, TMP0); | |
| 821 | |
| 822 vis_padd16(TMP2, TMP6, TMP2); | |
| 823 vis_pack16(TMP0, DST_2); | |
| 824 | |
| 825 vis_pack16(TMP2, DST_3); | |
| 826 vis_st64(DST_2, dest[8]); | |
| 827 | |
| 828 ref += stride; | |
| 829 dest += stride; | |
| 830 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
831 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
832 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
833 static void MC_avg_x_8_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 834 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
835 { |
| 2979 | 836 uint8_t *ref = (uint8_t *) _ref; |
| 837 unsigned long off = (unsigned long) ref & 0x7; | |
| 838 unsigned long off_plus_1 = off + 1; | |
| 839 int stride_times_2 = stride << 1; | |
| 840 | |
| 841 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 842 | |
| 843 vis_ld64(constants3[0], CONST_3); | |
| 844 vis_fzero(ZERO); | |
| 845 vis_ld64(constants256_512[0], CONST_256); | |
| 846 | |
| 847 ref = vis_alignaddr(ref); | |
| 848 height >>= 2; | |
| 849 do { /* 47 cycles */ | |
| 850 vis_ld64(ref[0], TMP0); | |
| 851 | |
| 852 vis_ld64_2(ref, 8, TMP2); | |
| 853 ref += stride; | |
| 854 | |
| 855 vis_alignaddr_g0((void *)off); | |
| 856 | |
| 857 vis_ld64(ref[0], TMP4); | |
| 858 vis_faligndata(TMP0, TMP2, REF_0); | |
| 859 | |
| 860 vis_ld64_2(ref, 8, TMP6); | |
| 861 ref += stride; | |
| 862 | |
| 863 vis_ld64(ref[0], TMP8); | |
| 864 | |
| 865 vis_ld64_2(ref, 8, TMP10); | |
| 866 ref += stride; | |
| 867 vis_faligndata(TMP4, TMP6, REF_4); | |
| 868 | |
| 869 vis_ld64(ref[0], TMP12); | |
| 870 | |
| 871 vis_ld64_2(ref, 8, TMP14); | |
| 872 ref += stride; | |
| 873 vis_faligndata(TMP8, TMP10, REF_S0); | |
| 874 | |
| 875 vis_faligndata(TMP12, TMP14, REF_S4); | |
| 876 | |
| 877 if (off != 0x7) { | |
| 878 vis_alignaddr_g0((void *)off_plus_1); | |
| 879 | |
| 880 vis_ld64(dest[0], DST_0); | |
| 881 vis_faligndata(TMP0, TMP2, REF_2); | |
| 882 | |
| 883 vis_ld64_2(dest, stride, DST_2); | |
| 884 vis_faligndata(TMP4, TMP6, REF_6); | |
| 885 | |
| 886 vis_faligndata(TMP8, TMP10, REF_S2); | |
| 887 | |
| 888 vis_faligndata(TMP12, TMP14, REF_S6); | |
| 889 } else { | |
| 890 vis_ld64(dest[0], DST_0); | |
| 891 vis_src1(TMP2, REF_2); | |
| 892 | |
| 893 vis_ld64_2(dest, stride, DST_2); | |
| 894 vis_src1(TMP6, REF_6); | |
| 895 | |
| 896 vis_src1(TMP10, REF_S2); | |
| 897 | |
| 898 vis_src1(TMP14, REF_S6); | |
| 899 } | |
| 900 | |
| 901 vis_pmerge(ZERO, REF_0, TMP0); | |
| 902 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 903 | |
| 904 vis_pmerge(ZERO, REF_2, TMP4); | |
| 905 vis_mul8x16au(REF_2_1, CONST_256, TMP6); | |
| 906 | |
| 907 vis_padd16(TMP0, CONST_3, TMP0); | |
| 908 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
| 909 | |
| 910 vis_padd16(TMP2, CONST_3, TMP2); | |
| 911 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
| 912 | |
| 913 vis_padd16(TMP0, TMP4, TMP0); | |
| 914 vis_mul8x16au(REF_4, CONST_256, TMP8); | |
| 915 | |
| 916 vis_padd16(TMP2, TMP6, TMP2); | |
| 917 vis_mul8x16au(REF_4_1, CONST_256, TMP10); | |
| 918 | |
| 919 vis_padd16(TMP0, TMP16, TMP0); | |
| 920 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
| 921 | |
| 922 vis_padd16(TMP2, TMP18, TMP2); | |
| 923 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
| 924 | |
| 925 vis_padd16(TMP8, CONST_3, TMP8); | |
| 926 vis_mul8x16al(DST_2, CONST_512, TMP16); | |
| 927 | |
| 928 vis_padd16(TMP8, TMP12, TMP8); | |
| 929 vis_mul8x16al(DST_3, CONST_512, TMP18); | |
| 930 | |
| 931 vis_padd16(TMP10, TMP14, TMP10); | |
| 932 vis_pack16(TMP0, DST_0); | |
| 933 | |
| 934 vis_pack16(TMP2, DST_1); | |
| 935 vis_st64(DST_0, dest[0]); | |
| 936 dest += stride; | |
| 937 vis_padd16(TMP10, CONST_3, TMP10); | |
| 938 | |
| 939 vis_ld64_2(dest, stride, DST_0); | |
| 940 vis_padd16(TMP8, TMP16, TMP8); | |
| 941 | |
| 942 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); | |
| 943 vis_padd16(TMP10, TMP18, TMP10); | |
| 944 vis_pack16(TMP8, DST_2); | |
| 945 | |
| 946 vis_pack16(TMP10, DST_3); | |
| 947 vis_st64(DST_2, dest[0]); | |
| 948 dest += stride; | |
| 949 | |
| 950 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
| 951 vis_pmerge(ZERO, REF_S0, TMP0); | |
| 952 | |
| 953 vis_pmerge(ZERO, REF_S2, TMP24); | |
| 954 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
| 955 | |
| 956 vis_padd16(TMP0, CONST_3, TMP0); | |
| 957 vis_mul8x16au(REF_S4, CONST_256, TMP8); | |
| 958 | |
| 959 vis_padd16(TMP2, CONST_3, TMP2); | |
| 960 vis_mul8x16au(REF_S4_1, CONST_256, TMP10); | |
| 961 | |
| 962 vis_padd16(TMP0, TMP24, TMP0); | |
| 963 vis_mul8x16au(REF_S6, CONST_256, TMP12); | |
| 964 | |
| 965 vis_padd16(TMP2, TMP6, TMP2); | |
| 966 vis_mul8x16au(REF_S6_1, CONST_256, TMP14); | |
| 967 | |
| 968 vis_padd16(TMP8, CONST_3, TMP8); | |
| 969 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
| 970 | |
| 971 vis_padd16(TMP10, CONST_3, TMP10); | |
| 972 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
| 973 | |
| 974 vis_padd16(TMP8, TMP12, TMP8); | |
| 975 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); | |
| 976 | |
| 977 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); | |
| 978 vis_padd16(TMP0, TMP16, TMP0); | |
| 979 | |
| 980 vis_padd16(TMP2, TMP18, TMP2); | |
| 981 vis_pack16(TMP0, DST_0); | |
| 982 | |
| 983 vis_padd16(TMP10, TMP14, TMP10); | |
| 984 vis_pack16(TMP2, DST_1); | |
| 985 vis_st64(DST_0, dest[0]); | |
| 986 dest += stride; | |
| 987 | |
| 988 vis_padd16(TMP8, TMP20, TMP8); | |
| 989 | |
| 990 vis_padd16(TMP10, TMP22, TMP10); | |
| 991 vis_pack16(TMP8, DST_2); | |
| 992 | |
| 993 vis_pack16(TMP10, DST_3); | |
| 994 vis_st64(DST_2, dest[0]); | |
| 995 dest += stride; | |
| 996 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
997 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
998 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
999 static void MC_put_y_16_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 1000 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1001 { |
| 2979 | 1002 uint8_t *ref = (uint8_t *) _ref; |
| 1003 | |
| 1004 ref = vis_alignaddr(ref); | |
| 1005 vis_ld64(ref[0], TMP0); | |
| 1006 | |
| 1007 vis_ld64_2(ref, 8, TMP2); | |
| 1008 | |
| 1009 vis_ld64_2(ref, 16, TMP4); | |
| 1010 ref += stride; | |
| 1011 | |
| 1012 vis_ld64(ref[0], TMP6); | |
| 1013 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1014 | |
| 1015 vis_ld64_2(ref, 8, TMP8); | |
| 1016 vis_faligndata(TMP2, TMP4, REF_4); | |
| 1017 | |
| 1018 vis_ld64_2(ref, 16, TMP10); | |
| 1019 ref += stride; | |
| 1020 | |
| 1021 vis_ld64(constants_fe[0], MASK_fe); | |
| 1022 vis_faligndata(TMP6, TMP8, REF_2); | |
| 1023 | |
| 1024 vis_ld64(constants_7f[0], MASK_7f); | |
| 1025 vis_faligndata(TMP8, TMP10, REF_6); | |
| 1026 | |
| 1027 vis_ld64(constants128[0], CONST_128); | |
| 1028 height = (height >> 1) - 1; | |
| 1029 do { /* 24 cycles */ | |
| 1030 vis_ld64(ref[0], TMP0); | |
| 1031 vis_xor(REF_0, REF_2, TMP12); | |
| 1032 | |
| 1033 vis_ld64_2(ref, 8, TMP2); | |
| 1034 vis_xor(REF_4, REF_6, TMP16); | |
| 1035 | |
| 1036 vis_ld64_2(ref, 16, TMP4); | |
| 1037 ref += stride; | |
| 1038 vis_or(REF_0, REF_2, TMP14); | |
| 1039 | |
| 1040 vis_ld64(ref[0], TMP6); | |
| 1041 vis_or(REF_4, REF_6, TMP18); | |
| 1042 | |
| 1043 vis_ld64_2(ref, 8, TMP8); | |
| 1044 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1045 | |
| 1046 vis_ld64_2(ref, 16, TMP10); | |
| 1047 ref += stride; | |
| 1048 vis_faligndata(TMP2, TMP4, REF_4); | |
| 1049 | |
| 1050 vis_and(TMP12, MASK_fe, TMP12); | |
| 1051 | |
| 1052 vis_and(TMP16, MASK_fe, TMP16); | |
| 1053 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 1054 | |
| 1055 vis_mul8x16(CONST_128, TMP16, TMP16); | |
| 1056 vis_xor(REF_0, REF_2, TMP0); | |
| 1057 | |
| 1058 vis_xor(REF_4, REF_6, TMP2); | |
| 1059 | |
| 1060 vis_or(REF_0, REF_2, TMP20); | |
| 1061 | |
| 1062 vis_and(TMP12, MASK_7f, TMP12); | |
| 1063 | |
| 1064 vis_and(TMP16, MASK_7f, TMP16); | |
| 1065 | |
| 1066 vis_psub16(TMP14, TMP12, TMP12); | |
| 1067 vis_st64(TMP12, dest[0]); | |
| 1068 | |
| 1069 vis_psub16(TMP18, TMP16, TMP16); | |
| 1070 vis_st64_2(TMP16, dest, 8); | |
| 1071 dest += stride; | |
| 1072 | |
| 1073 vis_or(REF_4, REF_6, TMP18); | |
| 1074 | |
| 1075 vis_and(TMP0, MASK_fe, TMP0); | |
| 1076 | |
| 1077 vis_and(TMP2, MASK_fe, TMP2); | |
| 1078 vis_mul8x16(CONST_128, TMP0, TMP0); | |
| 1079 | |
| 1080 vis_faligndata(TMP6, TMP8, REF_2); | |
| 1081 vis_mul8x16(CONST_128, TMP2, TMP2); | |
| 1082 | |
| 1083 vis_faligndata(TMP8, TMP10, REF_6); | |
| 1084 | |
| 1085 vis_and(TMP0, MASK_7f, TMP0); | |
| 1086 | |
| 1087 vis_and(TMP2, MASK_7f, TMP2); | |
| 1088 | |
| 1089 vis_psub16(TMP20, TMP0, TMP0); | |
| 1090 vis_st64(TMP0, dest[0]); | |
| 1091 | |
| 1092 vis_psub16(TMP18, TMP2, TMP2); | |
| 1093 vis_st64_2(TMP2, dest, 8); | |
| 1094 dest += stride; | |
| 1095 } while (--height); | |
| 1096 | |
| 1097 vis_ld64(ref[0], TMP0); | |
| 1098 vis_xor(REF_0, REF_2, TMP12); | |
| 1099 | |
| 1100 vis_ld64_2(ref, 8, TMP2); | |
| 1101 vis_xor(REF_4, REF_6, TMP16); | |
| 1102 | |
| 1103 vis_ld64_2(ref, 16, TMP4); | |
| 1104 vis_or(REF_0, REF_2, TMP14); | |
| 1105 | |
| 1106 vis_or(REF_4, REF_6, TMP18); | |
| 1107 | |
| 1108 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1109 | |
| 1110 vis_faligndata(TMP2, TMP4, REF_4); | |
| 1111 | |
| 1112 vis_and(TMP12, MASK_fe, TMP12); | |
| 1113 | |
| 1114 vis_and(TMP16, MASK_fe, TMP16); | |
| 1115 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 1116 | |
| 1117 vis_mul8x16(CONST_128, TMP16, TMP16); | |
| 1118 vis_xor(REF_0, REF_2, TMP0); | |
| 1119 | |
| 1120 vis_xor(REF_4, REF_6, TMP2); | |
| 1121 | |
| 1122 vis_or(REF_0, REF_2, TMP20); | |
| 1123 | |
| 1124 vis_and(TMP12, MASK_7f, TMP12); | |
| 1125 | |
| 1126 vis_and(TMP16, MASK_7f, TMP16); | |
| 1127 | |
| 1128 vis_psub16(TMP14, TMP12, TMP12); | |
| 1129 vis_st64(TMP12, dest[0]); | |
| 1130 | |
| 1131 vis_psub16(TMP18, TMP16, TMP16); | |
| 1132 vis_st64_2(TMP16, dest, 8); | |
| 1133 dest += stride; | |
| 1134 | |
| 1135 vis_or(REF_4, REF_6, TMP18); | |
| 1136 | |
| 1137 vis_and(TMP0, MASK_fe, TMP0); | |
| 1138 | |
| 1139 vis_and(TMP2, MASK_fe, TMP2); | |
| 1140 vis_mul8x16(CONST_128, TMP0, TMP0); | |
| 1141 | |
| 1142 vis_mul8x16(CONST_128, TMP2, TMP2); | |
| 1143 | |
| 1144 vis_and(TMP0, MASK_7f, TMP0); | |
| 1145 | |
| 1146 vis_and(TMP2, MASK_7f, TMP2); | |
| 1147 | |
| 1148 vis_psub16(TMP20, TMP0, TMP0); | |
| 1149 vis_st64(TMP0, dest[0]); | |
| 1150 | |
| 1151 vis_psub16(TMP18, TMP2, TMP2); | |
| 1152 vis_st64_2(TMP2, dest, 8); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1153 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1154 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1155 static void MC_put_y_8_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 1156 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1157 { |
| 2979 | 1158 uint8_t *ref = (uint8_t *) _ref; |
| 1159 | |
| 1160 ref = vis_alignaddr(ref); | |
| 1161 vis_ld64(ref[0], TMP0); | |
| 1162 | |
| 1163 vis_ld64_2(ref, 8, TMP2); | |
| 1164 ref += stride; | |
| 1165 | |
| 1166 vis_ld64(ref[0], TMP4); | |
| 1167 | |
| 1168 vis_ld64_2(ref, 8, TMP6); | |
| 1169 ref += stride; | |
| 1170 | |
| 1171 vis_ld64(constants_fe[0], MASK_fe); | |
| 1172 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1173 | |
| 1174 vis_ld64(constants_7f[0], MASK_7f); | |
| 1175 vis_faligndata(TMP4, TMP6, REF_2); | |
| 1176 | |
| 1177 vis_ld64(constants128[0], CONST_128); | |
| 1178 height = (height >> 1) - 1; | |
| 1179 do { /* 12 cycles */ | |
| 1180 vis_ld64(ref[0], TMP0); | |
| 1181 vis_xor(REF_0, REF_2, TMP4); | |
| 1182 | |
| 1183 vis_ld64_2(ref, 8, TMP2); | |
| 1184 ref += stride; | |
| 1185 vis_and(TMP4, MASK_fe, TMP4); | |
| 1186 | |
| 1187 vis_or(REF_0, REF_2, TMP6); | |
| 1188 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 1189 | |
| 1190 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1191 vis_ld64(ref[0], TMP0); | |
| 1192 | |
| 1193 vis_ld64_2(ref, 8, TMP2); | |
| 1194 ref += stride; | |
| 1195 vis_xor(REF_0, REF_2, TMP12); | |
| 1196 | |
| 1197 vis_and(TMP4, MASK_7f, TMP4); | |
| 1198 | |
| 1199 vis_and(TMP12, MASK_fe, TMP12); | |
| 1200 | |
| 1201 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 1202 vis_or(REF_0, REF_2, TMP14); | |
| 1203 | |
| 1204 vis_psub16(TMP6, TMP4, DST_0); | |
| 1205 vis_st64(DST_0, dest[0]); | |
| 1206 dest += stride; | |
| 1207 | |
| 1208 vis_faligndata(TMP0, TMP2, REF_2); | |
| 1209 | |
| 1210 vis_and(TMP12, MASK_7f, TMP12); | |
| 1211 | |
| 1212 vis_psub16(TMP14, TMP12, DST_0); | |
| 1213 vis_st64(DST_0, dest[0]); | |
| 1214 dest += stride; | |
| 1215 } while (--height); | |
| 1216 | |
| 1217 vis_ld64(ref[0], TMP0); | |
| 1218 vis_xor(REF_0, REF_2, TMP4); | |
| 1219 | |
| 1220 vis_ld64_2(ref, 8, TMP2); | |
| 1221 vis_and(TMP4, MASK_fe, TMP4); | |
| 1222 | |
| 1223 vis_or(REF_0, REF_2, TMP6); | |
| 1224 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 1225 | |
| 1226 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1227 | |
| 1228 vis_xor(REF_0, REF_2, TMP12); | |
| 1229 | |
| 1230 vis_and(TMP4, MASK_7f, TMP4); | |
| 1231 | |
| 1232 vis_and(TMP12, MASK_fe, TMP12); | |
| 1233 | |
| 1234 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 1235 vis_or(REF_0, REF_2, TMP14); | |
| 1236 | |
| 1237 vis_psub16(TMP6, TMP4, DST_0); | |
| 1238 vis_st64(DST_0, dest[0]); | |
| 1239 dest += stride; | |
| 1240 | |
| 1241 vis_and(TMP12, MASK_7f, TMP12); | |
| 1242 | |
| 1243 vis_psub16(TMP14, TMP12, DST_0); | |
| 1244 vis_st64(DST_0, dest[0]); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1245 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1246 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1247 static void MC_avg_y_16_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 1248 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1249 { |
| 2979 | 1250 uint8_t *ref = (uint8_t *) _ref; |
| 1251 int stride_8 = stride + 8; | |
| 1252 int stride_16 = stride + 16; | |
| 1253 | |
| 1254 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 1255 | |
| 1256 ref = vis_alignaddr(ref); | |
| 1257 | |
| 1258 vis_ld64(ref[ 0], TMP0); | |
| 1259 vis_fzero(ZERO); | |
| 1260 | |
| 1261 vis_ld64(ref[ 8], TMP2); | |
| 1262 | |
| 1263 vis_ld64(ref[16], TMP4); | |
| 1264 | |
| 1265 vis_ld64(constants3[0], CONST_3); | |
| 1266 vis_faligndata(TMP0, TMP2, REF_2); | |
| 1267 | |
| 1268 vis_ld64(constants256_512[0], CONST_256); | |
| 1269 vis_faligndata(TMP2, TMP4, REF_6); | |
| 1270 height >>= 1; | |
| 1271 | |
| 1272 do { /* 31 cycles */ | |
| 1273 vis_ld64_2(ref, stride, TMP0); | |
| 1274 vis_pmerge(ZERO, REF_2, TMP12); | |
| 1275 vis_mul8x16au(REF_2_1, CONST_256, TMP14); | |
| 1276 | |
| 1277 vis_ld64_2(ref, stride_8, TMP2); | |
| 1278 vis_pmerge(ZERO, REF_6, TMP16); | |
| 1279 vis_mul8x16au(REF_6_1, CONST_256, TMP18); | |
| 1280 | |
| 1281 vis_ld64_2(ref, stride_16, TMP4); | |
| 1282 ref += stride; | |
| 1283 | |
| 1284 vis_ld64(dest[0], DST_0); | |
| 1285 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1286 | |
| 1287 vis_ld64_2(dest, 8, DST_2); | |
| 1288 vis_faligndata(TMP2, TMP4, REF_4); | |
| 1289 | |
| 1290 vis_ld64_2(ref, stride, TMP6); | |
| 1291 vis_pmerge(ZERO, REF_0, TMP0); | |
| 1292 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 1293 | |
| 1294 vis_ld64_2(ref, stride_8, TMP8); | |
| 1295 vis_pmerge(ZERO, REF_4, TMP4); | |
| 1296 | |
| 1297 vis_ld64_2(ref, stride_16, TMP10); | |
| 1298 ref += stride; | |
| 1299 | |
| 1300 vis_ld64_2(dest, stride, REF_S0/*DST_4*/); | |
| 1301 vis_faligndata(TMP6, TMP8, REF_2); | |
| 1302 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
| 1303 | |
| 1304 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); | |
| 1305 vis_faligndata(TMP8, TMP10, REF_6); | |
| 1306 vis_mul8x16al(DST_0, CONST_512, TMP20); | |
| 1307 | |
| 1308 vis_padd16(TMP0, CONST_3, TMP0); | |
| 1309 vis_mul8x16al(DST_1, CONST_512, TMP22); | |
| 1310 | |
| 1311 vis_padd16(TMP2, CONST_3, TMP2); | |
| 1312 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
| 1313 | |
| 1314 vis_padd16(TMP4, CONST_3, TMP4); | |
| 1315 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
| 1316 | |
| 1317 vis_padd16(TMP6, CONST_3, TMP6); | |
| 1318 | |
| 1319 vis_padd16(TMP12, TMP20, TMP12); | |
| 1320 vis_mul8x16al(REF_S0, CONST_512, TMP20); | |
| 1321 | |
| 1322 vis_padd16(TMP14, TMP22, TMP14); | |
| 1323 vis_mul8x16al(REF_S0_1, CONST_512, TMP22); | |
| 1324 | |
| 1325 vis_padd16(TMP16, TMP24, TMP16); | |
| 1326 vis_mul8x16al(REF_S2, CONST_512, TMP24); | |
| 1327 | |
| 1328 vis_padd16(TMP18, TMP26, TMP18); | |
| 1329 vis_mul8x16al(REF_S2_1, CONST_512, TMP26); | |
| 1330 | |
| 1331 vis_padd16(TMP12, TMP0, TMP12); | |
| 1332 vis_mul8x16au(REF_2, CONST_256, TMP28); | |
| 1333 | |
| 1334 vis_padd16(TMP14, TMP2, TMP14); | |
| 1335 vis_mul8x16au(REF_2_1, CONST_256, TMP30); | |
| 1336 | |
| 1337 vis_padd16(TMP16, TMP4, TMP16); | |
| 1338 vis_mul8x16au(REF_6, CONST_256, REF_S4); | |
| 1339 | |
| 1340 vis_padd16(TMP18, TMP6, TMP18); | |
| 1341 vis_mul8x16au(REF_6_1, CONST_256, REF_S6); | |
| 1342 | |
| 1343 vis_pack16(TMP12, DST_0); | |
| 1344 vis_padd16(TMP28, TMP0, TMP12); | |
| 1345 | |
| 1346 vis_pack16(TMP14, DST_1); | |
| 1347 vis_st64(DST_0, dest[0]); | |
| 1348 vis_padd16(TMP30, TMP2, TMP14); | |
| 1349 | |
| 1350 vis_pack16(TMP16, DST_2); | |
| 1351 vis_padd16(REF_S4, TMP4, TMP16); | |
| 1352 | |
| 1353 vis_pack16(TMP18, DST_3); | |
| 1354 vis_st64_2(DST_2, dest, 8); | |
| 1355 dest += stride; | |
| 1356 vis_padd16(REF_S6, TMP6, TMP18); | |
| 1357 | |
| 1358 vis_padd16(TMP12, TMP20, TMP12); | |
| 1359 | |
| 1360 vis_padd16(TMP14, TMP22, TMP14); | |
| 1361 vis_pack16(TMP12, DST_0); | |
| 1362 | |
| 1363 vis_padd16(TMP16, TMP24, TMP16); | |
| 1364 vis_pack16(TMP14, DST_1); | |
| 1365 vis_st64(DST_0, dest[0]); | |
| 1366 | |
| 1367 vis_padd16(TMP18, TMP26, TMP18); | |
| 1368 vis_pack16(TMP16, DST_2); | |
| 1369 | |
| 1370 vis_pack16(TMP18, DST_3); | |
| 1371 vis_st64_2(DST_2, dest, 8); | |
| 1372 dest += stride; | |
| 1373 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1374 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1375 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1376 static void MC_avg_y_8_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 1377 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1378 { |
| 2979 | 1379 uint8_t *ref = (uint8_t *) _ref; |
| 1380 int stride_8 = stride + 8; | |
| 1381 | |
| 1382 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 1383 | |
| 1384 ref = vis_alignaddr(ref); | |
| 1385 | |
| 1386 vis_ld64(ref[ 0], TMP0); | |
| 1387 vis_fzero(ZERO); | |
| 1388 | |
| 1389 vis_ld64(ref[ 8], TMP2); | |
| 1390 | |
| 1391 vis_ld64(constants3[0], CONST_3); | |
| 1392 vis_faligndata(TMP0, TMP2, REF_2); | |
| 1393 | |
| 1394 vis_ld64(constants256_512[0], CONST_256); | |
| 1395 | |
| 1396 height >>= 1; | |
| 1397 do { /* 20 cycles */ | |
| 1398 vis_ld64_2(ref, stride, TMP0); | |
| 1399 vis_pmerge(ZERO, REF_2, TMP8); | |
| 1400 vis_mul8x16au(REF_2_1, CONST_256, TMP10); | |
| 1401 | |
| 1402 vis_ld64_2(ref, stride_8, TMP2); | |
| 1403 ref += stride; | |
| 1404 | |
| 1405 vis_ld64(dest[0], DST_0); | |
| 1406 | |
| 1407 vis_ld64_2(dest, stride, DST_2); | |
| 1408 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1409 | |
| 1410 vis_ld64_2(ref, stride, TMP4); | |
| 1411 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
| 1412 vis_pmerge(ZERO, REF_0, TMP12); | |
| 1413 | |
| 1414 vis_ld64_2(ref, stride_8, TMP6); | |
| 1415 ref += stride; | |
| 1416 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
| 1417 vis_pmerge(ZERO, REF_0_1, TMP14); | |
| 1418 | |
| 1419 vis_padd16(TMP12, CONST_3, TMP12); | |
| 1420 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
| 1421 | |
| 1422 vis_padd16(TMP14, CONST_3, TMP14); | |
| 1423 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
| 1424 | |
| 1425 vis_faligndata(TMP4, TMP6, REF_2); | |
| 1426 | |
| 1427 vis_padd16(TMP8, TMP12, TMP8); | |
| 1428 | |
| 1429 vis_padd16(TMP10, TMP14, TMP10); | |
| 1430 vis_mul8x16au(REF_2, CONST_256, TMP20); | |
| 1431 | |
| 1432 vis_padd16(TMP8, TMP16, TMP0); | |
| 1433 vis_mul8x16au(REF_2_1, CONST_256, TMP22); | |
| 1434 | |
| 1435 vis_padd16(TMP10, TMP18, TMP2); | |
| 1436 vis_pack16(TMP0, DST_0); | |
| 1437 | |
| 1438 vis_pack16(TMP2, DST_1); | |
| 1439 vis_st64(DST_0, dest[0]); | |
| 1440 dest += stride; | |
| 1441 vis_padd16(TMP12, TMP20, TMP12); | |
| 1442 | |
| 1443 vis_padd16(TMP14, TMP22, TMP14); | |
| 1444 | |
| 1445 vis_padd16(TMP12, TMP24, TMP0); | |
| 1446 | |
| 1447 vis_padd16(TMP14, TMP26, TMP2); | |
| 1448 vis_pack16(TMP0, DST_2); | |
| 1449 | |
| 1450 vis_pack16(TMP2, DST_3); | |
| 1451 vis_st64(DST_2, dest[0]); | |
| 1452 dest += stride; | |
| 1453 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1454 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1455 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1456 static void MC_put_xy_16_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 1457 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1458 { |
| 2979 | 1459 uint8_t *ref = (uint8_t *) _ref; |
| 1460 unsigned long off = (unsigned long) ref & 0x7; | |
| 1461 unsigned long off_plus_1 = off + 1; | |
| 1462 int stride_8 = stride + 8; | |
| 1463 int stride_16 = stride + 16; | |
| 1464 | |
| 1465 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 1466 | |
| 1467 ref = vis_alignaddr(ref); | |
| 1468 | |
| 1469 vis_ld64(ref[ 0], TMP0); | |
| 1470 vis_fzero(ZERO); | |
| 1471 | |
| 1472 vis_ld64(ref[ 8], TMP2); | |
| 1473 | |
| 1474 vis_ld64(ref[16], TMP4); | |
| 1475 | |
| 1476 vis_ld64(constants2[0], CONST_2); | |
| 1477 vis_faligndata(TMP0, TMP2, REF_S0); | |
| 1478 | |
| 1479 vis_ld64(constants256_512[0], CONST_256); | |
| 1480 vis_faligndata(TMP2, TMP4, REF_S4); | |
| 1481 | |
| 1482 if (off != 0x7) { | |
| 1483 vis_alignaddr_g0((void *)off_plus_1); | |
| 1484 vis_faligndata(TMP0, TMP2, REF_S2); | |
| 1485 vis_faligndata(TMP2, TMP4, REF_S6); | |
| 1486 } else { | |
| 1487 vis_src1(TMP2, REF_S2); | |
| 1488 vis_src1(TMP4, REF_S6); | |
| 1489 } | |
| 1490 | |
| 1491 height >>= 1; | |
| 1492 do { | |
| 1493 vis_ld64_2(ref, stride, TMP0); | |
| 1494 vis_mul8x16au(REF_S0, CONST_256, TMP12); | |
| 1495 vis_pmerge(ZERO, REF_S0_1, TMP14); | |
| 1496 | |
| 1497 vis_alignaddr_g0((void *)off); | |
| 1498 | |
| 1499 vis_ld64_2(ref, stride_8, TMP2); | |
| 1500 vis_mul8x16au(REF_S2, CONST_256, TMP16); | |
| 1501 vis_pmerge(ZERO, REF_S2_1, TMP18); | |
| 1502 | |
| 1503 vis_ld64_2(ref, stride_16, TMP4); | |
| 1504 ref += stride; | |
| 1505 vis_mul8x16au(REF_S4, CONST_256, TMP20); | |
| 1506 vis_pmerge(ZERO, REF_S4_1, TMP22); | |
| 1507 | |
| 1508 vis_ld64_2(ref, stride, TMP6); | |
| 1509 vis_mul8x16au(REF_S6, CONST_256, TMP24); | |
| 1510 vis_pmerge(ZERO, REF_S6_1, TMP26); | |
| 1511 | |
| 1512 vis_ld64_2(ref, stride_8, TMP8); | |
| 1513 vis_faligndata(TMP0, TMP2, REF_0); | |
| 1514 | |
| 1515 vis_ld64_2(ref, stride_16, TMP10); | |
| 1516 ref += stride; | |
| 1517 vis_faligndata(TMP2, TMP4, REF_4); | |
| 1518 | |
| 1519 vis_faligndata(TMP6, TMP8, REF_S0); | |
| 1520 | |
| 1521 vis_faligndata(TMP8, TMP10, REF_S4); | |
| 1522 | |
| 1523 if (off != 0x7) { | |
| 1524 vis_alignaddr_g0((void *)off_plus_1); | |
| 1525 vis_faligndata(TMP0, TMP2, REF_2); | |
| 1526 vis_faligndata(TMP2, TMP4, REF_6); | |
| 1527 vis_faligndata(TMP6, TMP8, REF_S2); | |
| 1528 vis_faligndata(TMP8, TMP10, REF_S6); | |
| 1529 } else { | |
| 1530 vis_src1(TMP2, REF_2); | |
| 1531 vis_src1(TMP4, REF_6); | |
| 1532 vis_src1(TMP8, REF_S2); | |
| 1533 vis_src1(TMP10, REF_S6); | |
| 1534 } | |
| 1535 | |
| 1536 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
| 1537 vis_pmerge(ZERO, REF_0_1, TMP2); | |
| 1538 | |
| 1539 vis_mul8x16au(REF_2, CONST_256, TMP4); | |
| 1540 vis_pmerge(ZERO, REF_2_1, TMP6); | |
| 1541 | |
| 1542 vis_padd16(TMP0, CONST_2, TMP8); | |
| 1543 vis_mul8x16au(REF_4, CONST_256, TMP0); | |
| 1544 | |
| 1545 vis_padd16(TMP2, CONST_2, TMP10); | |
| 1546 vis_mul8x16au(REF_4_1, CONST_256, TMP2); | |
| 1547 | |
| 1548 vis_padd16(TMP8, TMP4, TMP8); | |
| 1549 vis_mul8x16au(REF_6, CONST_256, TMP4); | |
| 1550 | |
| 1551 vis_padd16(TMP10, TMP6, TMP10); | |
| 1552 vis_mul8x16au(REF_6_1, CONST_256, TMP6); | |
| 1553 | |
| 1554 vis_padd16(TMP12, TMP8, TMP12); | |
| 1555 | |
| 1556 vis_padd16(TMP14, TMP10, TMP14); | |
| 1557 | |
| 1558 vis_padd16(TMP12, TMP16, TMP12); | |
| 1559 | |
| 1560 vis_padd16(TMP14, TMP18, TMP14); | |
| 1561 vis_pack16(TMP12, DST_0); | |
| 1562 | |
| 1563 vis_pack16(TMP14, DST_1); | |
| 1564 vis_st64(DST_0, dest[0]); | |
| 1565 vis_padd16(TMP0, CONST_2, TMP12); | |
| 1566 | |
| 1567 vis_mul8x16au(REF_S0, CONST_256, TMP0); | |
| 1568 vis_padd16(TMP2, CONST_2, TMP14); | |
| 1569 | |
| 1570 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
| 1571 vis_padd16(TMP12, TMP4, TMP12); | |
| 1572 | |
| 1573 vis_mul8x16au(REF_S2, CONST_256, TMP4); | |
| 1574 vis_padd16(TMP14, TMP6, TMP14); | |
| 1575 | |
| 1576 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
| 1577 vis_padd16(TMP20, TMP12, TMP20); | |
| 1578 | |
| 1579 vis_padd16(TMP22, TMP14, TMP22); | |
| 1580 | |
| 1581 vis_padd16(TMP20, TMP24, TMP20); | |
| 1582 | |
| 1583 vis_padd16(TMP22, TMP26, TMP22); | |
| 1584 vis_pack16(TMP20, DST_2); | |
| 1585 | |
| 1586 vis_pack16(TMP22, DST_3); | |
| 1587 vis_st64_2(DST_2, dest, 8); | |
| 1588 dest += stride; | |
| 1589 vis_padd16(TMP0, TMP4, TMP24); | |
| 1590 | |
| 1591 vis_mul8x16au(REF_S4, CONST_256, TMP0); | |
| 1592 vis_padd16(TMP2, TMP6, TMP26); | |
| 1593 | |
| 1594 vis_mul8x16au(REF_S4_1, CONST_256, TMP2); | |
| 1595 vis_padd16(TMP24, TMP8, TMP24); | |
| 1596 | |
| 1597 vis_padd16(TMP26, TMP10, TMP26); | |
| 1598 vis_pack16(TMP24, DST_0); | |
| 1599 | |
| 1600 vis_pack16(TMP26, DST_1); | |
| 1601 vis_st64(DST_0, dest[0]); | |
| 1602 vis_pmerge(ZERO, REF_S6, TMP4); | |
| 1603 | |
| 1604 vis_pmerge(ZERO, REF_S6_1, TMP6); | |
| 1605 | |
| 1606 vis_padd16(TMP0, TMP4, TMP0); | |
| 1607 | |
| 1608 vis_padd16(TMP2, TMP6, TMP2); | |
| 1609 | |
| 1610 vis_padd16(TMP0, TMP12, TMP0); | |
| 1611 | |
| 1612 vis_padd16(TMP2, TMP14, TMP2); | |
| 1613 vis_pack16(TMP0, DST_2); | |
| 1614 | |
| 1615 vis_pack16(TMP2, DST_3); | |
| 1616 vis_st64_2(DST_2, dest, 8); | |
| 1617 dest += stride; | |
| 1618 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1619 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1620 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Put (overwrite) an 8-byte-wide block interpolated at the half-pel
 * position in both x and y, with rounding.
 * Each output pixel combines two horizontally adjacent source bytes
 * (window offsets `off` and `off + 1`) from two consecutive rows; the
 * four 8->16-bit expanded terms plus CONST_2 are narrowed by
 * vis_pack16 under a GSR scale factor of 5.
 * NOTE(review): this presumably realizes (a+b+c+d+2)>>2 per pixel --
 * inferred from the constants; confirm against the scalar reference.
 * The loop is software pipelined and writes two rows per iteration,
 * so height must be even and > 0. */
static void MC_put_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;  /* misalignment of ref in its 64-bit word */
    unsigned long off_plus_1 = off + 1;             /* window for the right-hand neighbour */
    int stride_8 = stride + 8;                      /* offset of the second qword of the next row */

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);      /* pack16 scale used for the final >> */

    ref = vis_alignaddr(ref);                       /* round ref down to 8; off keeps the remainder */

    /* Prime the pipeline: first source row at both horizontal offsets. */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants2[0], CONST_2);               /* rounding bias */

    vis_ld64(constants256_512[0], CONST_256);       /* 8->16 expansion multipliers */
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        /* off + 1 == 8: the shifted window is exactly the second qword,
         * which vis_src1 extracts directly. */
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;                                   /* two output rows per iteration */
    do {    /* 26 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S2, TMP12);

        vis_alignaddr_g0((void *)off);              /* restore left-offset alignment */

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_ld64_2(ref, stride, TMP4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_pmerge(ZERO, REF_S4, TMP18);

        vis_pmerge(ZERO, REF_S4_1, TMP20);

        vis_faligndata(TMP4, TMP6, REF_S0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        /* First output row: rounding bias + four-tap sum. */
        vis_padd16(TMP18, CONST_2, TMP18);
        vis_mul8x16au(REF_S6, CONST_256, TMP22);

        vis_padd16(TMP20, CONST_2, TMP20);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

        vis_mul8x16au(REF_S0, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S0_1, TMP28);

        vis_mul8x16au(REF_S2, CONST_256, TMP30);
        vis_padd16(TMP18, TMP22, TMP18);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP8, TMP18, TMP8);

        vis_padd16(TMP10, TMP20, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        /* Second output row reuses the row-N+1 terms computed above. */
        vis_padd16(TMP18, TMP26, TMP18);

        vis_padd16(TMP20, TMP28, TMP20);

        vis_padd16(TMP18, TMP30, TMP18);

        vis_padd16(TMP20, TMP32, TMP20);
        vis_pack16(TMP18, DST_2);

        vis_pack16(TMP20, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1725 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average a 16-byte-wide block, half-pel interpolated in both x and y,
 * into dest: the existing destination bytes are loaded (DST_0..DST_3),
 * scaled through vis_mul8x16al with CONST_1024 and added to the
 * four-tap neighbourhood sum together with the rounding bias CONST_6
 * before vis_pack16 narrows the result (GSR scale factor 4).
 * Two output rows are written per loop iteration, so height must be
 * even and > 0.
 * The schedule is hand pipelined ("55 cycles"); REF_*/TMP* registers
 * are deliberately reused as scratch mid-loop, so statement order is
 * load-bearing -- do not reorder. */
static void MC_avg_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                              const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;  /* misalignment within the 64-bit word */
    unsigned long off_plus_1 = off + 1;             /* right-neighbour window */
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;                    /* third qword of the next row (16-wide + spill) */

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime the pipeline with the first source row (three qwords for a
     * potentially unaligned 16-byte span). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);               /* rounding bias for the avg variant */
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        /* off + 1 == 8: shifted window is the following qword itself. */
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;                                   /* two rows per iteration */
    do {    /* 55 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64(dest[0], DST_0);                   /* current dest row for averaging */
        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        /* First output row: dest (scaled by CONST_1024) + interpolation. */
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_0, TMP0);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);    /* REF_0 reused as scratch from here on */
        vis_padd16(TMP0, CONST_6, TMP0);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);    /* REF_2 reused as scratch too */
        vis_padd16(TMP2, CONST_6, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP4);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_6, CONST_256, TMP8);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_6_1, CONST_256, TMP10);

        vis_padd16(TMP12, TMP16, TMP12);
        vis_mul8x16au(REF_S0, CONST_256, REF_4);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

        vis_padd16(TMP12, TMP30, TMP12);

        vis_padd16(TMP14, TMP32, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP4, CONST_6, TMP4);

        vis_ld64_2(dest, stride, DST_0);            /* prefetch dest of the next output row */
        vis_padd16(TMP6, CONST_6, TMP6);
        vis_mul8x16au(REF_S2, CONST_256, TMP12);

        vis_padd16(TMP4, TMP8, TMP4);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

        vis_padd16(TMP6, TMP10, TMP6);

        vis_padd16(TMP20, TMP4, TMP20);

        vis_padd16(TMP22, TMP6, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);

        vis_padd16(TMP20, REF_0, TMP20);
        vis_mul8x16au(REF_S4, CONST_256, REF_0);

        vis_padd16(TMP22, REF_2, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;

        /* Second output row. */
        vis_ld64_2(dest, 8, DST_2);
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4_1, REF_2);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_padd16(REF_4, TMP0, TMP8);

        vis_mul8x16au(REF_S6, CONST_256, REF_4);
        vis_padd16(REF_6, TMP2, TMP10);

        vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(REF_0, TMP4, REF_0);

        vis_mul8x16al(DST_2, CONST_1024, TMP30);
        vis_padd16(REF_2, TMP6, REF_2);

        vis_mul8x16al(DST_3, CONST_1024, TMP32);
        vis_padd16(REF_0, REF_4, REF_0);

        vis_padd16(REF_2, REF_6, REF_2);

        vis_padd16(REF_0, TMP30, REF_0);

        /* stall */

        vis_padd16(REF_2, TMP32, REF_2);
        vis_pack16(REF_0, DST_2);

        vis_pack16(REF_2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
1922 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average an 8-byte-wide block, half-pel interpolated in both x and y,
 * into dest. Same scheme as MC_avg_xy_16_vis but half the width: dest
 * bytes are scaled via vis_mul8x16al/CONST_1024 and added to the
 * four-tap sum plus the CONST_6 rounding bias, then narrowed with
 * vis_pack16 (GSR scale factor 4).
 * Two output rows per iteration; height must be even and > 0.
 * Register names (REF_S4/REF_S6, REF_0/REF_2) are reused as scratch
 * mid-loop -- statement order is load-bearing. */
static void MC_avg_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                             const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;  /* misalignment within the 64-bit word */
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prime the pipeline with the first source row. */
    vis_ld64(ref[0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64(constants6[0], CONST_6);               /* rounding bias */

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        /* off + 1 == 8: shifted window is the second qword itself. */
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;                                   /* two rows per iteration */
    do {    /* 31 cycles */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S0_1, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S2, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride, TMP4);
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;

        vis_ld64(dest[0], DST_0);                   /* dest rows for averaging */
        vis_faligndata(TMP4, TMP6, REF_S0);

        vis_ld64_2(dest, stride, DST_2);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4, TMP22);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_S4_1, TMP24);

        vis_mul8x16au(REF_S6, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S6_1, TMP28);

        vis_mul8x16au(REF_S0, CONST_256, REF_S4);   /* REF_S4 reused as scratch */
        vis_padd16(TMP22, CONST_6, TMP22);

        vis_mul8x16au(REF_S0_1, CONST_256, REF_S6); /* REF_S6 reused as scratch */
        vis_padd16(TMP24, CONST_6, TMP24);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP22, TMP26, TMP22);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP24, TMP28, TMP24);

        vis_mul8x16au(REF_S2, CONST_256, TMP26);
        vis_padd16(TMP8, TMP22, TMP8);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
        vis_padd16(TMP10, TMP24, TMP10);

        /* First output row. */
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        /* Second output row. */
        vis_padd16(REF_S4, TMP22, TMP12);

        vis_padd16(REF_S6, TMP24, TMP14);

        vis_padd16(TMP12, TMP26, TMP12);

        vis_padd16(TMP14, TMP28, TMP14);

        vis_padd16(TMP12, REF_0, TMP12);

        vis_padd16(TMP14, REF_2, TMP14);
        vis_pack16(TMP12, DST_2);

        vis_pack16(TMP14, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2043 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2044 /* End of rounding code */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2045 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2046 /* Start of no rounding code */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2047 /* The trick used in some of this file is the formula from the MMX |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2048 * motion comp code, which is: |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2049 * |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2050 * (x+y)>>1 == (x&y)+((x^y)>>1) |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2051 * |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2052 * This allows us to average 8 bytes at a time in a 64-bit FPU reg. |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2053 * We avoid overflows by masking before we do the shift, and we |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2054 * implement the shift by multiplying by 1/2 using mul8x16. So in |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2055 * VIS this is (assume 'x' is in f0, 'y' is in f2, a repeating mask |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2056 * of '0xfe' is in f4, a repeating mask of '0x7f' is in f6, and |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2057 * the value 0x80808080 is in f8): |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2058 * |
| 2979 | 2059 * fxor f0, f2, f10 |
| 2060 * fand f10, f4, f10 | |
| 2061 * fmul8x16 f8, f10, f10 | |
| 2062 * fand f10, f6, f10 | |
| 2063 * fand f0, f2, f12 | |
| 2064 * fpadd16 f12, f10, f10 | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2065 */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2066 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2067 static void MC_put_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 2068 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2069 { |
| 2979 | 2070 uint8_t *ref = (uint8_t *) _ref; |
| 2071 | |
| 2072 ref = vis_alignaddr(ref); | |
| 2073 do { /* 5 cycles */ | |
| 2074 vis_ld64(ref[0], TMP0); | |
| 2075 | |
| 2076 vis_ld64_2(ref, 8, TMP2); | |
| 2077 | |
| 2078 vis_ld64_2(ref, 16, TMP4); | |
| 2079 ref += stride; | |
| 2080 | |
| 2081 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2082 vis_st64(REF_0, dest[0]); | |
| 2083 | |
| 2084 vis_faligndata(TMP2, TMP4, REF_2); | |
| 2085 vis_st64_2(REF_2, dest, 8); | |
| 2086 dest += stride; | |
| 2087 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2088 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2089 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2090 static void MC_put_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref, |
| 2979 | 2091 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2092 { |
| 2979 | 2093 uint8_t *ref = (uint8_t *) _ref; |
| 2094 | |
| 2095 ref = vis_alignaddr(ref); | |
| 2096 do { /* 4 cycles */ | |
| 2097 vis_ld64(ref[0], TMP0); | |
| 2098 | |
| 2099 vis_ld64(ref[8], TMP2); | |
| 2100 ref += stride; | |
| 2101 | |
| 2102 /* stall */ | |
| 2103 | |
| 2104 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2105 vis_st64(REF_0, dest[0]); | |
| 2106 dest += stride; | |
| 2107 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2108 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2109 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2110 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average dest with a (possibly unaligned) 16-byte-wide reference
 * block, full-pel, using the truncating byte average documented in the
 * comment block above:
 *     (x + y) >> 1 == (x & y) + ((x ^ y) >> 1)
 * The per-byte >>1 is implemented as AND with 0xfe, multiply by 128
 * via vis_mul8x16, then AND with 0x7f, so eight bytes are averaged
 * per 64-bit register without carry between lanes.
 * Software pipelined: the loop runs (height >> 1) - 1 times handling
 * two rows each, and the final two rows are unrolled after the loop,
 * so height must be even and >= 4 (>= 2 would skip the loop body --
 * NOTE(review): callers presumably always pass 8 or 16; confirm). */
static void MC_avg_no_round_o_16_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;

    ref = vis_alignaddr(ref);

    /* Pipeline prologue: first ref row, first dest row, constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(dest[0], DST_0);

    vis_ld64(dest[8], DST_2);

    vis_ld64(constants_fe[0], MASK_fe);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP2, TMP4, REF_2);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;                     /* epilogue handles the last 2 rows */

    do {    /* 24 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP6);                /* x ^ y */

        vis_ld64_2(ref, 8, TMP2);
        vis_and(TMP6, MASK_fe, TMP6);               /* clear low bits before the >>1 */

        vis_ld64_2(ref, 16, TMP4);
        ref += stride;
        vis_mul8x16(CONST_128, TMP6, TMP6);         /* multiply by 1/2 per byte */
        vis_xor(DST_2, REF_2, TMP8);

        vis_and(TMP8, MASK_fe, TMP8);

        vis_and(DST_0, REF_0, TMP10);               /* x & y */
        vis_ld64_2(dest, stride, DST_0);
        vis_mul8x16(CONST_128, TMP8, TMP8);

        vis_and(DST_2, REF_2, TMP12);
        vis_ld64_2(dest, stride_8, DST_2);

        vis_ld64(ref[0], TMP14);
        vis_and(TMP6, MASK_7f, TMP6);               /* discard mul8x16 residue */

        vis_and(TMP8, MASK_7f, TMP8);

        vis_padd16(TMP10, TMP6, TMP6);              /* (x&y) + ((x^y)>>1) */
        vis_st64(TMP6, dest[0]);

        vis_padd16(TMP12, TMP8, TMP8);
        vis_st64_2(TMP8, dest, 8);

        dest += stride;
        vis_ld64_2(ref, 8, TMP16);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, 16, TMP18);
        vis_faligndata(TMP2, TMP4, REF_2);
        ref += stride;

        /* Second row of this iteration, same formula. */
        vis_xor(DST_0, REF_0, TMP20);

        vis_and(TMP20, MASK_fe, TMP20);

        vis_xor(DST_2, REF_2, TMP22);
        vis_mul8x16(CONST_128, TMP20, TMP20);

        vis_and(TMP22, MASK_fe, TMP22);

        vis_and(DST_0, REF_0, TMP24);
        vis_mul8x16(CONST_128, TMP22, TMP22);

        vis_and(DST_2, REF_2, TMP26);

        vis_ld64_2(dest, stride, DST_0);
        vis_faligndata(TMP14, TMP16, REF_0);

        vis_ld64_2(dest, stride_8, DST_2);
        vis_faligndata(TMP16, TMP18, REF_2);

        vis_and(TMP20, MASK_7f, TMP20);

        vis_and(TMP22, MASK_7f, TMP22);

        vis_padd16(TMP24, TMP20, TMP20);
        vis_st64(TMP20, dest[0]);

        vis_padd16(TMP26, TMP22, TMP22);
        vis_st64_2(TMP22, dest, 8);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, without prefetching beyond the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP6);

    vis_ld64_2(ref, 8, TMP2);
    vis_and(TMP6, MASK_fe, TMP6);

    vis_ld64_2(ref, 16, TMP4);
    vis_mul8x16(CONST_128, TMP6, TMP6);
    vis_xor(DST_2, REF_2, TMP8);

    vis_and(TMP8, MASK_fe, TMP8);

    vis_and(DST_0, REF_0, TMP10);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP8, TMP8);

    vis_and(DST_2, REF_2, TMP12);
    vis_ld64_2(dest, stride_8, DST_2);

    vis_ld64(ref[0], TMP14);
    vis_and(TMP6, MASK_7f, TMP6);

    vis_and(TMP8, MASK_7f, TMP8);

    vis_padd16(TMP10, TMP6, TMP6);
    vis_st64(TMP6, dest[0]);

    vis_padd16(TMP12, TMP8, TMP8);
    vis_st64_2(TMP8, dest, 8);

    dest += stride;
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_faligndata(TMP2, TMP4, REF_2);

    vis_xor(DST_0, REF_0, TMP20);

    vis_and(TMP20, MASK_fe, TMP20);

    vis_xor(DST_2, REF_2, TMP22);
    vis_mul8x16(CONST_128, TMP20, TMP20);

    vis_and(TMP22, MASK_fe, TMP22);

    vis_and(DST_0, REF_0, TMP24);
    vis_mul8x16(CONST_128, TMP22, TMP22);

    vis_and(DST_2, REF_2, TMP26);

    vis_and(TMP20, MASK_7f, TMP20);

    vis_and(TMP22, MASK_7f, TMP22);

    vis_padd16(TMP24, TMP20, TMP20);
    vis_st64(TMP20, dest[0]);

    vis_padd16(TMP26, TMP22, TMP22);
    vis_st64_2(TMP22, dest, 8);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2271 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average dest with a (possibly unaligned) 8-byte-wide reference
 * block, full-pel, using the truncating average
 *     (x + y) >> 1 == (x & y) + ((x ^ y) >> 1)
 * (see the comment block above; the >>1 is AND 0xfe, mul8x16 by 128,
 * AND 0x7f, avoiding inter-byte carries).
 * Software pipelined: (height >> 1) - 1 loop iterations of two rows,
 * plus an unrolled epilogue for the final two rows; height must be
 * even and >= 4. */
static void MC_avg_no_round_o_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;

    ref = vis_alignaddr(ref);

    /* Pipeline prologue: first ref row, first dest row, constants. */
    vis_ld64(ref[0], TMP0);

    vis_ld64(ref[8], TMP2);

    vis_ld64(dest[0], DST_0);

    vis_ld64(constants_fe[0], MASK_fe);

    vis_ld64(constants_7f[0], MASK_7f);
    vis_faligndata(TMP0, TMP2, REF_0);

    vis_ld64(constants128[0], CONST_128);

    ref += stride;
    height = (height >> 1) - 1;                     /* epilogue handles the last 2 rows */

    do {    /* 12 cycles */
        vis_ld64(ref[0], TMP0);
        vis_xor(DST_0, REF_0, TMP4);                /* x ^ y */

        vis_ld64(ref[8], TMP2);
        vis_and(TMP4, MASK_fe, TMP4);

        vis_and(DST_0, REF_0, TMP6);                /* x & y */
        vis_ld64_2(dest, stride, DST_0);
        ref += stride;
        vis_mul8x16(CONST_128, TMP4, TMP4);         /* per-byte >> 1 */

        vis_ld64(ref[0], TMP12);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64(ref[8], TMP2);
        vis_xor(DST_0, REF_0, TMP0);
        ref += stride;

        vis_and(TMP0, MASK_fe, TMP0);

        vis_and(TMP4, MASK_7f, TMP4);

        vis_padd16(TMP6, TMP4, TMP4);               /* (x&y) + ((x^y)>>1) */
        vis_st64(TMP4, dest[0]);
        dest += stride;
        vis_mul8x16(CONST_128, TMP0, TMP0);

        vis_and(DST_0, REF_0, TMP6);
        vis_ld64_2(dest, stride, DST_0);

        vis_faligndata(TMP12, TMP2, REF_0);

        vis_and(TMP0, MASK_7f, TMP0);

        vis_padd16(TMP6, TMP0, TMP4);
        vis_st64(TMP4, dest[0]);
        dest += stride;
    } while (--height);

    /* Epilogue: final two rows, no prefetch past the block. */
    vis_ld64(ref[0], TMP0);
    vis_xor(DST_0, REF_0, TMP4);

    vis_ld64(ref[8], TMP2);
    vis_and(TMP4, MASK_fe, TMP4);

    vis_and(DST_0, REF_0, TMP6);
    vis_ld64_2(dest, stride, DST_0);
    vis_mul8x16(CONST_128, TMP4, TMP4);

    vis_faligndata(TMP0, TMP2, REF_0);

    vis_xor(DST_0, REF_0, TMP0);

    vis_and(TMP0, MASK_fe, TMP0);

    vis_and(TMP4, MASK_7f, TMP4);

    vis_padd16(TMP6, TMP4, TMP4);
    vis_st64(TMP4, dest[0]);
    dest += stride;
    vis_mul8x16(CONST_128, TMP0, TMP0);

    vis_and(DST_0, REF_0, TMP6);

    vis_and(TMP0, MASK_7f, TMP0);

    vis_padd16(TMP6, TMP0, TMP4);
    vis_st64(TMP4, dest[0]);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2365 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_put_no_round_x_16_vis: 16-pixel-wide horizontal half-pel "put",
 * truncating ("no rounding") variant, written with SPARC VIS intrinsics.
 * Each output byte is the floor average of ref[x] and ref[x+1] via the
 * bit trick   avg(a,b) = (a & b) + (((a ^ b) & 0xfe) >> 1).
 * The per-byte >>1 is done with vis_mul8x16(CONST_128, ...) followed by
 * masking with MASK_7f -- NOTE(review): multiply by 128/256 acting as a
 * per-lane right shift is the standard VIS idiom; confirm against the VIS
 * mul8x16 definition.
 * REF_0/REF_4 hold the 16 aligned reference bytes, REF_2/REF_6 the same
 * data shifted by one byte (faligndata at off+1; when off == 7 vis_src1()
 * is used instead, presumably because an 8-byte shift cannot be expressed
 * through alignaddr -- confirm).
 * The do/while is software-pipelined over two output rows per iteration
 * (loads for the next pair overlap arithmetic for the current pair); the
 * final two rows are peeled off after the loop so no loads run past the
 * reference block.  TMP*/REF_*/MASK_*/CONST_128 are VIS register macros
 * defined earlier in this file.
 */
2366 static void MC_put_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 2367 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2368 { |
| 2979 | 2369 uint8_t *ref = (uint8_t *) _ref; |
| 2370 unsigned long off = (unsigned long) ref & 0x7; | |
| 2371 unsigned long off_plus_1 = off + 1; | |
| 2372 | |
| 2373 ref = vis_alignaddr(ref); | |
| 2374 | |
| 2375 vis_ld64(ref[0], TMP0); | |
| 2376 | |
| 2377 vis_ld64_2(ref, 8, TMP2); | |
| 2378 | |
| 2379 vis_ld64_2(ref, 16, TMP4); | |
| 2380 | |
| 2381 vis_ld64(constants_fe[0], MASK_fe); | |
| 2382 | |
| 2383 vis_ld64(constants_7f[0], MASK_7f); | |
| 2384 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2385 | |
| 2386 vis_ld64(constants128[0], CONST_128); | |
| 2387 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2388 | |
| 2389 if (off != 0x7) { | |
| 2390 vis_alignaddr_g0((void *)off_plus_1); | |
| 2391 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2392 vis_faligndata(TMP2, TMP4, REF_6); | |
| 2393 } else { | |
| 2394 vis_src1(TMP2, REF_2); | |
| 2395 vis_src1(TMP4, REF_6); | |
| 2396 } | |
| 2397 | |
| 2398 ref += stride; | |
| 2399 height = (height >> 1) - 1; | |
| 2400 | |
| 2401 do { /* 34 cycles */ | |
| 2402 vis_ld64(ref[0], TMP0); | |
| 2403 vis_xor(REF_0, REF_2, TMP6); | |
| 2404 | |
| 2405 vis_ld64_2(ref, 8, TMP2); | |
| 2406 vis_xor(REF_4, REF_6, TMP8); | |
| 2407 | |
| 2408 vis_ld64_2(ref, 16, TMP4); | |
| 2409 vis_and(TMP6, MASK_fe, TMP6); | |
| 2410 ref += stride; | |
| 2411 | |
| 2412 vis_ld64(ref[0], TMP14); | |
| 2413 vis_mul8x16(CONST_128, TMP6, TMP6); | |
| 2414 vis_and(TMP8, MASK_fe, TMP8); | |
| 2415 | |
| 2416 vis_ld64_2(ref, 8, TMP16); | |
| 2417 vis_mul8x16(CONST_128, TMP8, TMP8); | |
| 2418 vis_and(REF_0, REF_2, TMP10); | |
| 2419 | |
| 2420 vis_ld64_2(ref, 16, TMP18); | |
| 2421 ref += stride; | |
| 2422 vis_and(REF_4, REF_6, TMP12); | |
| 2423 | |
| 2424 vis_alignaddr_g0((void *)off); | |
| 2425 | |
| 2426 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2427 | |
| 2428 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2429 | |
| 2430 if (off != 0x7) { | |
| 2431 vis_alignaddr_g0((void *)off_plus_1); | |
| 2432 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2433 vis_faligndata(TMP2, TMP4, REF_6); | |
| 2434 } else { | |
| 2435 vis_src1(TMP2, REF_2); | |
| 2436 vis_src1(TMP4, REF_6); | |
| 2437 } | |
| 2438 | |
| 2439 vis_and(TMP6, MASK_7f, TMP6); | |
| 2440 | |
| 2441 vis_and(TMP8, MASK_7f, TMP8); | |
| 2442 | |
| 2443 vis_padd16(TMP10, TMP6, TMP6); | |
| 2444 vis_st64(TMP6, dest[0]); | |
| 2445 | |
| 2446 vis_padd16(TMP12, TMP8, TMP8); | |
| 2447 vis_st64_2(TMP8, dest, 8); | |
| 2448 dest += stride; | |
| 2449 | |
| 2450 vis_xor(REF_0, REF_2, TMP6); | |
| 2451 | |
| 2452 vis_xor(REF_4, REF_6, TMP8); | |
| 2453 | |
| 2454 vis_and(TMP6, MASK_fe, TMP6); | |
| 2455 | |
| 2456 vis_mul8x16(CONST_128, TMP6, TMP6); | |
| 2457 vis_and(TMP8, MASK_fe, TMP8); | |
| 2458 | |
| 2459 vis_mul8x16(CONST_128, TMP8, TMP8); | |
| 2460 vis_and(REF_0, REF_2, TMP10); | |
| 2461 | |
| 2462 vis_and(REF_4, REF_6, TMP12); | |
| 2463 | |
| 2464 vis_alignaddr_g0((void *)off); | |
| 2465 | |
| 2466 vis_faligndata(TMP14, TMP16, REF_0); | |
| 2467 | |
| 2468 vis_faligndata(TMP16, TMP18, REF_4); | |
| 2469 | |
| 2470 if (off != 0x7) { | |
| 2471 vis_alignaddr_g0((void *)off_plus_1); | |
| 2472 vis_faligndata(TMP14, TMP16, REF_2); | |
| 2473 vis_faligndata(TMP16, TMP18, REF_6); | |
| 2474 } else { | |
| 2475 vis_src1(TMP16, REF_2); | |
| 2476 vis_src1(TMP18, REF_6); | |
| 2477 } | |
| 2478 | |
| 2479 vis_and(TMP6, MASK_7f, TMP6); | |
| 2480 | |
| 2481 vis_and(TMP8, MASK_7f, TMP8); | |
| 2482 | |
| 2483 vis_padd16(TMP10, TMP6, TMP6); | |
| 2484 vis_st64(TMP6, dest[0]); | |
| 2485 | |
| 2486 vis_padd16(TMP12, TMP8, TMP8); | |
| 2487 vis_st64_2(TMP8, dest, 8); | |
| 2488 dest += stride; | |
| 2489 } while (--height); | |
| 2490 | |
/* Epilogue: the last two rows, identical arithmetic but without prefetching
 * another row pair past the end of the reference block. */
| 2491 vis_ld64(ref[0], TMP0); | |
| 2492 vis_xor(REF_0, REF_2, TMP6); | |
| 2493 | |
| 2494 vis_ld64_2(ref, 8, TMP2); | |
| 2495 vis_xor(REF_4, REF_6, TMP8); | |
| 2496 | |
| 2497 vis_ld64_2(ref, 16, TMP4); | |
| 2498 vis_and(TMP6, MASK_fe, TMP6); | |
| 2499 | |
| 2500 vis_mul8x16(CONST_128, TMP6, TMP6); | |
| 2501 vis_and(TMP8, MASK_fe, TMP8); | |
| 2502 | |
| 2503 vis_mul8x16(CONST_128, TMP8, TMP8); | |
| 2504 vis_and(REF_0, REF_2, TMP10); | |
| 2505 | |
| 2506 vis_and(REF_4, REF_6, TMP12); | |
| 2507 | |
| 2508 vis_alignaddr_g0((void *)off); | |
| 2509 | |
| 2510 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2511 | |
| 2512 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2513 | |
| 2514 if (off != 0x7) { | |
| 2515 vis_alignaddr_g0((void *)off_plus_1); | |
| 2516 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2517 vis_faligndata(TMP2, TMP4, REF_6); | |
| 2518 } else { | |
| 2519 vis_src1(TMP2, REF_2); | |
| 2520 vis_src1(TMP4, REF_6); | |
| 2521 } | |
| 2522 | |
| 2523 vis_and(TMP6, MASK_7f, TMP6); | |
| 2524 | |
| 2525 vis_and(TMP8, MASK_7f, TMP8); | |
| 2526 | |
| 2527 vis_padd16(TMP10, TMP6, TMP6); | |
| 2528 vis_st64(TMP6, dest[0]); | |
| 2529 | |
| 2530 vis_padd16(TMP12, TMP8, TMP8); | |
| 2531 vis_st64_2(TMP8, dest, 8); | |
| 2532 dest += stride; | |
| 2533 | |
| 2534 vis_xor(REF_0, REF_2, TMP6); | |
| 2535 | |
| 2536 vis_xor(REF_4, REF_6, TMP8); | |
| 2537 | |
| 2538 vis_and(TMP6, MASK_fe, TMP6); | |
| 2539 | |
| 2540 vis_mul8x16(CONST_128, TMP6, TMP6); | |
| 2541 vis_and(TMP8, MASK_fe, TMP8); | |
| 2542 | |
| 2543 vis_mul8x16(CONST_128, TMP8, TMP8); | |
| 2544 vis_and(REF_0, REF_2, TMP10); | |
| 2545 | |
| 2546 vis_and(REF_4, REF_6, TMP12); | |
| 2547 | |
| 2548 vis_and(TMP6, MASK_7f, TMP6); | |
| 2549 | |
| 2550 vis_and(TMP8, MASK_7f, TMP8); | |
| 2551 | |
| 2552 vis_padd16(TMP10, TMP6, TMP6); | |
| 2553 vis_st64(TMP6, dest[0]); | |
| 2554 | |
| 2555 vis_padd16(TMP12, TMP8, TMP8); | |
| 2556 vis_st64_2(TMP8, dest, 8); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2557 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2558 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_put_no_round_x_8_vis: 8-pixel-wide horizontal half-pel "put",
 * truncating ("no rounding") variant (SPARC VIS).
 * Same floor-average trick as MC_put_no_round_x_16_vis:
 *     avg(a,b) = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * with the per-byte shift done via vis_mul8x16(CONST_128, ...) + MASK_7f.
 * REF_0 is the aligned 8 reference bytes, REF_2 the same data shifted by
 * one byte (faligndata at off+1, or vis_src1() when off == 7).
 * Two rows per software-pipelined loop iteration, one 8-byte store each;
 * the final two rows are peeled off after the loop so the last iteration
 * does not prefetch past the reference block.
 */
2559 static void MC_put_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 2560 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2561 { |
| 2979 | 2562 uint8_t *ref = (uint8_t *) _ref; |
| 2563 unsigned long off = (unsigned long) ref & 0x7; | |
| 2564 unsigned long off_plus_1 = off + 1; | |
| 2565 | |
| 2566 ref = vis_alignaddr(ref); | |
| 2567 | |
| 2568 vis_ld64(ref[0], TMP0); | |
| 2569 | |
| 2570 vis_ld64(ref[8], TMP2); | |
| 2571 | |
| 2572 vis_ld64(constants_fe[0], MASK_fe); | |
| 2573 | |
| 2574 vis_ld64(constants_7f[0], MASK_7f); | |
| 2575 | |
| 2576 vis_ld64(constants128[0], CONST_128); | |
| 2577 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2578 | |
| 2579 if (off != 0x7) { | |
| 2580 vis_alignaddr_g0((void *)off_plus_1); | |
| 2581 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2582 } else { | |
| 2583 vis_src1(TMP2, REF_2); | |
| 2584 } | |
| 2585 | |
| 2586 ref += stride; | |
| 2587 height = (height >> 1) - 1; | |
| 2588 | |
| 2589 do { /* 20 cycles */ | |
| 2590 vis_ld64(ref[0], TMP0); | |
| 2591 vis_xor(REF_0, REF_2, TMP4); | |
| 2592 | |
| 2593 vis_ld64_2(ref, 8, TMP2); | |
| 2594 vis_and(TMP4, MASK_fe, TMP4); | |
| 2595 ref += stride; | |
| 2596 | |
| 2597 vis_ld64(ref[0], TMP8); | |
| 2598 vis_and(REF_0, REF_2, TMP6); | |
| 2599 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 2600 | |
| 2601 vis_alignaddr_g0((void *)off); | |
| 2602 | |
| 2603 vis_ld64_2(ref, 8, TMP10); | |
| 2604 ref += stride; | |
| 2605 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2606 | |
| 2607 if (off != 0x7) { | |
| 2608 vis_alignaddr_g0((void *)off_plus_1); | |
| 2609 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2610 } else { | |
| 2611 vis_src1(TMP2, REF_2); | |
| 2612 } | |
| 2613 | |
| 2614 vis_and(TMP4, MASK_7f, TMP4); | |
| 2615 | |
| 2616 vis_padd16(TMP6, TMP4, DST_0); | |
| 2617 vis_st64(DST_0, dest[0]); | |
| 2618 dest += stride; | |
| 2619 | |
| 2620 vis_xor(REF_0, REF_2, TMP12); | |
| 2621 | |
| 2622 vis_and(TMP12, MASK_fe, TMP12); | |
| 2623 | |
| 2624 vis_and(REF_0, REF_2, TMP14); | |
| 2625 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 2626 | |
| 2627 vis_alignaddr_g0((void *)off); | |
| 2628 vis_faligndata(TMP8, TMP10, REF_0); | |
| 2629 if (off != 0x7) { | |
| 2630 vis_alignaddr_g0((void *)off_plus_1); | |
| 2631 vis_faligndata(TMP8, TMP10, REF_2); | |
| 2632 } else { | |
| 2633 vis_src1(TMP10, REF_2); | |
| 2634 } | |
| 2635 | |
| 2636 vis_and(TMP12, MASK_7f, TMP12); | |
| 2637 | |
| 2638 vis_padd16(TMP14, TMP12, DST_0); | |
| 2639 vis_st64(DST_0, dest[0]); | |
| 2640 dest += stride; | |
| 2641 } while (--height); | |
| 2642 | |
/* Epilogue: last two rows, without prefetching another row pair. */
| 2643 vis_ld64(ref[0], TMP0); | |
| 2644 vis_xor(REF_0, REF_2, TMP4); | |
| 2645 | |
| 2646 vis_ld64_2(ref, 8, TMP2); | |
| 2647 vis_and(TMP4, MASK_fe, TMP4); | |
| 2648 | |
| 2649 vis_and(REF_0, REF_2, TMP6); | |
| 2650 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 2651 | |
| 2652 vis_alignaddr_g0((void *)off); | |
| 2653 | |
| 2654 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2655 | |
| 2656 if (off != 0x7) { | |
| 2657 vis_alignaddr_g0((void *)off_plus_1); | |
| 2658 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2659 } else { | |
| 2660 vis_src1(TMP2, REF_2); | |
| 2661 } | |
| 2662 | |
| 2663 vis_and(TMP4, MASK_7f, TMP4); | |
| 2664 | |
| 2665 vis_padd16(TMP6, TMP4, DST_0); | |
| 2666 vis_st64(DST_0, dest[0]); | |
| 2667 dest += stride; | |
| 2668 | |
| 2669 vis_xor(REF_0, REF_2, TMP12); | |
| 2670 | |
| 2671 vis_and(TMP12, MASK_fe, TMP12); | |
| 2672 | |
| 2673 vis_and(REF_0, REF_2, TMP14); | |
| 2674 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 2675 | |
| 2676 vis_and(TMP12, MASK_7f, TMP12); | |
| 2677 | |
| 2678 vis_padd16(TMP14, TMP12, DST_0); | |
| 2679 vis_st64(DST_0, dest[0]); | |
| 2680 dest += stride; | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2681 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2682 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_avg_no_round_x_16_vis: 16-pixel-wide horizontal half-pel prediction
 * averaged into the existing dest pixels, truncating ("no rounding")
 * variant (SPARC VIS).
 * ref[x] and ref[x+1] are widened to 16-bit lanes (vis_mul8x16au with
 * CONST_256 / vis_pmerge with ZERO), the current dest bytes are scaled by
 * CONST_512 (vis_mul8x16al), CONST_3 is added per lane, and vis_pack16
 * under GSR scale factor 5 narrows the sums back down to bytes.
 * NOTE(review): this appears to implement
 *     out = (ref[x] + ref[x+1] + 2*dest + 3) >> 2
 * per byte -- verify the exact scaling against the VIS mul8x16au/al and
 * pack16 specifications before relying on it.
 * One 16-pixel row is produced per loop iteration (no peeled epilogue);
 * the off == 7 case again falls back to vis_src1() for the +1 shift.
 */
2683 static void MC_avg_no_round_x_16_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 2684 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2685 { |
| 2979 | 2686 uint8_t *ref = (uint8_t *) _ref; |
| 2687 unsigned long off = (unsigned long) ref & 0x7; | |
| 2688 unsigned long off_plus_1 = off + 1; | |
| 2689 | |
| 2690 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 2691 | |
| 2692 vis_ld64(constants3[0], CONST_3); | |
| 2693 vis_fzero(ZERO); | |
| 2694 vis_ld64(constants256_512[0], CONST_256); | |
| 2695 | |
| 2696 ref = vis_alignaddr(ref); | |
| 2697 do { /* 26 cycles */ | |
| 2698 vis_ld64(ref[0], TMP0); | |
| 2699 | |
| 2700 vis_ld64(ref[8], TMP2); | |
| 2701 | |
| 2702 vis_alignaddr_g0((void *)off); | |
| 2703 | |
| 2704 vis_ld64(ref[16], TMP4); | |
| 2705 | |
| 2706 vis_ld64(dest[0], DST_0); | |
| 2707 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2708 | |
| 2709 vis_ld64(dest[8], DST_2); | |
| 2710 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2711 | |
| 2712 if (off != 0x7) { | |
| 2713 vis_alignaddr_g0((void *)off_plus_1); | |
| 2714 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2715 vis_faligndata(TMP2, TMP4, REF_6); | |
| 2716 } else { | |
| 2717 vis_src1(TMP2, REF_2); | |
| 2718 vis_src1(TMP4, REF_6); | |
| 2719 } | |
| 2720 | |
| 2721 vis_mul8x16au(REF_0, CONST_256, TMP0); | |
| 2722 | |
| 2723 vis_pmerge(ZERO, REF_2, TMP4); | |
| 2724 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 2725 | |
| 2726 vis_pmerge(ZERO, REF_2_1, TMP6); | |
| 2727 | |
| 2728 vis_padd16(TMP0, TMP4, TMP0); | |
| 2729 | |
| 2730 vis_mul8x16al(DST_0, CONST_512, TMP4); | |
| 2731 vis_padd16(TMP2, TMP6, TMP2); | |
| 2732 | |
| 2733 vis_mul8x16al(DST_1, CONST_512, TMP6); | |
| 2734 | |
| 2735 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
| 2736 | |
| 2737 vis_padd16(TMP0, TMP4, TMP0); | |
| 2738 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
| 2739 | |
| 2740 vis_padd16(TMP2, TMP6, TMP2); | |
| 2741 vis_mul8x16au(REF_4, CONST_256, TMP16); | |
| 2742 | |
| 2743 vis_padd16(TMP0, CONST_3, TMP8); | |
| 2744 vis_mul8x16au(REF_4_1, CONST_256, TMP18); | |
| 2745 | |
| 2746 vis_padd16(TMP2, CONST_3, TMP10); | |
| 2747 vis_pack16(TMP8, DST_0); | |
| 2748 | |
| 2749 vis_pack16(TMP10, DST_1); | |
| 2750 vis_padd16(TMP16, TMP12, TMP0); | |
| 2751 | |
| 2752 vis_st64(DST_0, dest[0]); | |
| 2753 vis_mul8x16al(DST_2, CONST_512, TMP4); | |
| 2754 vis_padd16(TMP18, TMP14, TMP2); | |
| 2755 | |
| 2756 vis_mul8x16al(DST_3, CONST_512, TMP6); | |
| 2757 vis_padd16(TMP0, CONST_3, TMP0); | |
| 2758 | |
| 2759 vis_padd16(TMP2, CONST_3, TMP2); | |
| 2760 | |
| 2761 vis_padd16(TMP0, TMP4, TMP0); | |
| 2762 | |
| 2763 vis_padd16(TMP2, TMP6, TMP2); | |
| 2764 vis_pack16(TMP0, DST_2); | |
| 2765 | |
| 2766 vis_pack16(TMP2, DST_3); | |
| 2767 vis_st64(DST_2, dest[8]); | |
| 2768 | |
| 2769 ref += stride; | |
| 2770 dest += stride; | |
| 2771 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2772 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2773 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_avg_no_round_x_8_vis: 8-pixel-wide horizontal half-pel prediction
 * averaged into dest, truncating ("no rounding") variant (SPARC VIS).
 * Same widened arithmetic as MC_avg_no_round_x_16_vis (mul8x16au with
 * CONST_256 for ref bytes, mul8x16al with CONST_512 for dest bytes, +3
 * bias, pack16 under GSR scale factor 5).
 * Processes FOUR rows per loop iteration (height >>= 2): rows 1/2 use the
 * REF_0..REF_6 registers, rows 3/4 the REF_S0..REF_S6 set.  The dest
 * pixels for rows 3/4 are parked in TMP4/TMP5 (see the inline /\*DST_2*\/
 * and /\*DST_3*\/ annotations) because DST_0..DST_3 are recycled --
 * NOTE(review): TMP5 is presumably the odd half of the 64-bit pair loaded
 * into TMP4; confirm against the register macro definitions.
 * off == 7 again falls back to vis_src1() for the one-byte shift.
 */
2774 static void MC_avg_no_round_x_8_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 2775 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2776 { |
| 2979 | 2777 uint8_t *ref = (uint8_t *) _ref; |
| 2778 unsigned long off = (unsigned long) ref & 0x7; | |
| 2779 unsigned long off_plus_1 = off + 1; | |
| 2780 int stride_times_2 = stride << 1; | |
| 2781 | |
| 2782 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 2783 | |
| 2784 vis_ld64(constants3[0], CONST_3); | |
| 2785 vis_fzero(ZERO); | |
| 2786 vis_ld64(constants256_512[0], CONST_256); | |
| 2787 | |
| 2788 ref = vis_alignaddr(ref); | |
| 2789 height >>= 2; | |
| 2790 do { /* 47 cycles */ | |
| 2791 vis_ld64(ref[0], TMP0); | |
| 2792 | |
| 2793 vis_ld64_2(ref, 8, TMP2); | |
| 2794 ref += stride; | |
| 2795 | |
| 2796 vis_alignaddr_g0((void *)off); | |
| 2797 | |
| 2798 vis_ld64(ref[0], TMP4); | |
| 2799 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2800 | |
| 2801 vis_ld64_2(ref, 8, TMP6); | |
| 2802 ref += stride; | |
| 2803 | |
| 2804 vis_ld64(ref[0], TMP8); | |
| 2805 | |
| 2806 vis_ld64_2(ref, 8, TMP10); | |
| 2807 ref += stride; | |
| 2808 vis_faligndata(TMP4, TMP6, REF_4); | |
| 2809 | |
| 2810 vis_ld64(ref[0], TMP12); | |
| 2811 | |
| 2812 vis_ld64_2(ref, 8, TMP14); | |
| 2813 ref += stride; | |
| 2814 vis_faligndata(TMP8, TMP10, REF_S0); | |
| 2815 | |
| 2816 vis_faligndata(TMP12, TMP14, REF_S4); | |
| 2817 | |
| 2818 if (off != 0x7) { | |
| 2819 vis_alignaddr_g0((void *)off_plus_1); | |
| 2820 | |
| 2821 vis_ld64(dest[0], DST_0); | |
| 2822 vis_faligndata(TMP0, TMP2, REF_2); | |
| 2823 | |
| 2824 vis_ld64_2(dest, stride, DST_2); | |
| 2825 vis_faligndata(TMP4, TMP6, REF_6); | |
| 2826 | |
| 2827 vis_faligndata(TMP8, TMP10, REF_S2); | |
| 2828 | |
| 2829 vis_faligndata(TMP12, TMP14, REF_S6); | |
| 2830 } else { | |
| 2831 vis_ld64(dest[0], DST_0); | |
| 2832 vis_src1(TMP2, REF_2); | |
| 2833 | |
| 2834 vis_ld64_2(dest, stride, DST_2); | |
| 2835 vis_src1(TMP6, REF_6); | |
| 2836 | |
| 2837 vis_src1(TMP10, REF_S2); | |
| 2838 | |
| 2839 vis_src1(TMP14, REF_S6); | |
| 2840 } | |
| 2841 | |
| 2842 vis_pmerge(ZERO, REF_0, TMP0); | |
| 2843 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 2844 | |
| 2845 vis_pmerge(ZERO, REF_2, TMP4); | |
| 2846 vis_mul8x16au(REF_2_1, CONST_256, TMP6); | |
| 2847 | |
| 2848 vis_padd16(TMP0, CONST_3, TMP0); | |
| 2849 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
| 2850 | |
| 2851 vis_padd16(TMP2, CONST_3, TMP2); | |
| 2852 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
| 2853 | |
| 2854 vis_padd16(TMP0, TMP4, TMP0); | |
| 2855 vis_mul8x16au(REF_4, CONST_256, TMP8); | |
| 2856 | |
| 2857 vis_padd16(TMP2, TMP6, TMP2); | |
| 2858 vis_mul8x16au(REF_4_1, CONST_256, TMP10); | |
| 2859 | |
| 2860 vis_padd16(TMP0, TMP16, TMP0); | |
| 2861 vis_mul8x16au(REF_6, CONST_256, TMP12); | |
| 2862 | |
| 2863 vis_padd16(TMP2, TMP18, TMP2); | |
| 2864 vis_mul8x16au(REF_6_1, CONST_256, TMP14); | |
| 2865 | |
| 2866 vis_padd16(TMP8, CONST_3, TMP8); | |
| 2867 vis_mul8x16al(DST_2, CONST_512, TMP16); | |
| 2868 | |
| 2869 vis_padd16(TMP8, TMP12, TMP8); | |
| 2870 vis_mul8x16al(DST_3, CONST_512, TMP18); | |
| 2871 | |
| 2872 vis_padd16(TMP10, TMP14, TMP10); | |
| 2873 vis_pack16(TMP0, DST_0); | |
| 2874 | |
| 2875 vis_pack16(TMP2, DST_1); | |
| 2876 vis_st64(DST_0, dest[0]); | |
| 2877 dest += stride; | |
| 2878 vis_padd16(TMP10, CONST_3, TMP10); | |
| 2879 | |
| 2880 vis_ld64_2(dest, stride, DST_0); | |
| 2881 vis_padd16(TMP8, TMP16, TMP8); | |
| 2882 | |
| 2883 vis_ld64_2(dest, stride_times_2, TMP4/*DST_2*/); | |
| 2884 vis_padd16(TMP10, TMP18, TMP10); | |
| 2885 vis_pack16(TMP8, DST_2); | |
| 2886 | |
| 2887 vis_pack16(TMP10, DST_3); | |
| 2888 vis_st64(DST_2, dest[0]); | |
| 2889 dest += stride; | |
| 2890 | |
| 2891 vis_mul8x16au(REF_S0_1, CONST_256, TMP2); | |
| 2892 vis_pmerge(ZERO, REF_S0, TMP0); | |
| 2893 | |
| 2894 vis_pmerge(ZERO, REF_S2, TMP24); | |
| 2895 vis_mul8x16au(REF_S2_1, CONST_256, TMP6); | |
| 2896 | |
| 2897 vis_padd16(TMP0, CONST_3, TMP0); | |
| 2898 vis_mul8x16au(REF_S4, CONST_256, TMP8); | |
| 2899 | |
| 2900 vis_padd16(TMP2, CONST_3, TMP2); | |
| 2901 vis_mul8x16au(REF_S4_1, CONST_256, TMP10); | |
| 2902 | |
| 2903 vis_padd16(TMP0, TMP24, TMP0); | |
| 2904 vis_mul8x16au(REF_S6, CONST_256, TMP12); | |
| 2905 | |
| 2906 vis_padd16(TMP2, TMP6, TMP2); | |
| 2907 vis_mul8x16au(REF_S6_1, CONST_256, TMP14); | |
| 2908 | |
| 2909 vis_padd16(TMP8, CONST_3, TMP8); | |
| 2910 vis_mul8x16al(DST_0, CONST_512, TMP16); | |
| 2911 | |
| 2912 vis_padd16(TMP10, CONST_3, TMP10); | |
| 2913 vis_mul8x16al(DST_1, CONST_512, TMP18); | |
| 2914 | |
| 2915 vis_padd16(TMP8, TMP12, TMP8); | |
| 2916 vis_mul8x16al(TMP4/*DST_2*/, CONST_512, TMP20); | |
| 2917 | |
| 2918 vis_mul8x16al(TMP5/*DST_3*/, CONST_512, TMP22); | |
| 2919 vis_padd16(TMP0, TMP16, TMP0); | |
| 2920 | |
| 2921 vis_padd16(TMP2, TMP18, TMP2); | |
| 2922 vis_pack16(TMP0, DST_0); | |
| 2923 | |
| 2924 vis_padd16(TMP10, TMP14, TMP10); | |
| 2925 vis_pack16(TMP2, DST_1); | |
| 2926 vis_st64(DST_0, dest[0]); | |
| 2927 dest += stride; | |
| 2928 | |
| 2929 vis_padd16(TMP8, TMP20, TMP8); | |
| 2930 | |
| 2931 vis_padd16(TMP10, TMP22, TMP10); | |
| 2932 vis_pack16(TMP8, DST_2); | |
| 2933 | |
| 2934 vis_pack16(TMP10, DST_3); | |
| 2935 vis_st64(DST_2, dest[0]); | |
| 2936 dest += stride; | |
| 2937 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2938 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2939 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_put_no_round_y_16_vis: 16-pixel-wide VERTICAL half-pel "put",
 * truncating ("no rounding") variant (SPARC VIS).
 * Each output byte is the floor average of the pixel in the current row
 * and the pixel directly below it:
 *     avg(a,b) = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * (per-byte >>1 via vis_mul8x16(CONST_128, ...) + MASK_7f mask).
 * REF_0/REF_4 hold row y, REF_2/REF_6 row y+1; each loop iteration
 * produces two output rows while loading the next two reference rows,
 * and the registers are rotated so the freshly loaded row becomes the
 * "upper" row of the next pair.  The final two rows are peeled off after
 * the do/while so the last iteration does not read below the block.
 */
2940 static void MC_put_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 2941 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
2942 { |
| 2979 | 2943 uint8_t *ref = (uint8_t *) _ref; |
| 2944 | |
| 2945 ref = vis_alignaddr(ref); | |
| 2946 vis_ld64(ref[0], TMP0); | |
| 2947 | |
| 2948 vis_ld64_2(ref, 8, TMP2); | |
| 2949 | |
| 2950 vis_ld64_2(ref, 16, TMP4); | |
| 2951 ref += stride; | |
| 2952 | |
| 2953 vis_ld64(ref[0], TMP6); | |
| 2954 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2955 | |
| 2956 vis_ld64_2(ref, 8, TMP8); | |
| 2957 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2958 | |
| 2959 vis_ld64_2(ref, 16, TMP10); | |
| 2960 ref += stride; | |
| 2961 | |
| 2962 vis_ld64(constants_fe[0], MASK_fe); | |
| 2963 vis_faligndata(TMP6, TMP8, REF_2); | |
| 2964 | |
| 2965 vis_ld64(constants_7f[0], MASK_7f); | |
| 2966 vis_faligndata(TMP8, TMP10, REF_6); | |
| 2967 | |
| 2968 vis_ld64(constants128[0], CONST_128); | |
| 2969 height = (height >> 1) - 1; | |
| 2970 do { /* 24 cycles */ | |
| 2971 vis_ld64(ref[0], TMP0); | |
| 2972 vis_xor(REF_0, REF_2, TMP12); | |
| 2973 | |
| 2974 vis_ld64_2(ref, 8, TMP2); | |
| 2975 vis_xor(REF_4, REF_6, TMP16); | |
| 2976 | |
| 2977 vis_ld64_2(ref, 16, TMP4); | |
| 2978 ref += stride; | |
| 2979 vis_and(REF_0, REF_2, TMP14); | |
| 2980 | |
| 2981 vis_ld64(ref[0], TMP6); | |
| 2982 vis_and(REF_4, REF_6, TMP18); | |
| 2983 | |
| 2984 vis_ld64_2(ref, 8, TMP8); | |
| 2985 vis_faligndata(TMP0, TMP2, REF_0); | |
| 2986 | |
| 2987 vis_ld64_2(ref, 16, TMP10); | |
| 2988 ref += stride; | |
| 2989 vis_faligndata(TMP2, TMP4, REF_4); | |
| 2990 | |
| 2991 vis_and(TMP12, MASK_fe, TMP12); | |
| 2992 | |
| 2993 vis_and(TMP16, MASK_fe, TMP16); | |
| 2994 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 2995 | |
| 2996 vis_mul8x16(CONST_128, TMP16, TMP16); | |
| 2997 vis_xor(REF_0, REF_2, TMP0); | |
| 2998 | |
| 2999 vis_xor(REF_4, REF_6, TMP2); | |
| 3000 | |
| 3001 vis_and(REF_0, REF_2, TMP20); | |
| 3002 | |
| 3003 vis_and(TMP12, MASK_7f, TMP12); | |
| 3004 | |
| 3005 vis_and(TMP16, MASK_7f, TMP16); | |
| 3006 | |
| 3007 vis_padd16(TMP14, TMP12, TMP12); | |
| 3008 vis_st64(TMP12, dest[0]); | |
| 3009 | |
| 3010 vis_padd16(TMP18, TMP16, TMP16); | |
| 3011 vis_st64_2(TMP16, dest, 8); | |
| 3012 dest += stride; | |
| 3013 | |
| 3014 vis_and(REF_4, REF_6, TMP18); | |
| 3015 | |
| 3016 vis_and(TMP0, MASK_fe, TMP0); | |
| 3017 | |
| 3018 vis_and(TMP2, MASK_fe, TMP2); | |
| 3019 vis_mul8x16(CONST_128, TMP0, TMP0); | |
| 3020 | |
| 3021 vis_faligndata(TMP6, TMP8, REF_2); | |
| 3022 vis_mul8x16(CONST_128, TMP2, TMP2); | |
| 3023 | |
| 3024 vis_faligndata(TMP8, TMP10, REF_6); | |
| 3025 | |
| 3026 vis_and(TMP0, MASK_7f, TMP0); | |
| 3027 | |
| 3028 vis_and(TMP2, MASK_7f, TMP2); | |
| 3029 | |
| 3030 vis_padd16(TMP20, TMP0, TMP0); | |
| 3031 vis_st64(TMP0, dest[0]); | |
| 3032 | |
| 3033 vis_padd16(TMP18, TMP2, TMP2); | |
| 3034 vis_st64_2(TMP2, dest, 8); | |
| 3035 dest += stride; | |
| 3036 } while (--height); | |
| 3037 | |
/* Epilogue: last two rows, without loading another reference row pair. */
| 3038 vis_ld64(ref[0], TMP0); | |
| 3039 vis_xor(REF_0, REF_2, TMP12); | |
| 3040 | |
| 3041 vis_ld64_2(ref, 8, TMP2); | |
| 3042 vis_xor(REF_4, REF_6, TMP16); | |
| 3043 | |
| 3044 vis_ld64_2(ref, 16, TMP4); | |
| 3045 vis_and(REF_0, REF_2, TMP14); | |
| 3046 | |
| 3047 vis_and(REF_4, REF_6, TMP18); | |
| 3048 | |
| 3049 vis_faligndata(TMP0, TMP2, REF_0); | |
| 3050 | |
| 3051 vis_faligndata(TMP2, TMP4, REF_4); | |
| 3052 | |
| 3053 vis_and(TMP12, MASK_fe, TMP12); | |
| 3054 | |
| 3055 vis_and(TMP16, MASK_fe, TMP16); | |
| 3056 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 3057 | |
| 3058 vis_mul8x16(CONST_128, TMP16, TMP16); | |
| 3059 vis_xor(REF_0, REF_2, TMP0); | |
| 3060 | |
| 3061 vis_xor(REF_4, REF_6, TMP2); | |
| 3062 | |
| 3063 vis_and(REF_0, REF_2, TMP20); | |
| 3064 | |
| 3065 vis_and(TMP12, MASK_7f, TMP12); | |
| 3066 | |
| 3067 vis_and(TMP16, MASK_7f, TMP16); | |
| 3068 | |
| 3069 vis_padd16(TMP14, TMP12, TMP12); | |
| 3070 vis_st64(TMP12, dest[0]); | |
| 3071 | |
| 3072 vis_padd16(TMP18, TMP16, TMP16); | |
| 3073 vis_st64_2(TMP16, dest, 8); | |
| 3074 dest += stride; | |
| 3075 | |
| 3076 vis_and(REF_4, REF_6, TMP18); | |
| 3077 | |
| 3078 vis_and(TMP0, MASK_fe, TMP0); | |
| 3079 | |
| 3080 vis_and(TMP2, MASK_fe, TMP2); | |
| 3081 vis_mul8x16(CONST_128, TMP0, TMP0); | |
| 3082 | |
| 3083 vis_mul8x16(CONST_128, TMP2, TMP2); | |
| 3084 | |
| 3085 vis_and(TMP0, MASK_7f, TMP0); | |
| 3086 | |
| 3087 vis_and(TMP2, MASK_7f, TMP2); | |
| 3088 | |
| 3089 vis_padd16(TMP20, TMP0, TMP0); | |
| 3090 vis_st64(TMP0, dest[0]); | |
| 3091 | |
| 3092 vis_padd16(TMP18, TMP2, TMP2); | |
| 3093 vis_st64_2(TMP2, dest, 8); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3094 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3095 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_put_no_round_y_8_vis: 8-pixel-wide VERTICAL half-pel "put",
 * truncating ("no rounding") variant (SPARC VIS).
 * Floor-averages each row with the row below it using
 *     avg(a,b) = (a & b) + (((a ^ b) & 0xfe) >> 1)
 * (per-byte >>1 via vis_mul8x16(CONST_128, ...) + MASK_7f).
 * REF_0 is the current row, REF_2 the next; the loop emits two rows per
 * iteration while rotating the freshly loaded rows into REF_0/REF_2.
 * The last two rows are peeled off after the loop so no reference rows
 * below the block are read.
 */
3096 static void MC_put_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 3097 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3098 { |
| 2979 | 3099 uint8_t *ref = (uint8_t *) _ref; |
| 3100 | |
| 3101 ref = vis_alignaddr(ref); | |
| 3102 vis_ld64(ref[0], TMP0); | |
| 3103 | |
| 3104 vis_ld64_2(ref, 8, TMP2); | |
| 3105 ref += stride; | |
| 3106 | |
| 3107 vis_ld64(ref[0], TMP4); | |
| 3108 | |
| 3109 vis_ld64_2(ref, 8, TMP6); | |
| 3110 ref += stride; | |
| 3111 | |
| 3112 vis_ld64(constants_fe[0], MASK_fe); | |
| 3113 vis_faligndata(TMP0, TMP2, REF_0); | |
| 3114 | |
| 3115 vis_ld64(constants_7f[0], MASK_7f); | |
| 3116 vis_faligndata(TMP4, TMP6, REF_2); | |
| 3117 | |
| 3118 vis_ld64(constants128[0], CONST_128); | |
| 3119 height = (height >> 1) - 1; | |
| 3120 do { /* 12 cycles */ | |
| 3121 vis_ld64(ref[0], TMP0); | |
| 3122 vis_xor(REF_0, REF_2, TMP4); | |
| 3123 | |
| 3124 vis_ld64_2(ref, 8, TMP2); | |
| 3125 ref += stride; | |
| 3126 vis_and(TMP4, MASK_fe, TMP4); | |
| 3127 | |
| 3128 vis_and(REF_0, REF_2, TMP6); | |
| 3129 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 3130 | |
| 3131 vis_faligndata(TMP0, TMP2, REF_0); | |
| 3132 vis_ld64(ref[0], TMP0); | |
| 3133 | |
| 3134 vis_ld64_2(ref, 8, TMP2); | |
| 3135 ref += stride; | |
| 3136 vis_xor(REF_0, REF_2, TMP12); | |
| 3137 | |
| 3138 vis_and(TMP4, MASK_7f, TMP4); | |
| 3139 | |
| 3140 vis_and(TMP12, MASK_fe, TMP12); | |
| 3141 | |
| 3142 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 3143 vis_and(REF_0, REF_2, TMP14); | |
| 3144 | |
| 3145 vis_padd16(TMP6, TMP4, DST_0); | |
| 3146 vis_st64(DST_0, dest[0]); | |
| 3147 dest += stride; | |
| 3148 | |
| 3149 vis_faligndata(TMP0, TMP2, REF_2); | |
| 3150 | |
| 3151 vis_and(TMP12, MASK_7f, TMP12); | |
| 3152 | |
| 3153 vis_padd16(TMP14, TMP12, DST_0); | |
| 3154 vis_st64(DST_0, dest[0]); | |
| 3155 dest += stride; | |
| 3156 } while (--height); | |
| 3157 | |
/* Epilogue: last two rows, no further row prefetch. */
| 3158 vis_ld64(ref[0], TMP0); | |
| 3159 vis_xor(REF_0, REF_2, TMP4); | |
| 3160 | |
| 3161 vis_ld64_2(ref, 8, TMP2); | |
| 3162 vis_and(TMP4, MASK_fe, TMP4); | |
| 3163 | |
| 3164 vis_and(REF_0, REF_2, TMP6); | |
| 3165 vis_mul8x16(CONST_128, TMP4, TMP4); | |
| 3166 | |
| 3167 vis_faligndata(TMP0, TMP2, REF_0); | |
| 3168 | |
| 3169 vis_xor(REF_0, REF_2, TMP12); | |
| 3170 | |
| 3171 vis_and(TMP4, MASK_7f, TMP4); | |
| 3172 | |
| 3173 vis_and(TMP12, MASK_fe, TMP12); | |
| 3174 | |
| 3175 vis_mul8x16(CONST_128, TMP12, TMP12); | |
| 3176 vis_and(REF_0, REF_2, TMP14); | |
| 3177 | |
| 3178 vis_padd16(TMP6, TMP4, DST_0); | |
| 3179 vis_st64(DST_0, dest[0]); | |
| 3180 dest += stride; | |
| 3181 | |
| 3182 vis_and(TMP12, MASK_7f, TMP12); | |
| 3183 | |
| 3184 vis_padd16(TMP14, TMP12, DST_0); | |
| 3185 vis_st64(DST_0, dest[0]); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3186 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3187 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/*
 * MC_avg_no_round_y_16_vis: 16-pixel-wide VERTICAL half-pel prediction
 * averaged into dest, truncating ("no rounding") variant (SPARC VIS).
 * Reference rows y and y+1 are widened to 16-bit lanes (pmerge/mul8x16au
 * with ZERO/CONST_256), the current dest bytes are scaled by CONST_512
 * (mul8x16al), CONST_3 is added, and vis_pack16 under GSR scale factor 5
 * narrows back to bytes.  NOTE(review): appears to implement
 *     out = (cur_row + next_row + 2*dest + 3) >> 2  per byte --
 * verify against the VIS mul8x16au/al and pack16 specifications.
 * Two output rows per iteration (height >>= 1); the dest pixels of the
 * second row are loaded into REF_S0/REF_S2 (see the /\*DST_4*\/ and
 * /\*DST_6*\/ annotations) because DST_0..DST_3 are busy with row one,
 * and REF_S4/REF_S6 are similarly reused as arithmetic temporaries.
 */
3188 static void MC_avg_no_round_y_16_vis (uint8_t * dest, const uint8_t * _ref,
| 2979 | 3189 const int stride, int height) |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3190 { |
| 2979 | 3191 uint8_t *ref = (uint8_t *) _ref; |
| 3192 int stride_8 = stride + 8; | |
| 3193 int stride_16 = stride + 16; | |
| 3194 | |
| 3195 vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT); | |
| 3196 | |
| 3197 ref = vis_alignaddr(ref); | |
| 3198 | |
| 3199 vis_ld64(ref[ 0], TMP0); | |
| 3200 vis_fzero(ZERO); | |
| 3201 | |
| 3202 vis_ld64(ref[ 8], TMP2); | |
| 3203 | |
| 3204 vis_ld64(ref[16], TMP4); | |
| 3205 | |
| 3206 vis_ld64(constants3[0], CONST_3); | |
| 3207 vis_faligndata(TMP0, TMP2, REF_2); | |
| 3208 | |
| 3209 vis_ld64(constants256_512[0], CONST_256); | |
| 3210 vis_faligndata(TMP2, TMP4, REF_6); | |
| 3211 height >>= 1; | |
| 3212 | |
| 3213 do { /* 31 cycles */ | |
| 3214 vis_ld64_2(ref, stride, TMP0); | |
| 3215 vis_pmerge(ZERO, REF_2, TMP12); | |
| 3216 vis_mul8x16au(REF_2_1, CONST_256, TMP14); | |
| 3217 | |
| 3218 vis_ld64_2(ref, stride_8, TMP2); | |
| 3219 vis_pmerge(ZERO, REF_6, TMP16); | |
| 3220 vis_mul8x16au(REF_6_1, CONST_256, TMP18); | |
| 3221 | |
| 3222 vis_ld64_2(ref, stride_16, TMP4); | |
| 3223 ref += stride; | |
| 3224 | |
| 3225 vis_ld64(dest[0], DST_0); | |
| 3226 vis_faligndata(TMP0, TMP2, REF_0); | |
| 3227 | |
| 3228 vis_ld64_2(dest, 8, DST_2); | |
| 3229 vis_faligndata(TMP2, TMP4, REF_4); | |
| 3230 | |
| 3231 vis_ld64_2(ref, stride, TMP6); | |
| 3232 vis_pmerge(ZERO, REF_0, TMP0); | |
| 3233 vis_mul8x16au(REF_0_1, CONST_256, TMP2); | |
| 3234 | |
| 3235 vis_ld64_2(ref, stride_8, TMP8); | |
| 3236 vis_pmerge(ZERO, REF_4, TMP4); | |
| 3237 | |
| 3238 vis_ld64_2(ref, stride_16, TMP10); | |
| 3239 ref += stride; | |
| 3240 | |
| 3241 vis_ld64_2(dest, stride, REF_S0/*DST_4*/); | |
| 3242 vis_faligndata(TMP6, TMP8, REF_2); | |
| 3243 vis_mul8x16au(REF_4_1, CONST_256, TMP6); | |
| 3244 | |
| 3245 vis_ld64_2(dest, stride_8, REF_S2/*DST_6*/); | |
| 3246 vis_faligndata(TMP8, TMP10, REF_6); | |
| 3247 vis_mul8x16al(DST_0, CONST_512, TMP20); | |
| 3248 | |
| 3249 vis_padd16(TMP0, CONST_3, TMP0); | |
| 3250 vis_mul8x16al(DST_1, CONST_512, TMP22); | |
| 3251 | |
| 3252 vis_padd16(TMP2, CONST_3, TMP2); | |
| 3253 vis_mul8x16al(DST_2, CONST_512, TMP24); | |
| 3254 | |
| 3255 vis_padd16(TMP4, CONST_3, TMP4); | |
| 3256 vis_mul8x16al(DST_3, CONST_512, TMP26); | |
| 3257 | |
| 3258 vis_padd16(TMP6, CONST_3, TMP6); | |
| 3259 | |
| 3260 vis_padd16(TMP12, TMP20, TMP12); | |
| 3261 vis_mul8x16al(REF_S0, CONST_512, TMP20); | |
| 3262 | |
| 3263 vis_padd16(TMP14, TMP22, TMP14); | |
| 3264 vis_mul8x16al(REF_S0_1, CONST_512, TMP22); | |
| 3265 | |
| 3266 vis_padd16(TMP16, TMP24, TMP16); | |
| 3267 vis_mul8x16al(REF_S2, CONST_512, TMP24); | |
| 3268 | |
| 3269 vis_padd16(TMP18, TMP26, TMP18); | |
| 3270 vis_mul8x16al(REF_S2_1, CONST_512, TMP26); | |
| 3271 | |
| 3272 vis_padd16(TMP12, TMP0, TMP12); | |
| 3273 vis_mul8x16au(REF_2, CONST_256, TMP28); | |
| 3274 | |
| 3275 vis_padd16(TMP14, TMP2, TMP14); | |
| 3276 vis_mul8x16au(REF_2_1, CONST_256, TMP30); | |
| 3277 | |
| 3278 vis_padd16(TMP16, TMP4, TMP16); | |
| 3279 vis_mul8x16au(REF_6, CONST_256, REF_S4); | |
| 3280 | |
| 3281 vis_padd16(TMP18, TMP6, TMP18); | |
| 3282 vis_mul8x16au(REF_6_1, CONST_256, REF_S6); | |
| 3283 | |
| 3284 vis_pack16(TMP12, DST_0); | |
| 3285 vis_padd16(TMP28, TMP0, TMP12); | |
| 3286 | |
| 3287 vis_pack16(TMP14, DST_1); | |
| 3288 vis_st64(DST_0, dest[0]); | |
| 3289 vis_padd16(TMP30, TMP2, TMP14); | |
| 3290 | |
| 3291 vis_pack16(TMP16, DST_2); | |
| 3292 vis_padd16(REF_S4, TMP4, TMP16); | |
| 3293 | |
| 3294 vis_pack16(TMP18, DST_3); | |
| 3295 vis_st64_2(DST_2, dest, 8); | |
| 3296 dest += stride; | |
| 3297 vis_padd16(REF_S6, TMP6, TMP18); | |
| 3298 | |
| 3299 vis_padd16(TMP12, TMP20, TMP12); | |
| 3300 | |
| 3301 vis_padd16(TMP14, TMP22, TMP14); | |
| 3302 vis_pack16(TMP12, DST_0); | |
| 3303 | |
| 3304 vis_padd16(TMP16, TMP24, TMP16); | |
| 3305 vis_pack16(TMP14, DST_1); | |
| 3306 vis_st64(DST_0, dest[0]); | |
| 3307 | |
| 3308 vis_padd16(TMP18, TMP26, TMP18); | |
| 3309 vis_pack16(TMP16, DST_2); | |
| 3310 | |
| 3311 vis_pack16(TMP18, DST_3); | |
| 3312 vis_st64_2(DST_2, dest, 8); | |
| 3313 dest += stride; | |
| 3314 } while (--height); | |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3315 } |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3316 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average (no-rounding) vertical half-pel motion compensation, 8-pixel wide.
 * Interpolates vertically between each row and the next (ref[y], ref[y+1]),
 * averages the result into the existing contents of dest, and processes two
 * output rows per loop iteration.  GSR scale factor 5 matches the pack16
 * normalization for the sum of two interpolation taps plus the dest term.
 * NOTE(review): "no round" refers to the MPEG no_rnd prediction variant
 * (rounding constant CONST_3 instead of the rounded value) — confirm against
 * the rounded MC_avg_y_8_vis counterpart earlier in this file. */
static void MC_avg_no_round_y_8_vis (uint8_t * dest, const uint8_t * _ref,
                                     const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    /* Align the reference pointer; faligndata below re-creates the
     * unaligned 8-byte row from two aligned 8-byte loads. */
    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_2 with the first source row. */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants3[0], CONST_3);
    vis_faligndata(TMP0, TMP2, REF_2);

    vis_ld64(constants256_512[0], CONST_256);

    height >>= 1;               /* two rows produced per iteration */
    do {        /* 20 cycles */
        /* Loads, multiplies and adds are manually interleaved for the
         * UltraSPARC pipeline — do not reorder statements. */
        vis_ld64_2(ref, stride, TMP0);
        vis_pmerge(ZERO, REF_2, TMP8);
        vis_mul8x16au(REF_2_1, CONST_256, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;

        vis_ld64(dest[0], DST_0);

        vis_ld64_2(dest, stride, DST_2);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride, TMP4);
        vis_mul8x16al(DST_0, CONST_512, TMP16);
        vis_pmerge(ZERO, REF_0, TMP12);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_mul8x16al(DST_1, CONST_512, TMP18);
        vis_pmerge(ZERO, REF_0_1, TMP14);

        vis_padd16(TMP12, CONST_3, TMP12);
        vis_mul8x16al(DST_2, CONST_512, TMP24);

        vis_padd16(TMP14, CONST_3, TMP14);
        vis_mul8x16al(DST_3, CONST_512, TMP26);

        vis_faligndata(TMP4, TMP6, REF_2);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_mul8x16au(REF_2, CONST_256, TMP20);

        vis_padd16(TMP8, TMP16, TMP0);
        vis_mul8x16au(REF_2_1, CONST_256, TMP22);

        vis_padd16(TMP10, TMP18, TMP2);
        vis_pack16(TMP0, DST_0);

        vis_pack16(TMP2, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP12, TMP20, TMP12);

        vis_padd16(TMP14, TMP22, TMP14);

        vis_padd16(TMP12, TMP24, TMP0);

        vis_padd16(TMP14, TMP26, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3396 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Put (no-rounding) diagonal (x+y) half-pel motion compensation, 16 wide.
 * Each output pixel is the 4-tap average of ref[x,y], ref[x+1,y], ref[x,y+1]
 * and ref[x+1,y+1], using the no_rnd rounding constant (CONST_1/CONST_2).
 * Two output rows are produced per loop iteration; GSR scale factor 5
 * normalizes the 4-tap sum in pack16.
 * The off/off_plus_1 alignaddr pair selects the byte-shifted "x+1" view of
 * the same aligned loads; the off == 0x7 special case uses vis_src1 because
 * faligndata cannot express a full 8-byte shift from a single register pair. */
static void MC_put_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                                       const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prologue: load first row (24 bytes cover 16 unaligned pixels + x+1). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants1[0], CONST_1);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;               /* two rows per iteration */
    do {
        /* Hand-scheduled software pipeline — statement order is load/ALU
         * interleaving for the UltraSPARC; do not reorder. */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16au(REF_0, CONST_256, TMP0);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_padd16(TMP0, CONST_2, TMP8);
        vis_mul8x16au(REF_4, CONST_256, TMP0);

        vis_padd16(TMP2, CONST_1, TMP10);
        vis_mul8x16au(REF_4_1, CONST_256, TMP2);

        vis_padd16(TMP8, TMP4, TMP8);
        vis_mul8x16au(REF_6, CONST_256, TMP4);

        vis_padd16(TMP10, TMP6, TMP10);
        vis_mul8x16au(REF_6_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP8, TMP12);

        vis_padd16(TMP14, TMP10, TMP14);

        vis_padd16(TMP12, TMP16, TMP12);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP0, CONST_1, TMP12);

        vis_mul8x16au(REF_S0, CONST_256, TMP0);
        vis_padd16(TMP2, CONST_1, TMP14);

        vis_mul8x16au(REF_S0_1, CONST_256, TMP2);
        vis_padd16(TMP12, TMP4, TMP12);

        vis_mul8x16au(REF_S2, CONST_256, TMP4);
        vis_padd16(TMP14, TMP6, TMP14);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP6);
        vis_padd16(TMP20, TMP12, TMP20);

        vis_padd16(TMP22, TMP14, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
        vis_padd16(TMP0, TMP4, TMP24);

        vis_mul8x16au(REF_S4, CONST_256, TMP0);
        vis_padd16(TMP2, TMP6, TMP26);

        vis_mul8x16au(REF_S4_1, CONST_256, TMP2);
        vis_padd16(TMP24, TMP8, TMP24);

        vis_padd16(TMP26, TMP10, TMP26);
        vis_pack16(TMP24, DST_0);

        vis_pack16(TMP26, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_pmerge(ZERO, REF_S6, TMP4);

        vis_pmerge(ZERO, REF_S6_1, TMP6);

        vis_padd16(TMP0, TMP4, TMP0);

        vis_padd16(TMP2, TMP6, TMP2);

        vis_padd16(TMP0, TMP12, TMP0);

        vis_padd16(TMP2, TMP14, TMP2);
        vis_pack16(TMP0, DST_2);

        vis_pack16(TMP2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3561 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Put (no-rounding) diagonal (x+y) half-pel motion compensation, 8 wide.
 * 4-tap average of the 2x2 pixel neighborhood with the no_rnd rounding
 * constant CONST_1; two output rows per loop iteration, GSR scale 5.
 * off/off_plus_1 select the aligned and the byte-shifted (x+1) views of the
 * same loads; off == 0x7 needs vis_src1 since faligndata cannot shift by 8. */
static void MC_put_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(5 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_S0/REF_S2 with the first source row. */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(constants1[0], CONST_1);

    vis_ld64(constants256_512[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;               /* two rows per iteration */
    do {        /* 26 cycles */
        /* Hand-scheduled pipeline — do not reorder statements. */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S2, TMP12);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S0_1, CONST_256, TMP10);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_ld64_2(ref, stride, TMP4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_pmerge(ZERO, REF_S4, TMP18);

        vis_pmerge(ZERO, REF_S4_1, TMP20);

        vis_faligndata(TMP4, TMP6, REF_S0);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_padd16(TMP18, CONST_1, TMP18);
        vis_mul8x16au(REF_S6, CONST_256, TMP22);

        vis_padd16(TMP20, CONST_1, TMP20);
        vis_mul8x16au(REF_S6_1, CONST_256, TMP24);

        vis_mul8x16au(REF_S0, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S0_1, TMP28);

        vis_mul8x16au(REF_S2, CONST_256, TMP30);
        vis_padd16(TMP18, TMP22, TMP18);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP32);
        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP8, TMP18, TMP8);

        vis_padd16(TMP10, TMP20, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;
        vis_padd16(TMP18, TMP26, TMP18);

        vis_padd16(TMP20, TMP28, TMP20);

        vis_padd16(TMP18, TMP30, TMP18);

        vis_padd16(TMP20, TMP32, TMP20);
        vis_pack16(TMP18, DST_2);

        vis_pack16(TMP20, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3666 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average (no-rounding) diagonal (x+y) half-pel motion compensation, 16 wide.
 * Computes the 4-tap 2x2 interpolation (no_rnd constant CONST_6) and averages
 * it with the current dest contents (dest term scaled by CONST_1024).  Two
 * output rows per loop iteration; GSR scale factor 4 normalizes the larger
 * sum (four taps plus the dest term).
 * off/off_plus_1 alignaddr selects aligned vs. x+1 byte-shifted views of the
 * same loads; off == 0x7 requires vis_src1 (faligndata max shift is 7). */
static void MC_avg_no_round_xy_16_vis (uint8_t * dest, const uint8_t * _ref,
                                       const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;
    int stride_16 = stride + 16;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prologue: first source row (24 bytes for 16 unaligned pixels + x+1). */
    vis_ld64(ref[ 0], TMP0);
    vis_fzero(ZERO);

    vis_ld64(ref[ 8], TMP2);

    vis_ld64(ref[16], TMP4);

    vis_ld64(constants6[0], CONST_6);
    vis_faligndata(TMP0, TMP2, REF_S0);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP2, TMP4, REF_S4);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
        vis_faligndata(TMP2, TMP4, REF_S6);
    } else {
        vis_src1(TMP2, REF_S2);
        vis_src1(TMP4, REF_S6);
    }

    height >>= 1;               /* two rows per iteration */
    do {        /* 55 cycles */
        /* Hand-scheduled pipeline; REF_* registers are reused as scratch
         * accumulators late in the body — do not reorder statements. */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S0_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride_8, TMP2);
        vis_mul8x16au(REF_S2, CONST_256, TMP16);
        vis_pmerge(ZERO, REF_S2_1, TMP18);

        vis_ld64_2(ref, stride_16, TMP4);
        ref += stride;
        vis_mul8x16au(REF_S4, CONST_256, TMP20);
        vis_pmerge(ZERO, REF_S4_1, TMP22);

        vis_ld64_2(ref, stride, TMP6);
        vis_mul8x16au(REF_S6, CONST_256, TMP24);
        vis_pmerge(ZERO, REF_S6_1, TMP26);

        vis_ld64_2(ref, stride_8, TMP8);
        vis_faligndata(TMP0, TMP2, REF_0);

        vis_ld64_2(ref, stride_16, TMP10);
        ref += stride;
        vis_faligndata(TMP2, TMP4, REF_4);

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP6, TMP8, REF_S0);

        vis_ld64_2(dest, 8, DST_2);
        vis_faligndata(TMP8, TMP10, REF_S4);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_2);
            vis_faligndata(TMP2, TMP4, REF_6);
            vis_faligndata(TMP6, TMP8, REF_S2);
            vis_faligndata(TMP8, TMP10, REF_S6);
        } else {
            vis_src1(TMP2, REF_2);
            vis_src1(TMP4, REF_6);
            vis_src1(TMP8, REF_S2);
            vis_src1(TMP10, REF_S6);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_0, TMP0);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_0_1, TMP2);

        vis_mul8x16au(REF_2, CONST_256, TMP4);
        vis_pmerge(ZERO, REF_2_1, TMP6);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP0, CONST_6, TMP0);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP2, CONST_6, TMP2);

        vis_padd16(TMP0, TMP4, TMP0);
        vis_mul8x16au(REF_4, CONST_256, TMP4);

        vis_padd16(TMP2, TMP6, TMP2);
        vis_mul8x16au(REF_4_1, CONST_256, TMP6);

        vis_padd16(TMP12, TMP0, TMP12);
        vis_mul8x16au(REF_6, CONST_256, TMP8);

        vis_padd16(TMP14, TMP2, TMP14);
        vis_mul8x16au(REF_6_1, CONST_256, TMP10);

        vis_padd16(TMP12, TMP16, TMP12);
        vis_mul8x16au(REF_S0, CONST_256, REF_4);

        vis_padd16(TMP14, TMP18, TMP14);
        vis_mul8x16au(REF_S0_1, CONST_256, REF_6);

        vis_padd16(TMP12, TMP30, TMP12);

        vis_padd16(TMP14, TMP32, TMP14);
        vis_pack16(TMP12, DST_0);

        vis_pack16(TMP14, DST_1);
        vis_st64(DST_0, dest[0]);
        vis_padd16(TMP4, CONST_6, TMP4);

        vis_ld64_2(dest, stride, DST_0);
        vis_padd16(TMP6, CONST_6, TMP6);
        vis_mul8x16au(REF_S2, CONST_256, TMP12);

        vis_padd16(TMP4, TMP8, TMP4);
        vis_mul8x16au(REF_S2_1, CONST_256, TMP14);

        vis_padd16(TMP6, TMP10, TMP6);

        vis_padd16(TMP20, TMP4, TMP20);

        vis_padd16(TMP22, TMP6, TMP22);

        vis_padd16(TMP20, TMP24, TMP20);

        vis_padd16(TMP22, TMP26, TMP22);

        vis_padd16(TMP20, REF_0, TMP20);
        vis_mul8x16au(REF_S4, CONST_256, REF_0);

        vis_padd16(TMP22, REF_2, TMP22);
        vis_pack16(TMP20, DST_2);

        vis_pack16(TMP22, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;

        vis_ld64_2(dest, 8, DST_2);
        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4_1, REF_2);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_padd16(REF_4, TMP0, TMP8);

        vis_mul8x16au(REF_S6, CONST_256, REF_4);
        vis_padd16(REF_6, TMP2, TMP10);

        vis_mul8x16au(REF_S6_1, CONST_256, REF_6);
        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);

        vis_padd16(REF_0, TMP4, REF_0);

        vis_mul8x16al(DST_2, CONST_1024, TMP30);
        vis_padd16(REF_2, TMP6, REF_2);

        vis_mul8x16al(DST_3, CONST_1024, TMP32);
        vis_padd16(REF_0, REF_4, REF_0);

        vis_padd16(REF_2, REF_6, REF_2);

        vis_padd16(REF_0, TMP30, REF_0);

        /* stall */

        vis_padd16(REF_2, TMP32, REF_2);
        vis_pack16(REF_0, DST_2);

        vis_pack16(REF_2, DST_3);
        vis_st64_2(DST_2, dest, 8);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3863 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
/* Average (no-rounding) diagonal (x+y) half-pel motion compensation, 8 wide.
 * 4-tap 2x2 interpolation (no_rnd constant CONST_6) averaged with the current
 * dest contents (dest term scaled by CONST_1024).  Two output rows per loop
 * iteration; GSR scale factor 4 normalizes the combined sum in pack16.
 * off/off_plus_1 select aligned vs. x+1 byte-shifted views of the loads;
 * off == 0x7 requires vis_src1 (faligndata cannot shift a full 8 bytes). */
static void MC_avg_no_round_xy_8_vis (uint8_t * dest, const uint8_t * _ref,
                                      const int stride, int height)
{
    uint8_t *ref = (uint8_t *) _ref;
    unsigned long off = (unsigned long) ref & 0x7;
    unsigned long off_plus_1 = off + 1;
    int stride_8 = stride + 8;

    vis_set_gsr(4 << VIS_GSR_SCALEFACT_SHIFT);

    ref = vis_alignaddr(ref);

    /* Prologue: prime REF_S0/REF_S2 with the first source row. */
    vis_ld64(ref[0], TMP0);
    vis_fzero(ZERO);

    vis_ld64_2(ref, 8, TMP2);

    vis_ld64(constants6[0], CONST_6);

    vis_ld64(constants256_1024[0], CONST_256);
    vis_faligndata(TMP0, TMP2, REF_S0);

    if (off != 0x7) {
        vis_alignaddr_g0((void *)off_plus_1);
        vis_faligndata(TMP0, TMP2, REF_S2);
    } else {
        vis_src1(TMP2, REF_S2);
    }

    height >>= 1;               /* two rows per iteration */
    do {        /* 31 cycles */
        /* Hand-scheduled pipeline; REF_S4/REF_S6 and REF_0/REF_2 are reused
         * as scratch accumulators — do not reorder statements. */
        vis_ld64_2(ref, stride, TMP0);
        vis_mul8x16au(REF_S0, CONST_256, TMP8);
        vis_pmerge(ZERO, REF_S0_1, TMP10);

        vis_ld64_2(ref, stride_8, TMP2);
        ref += stride;
        vis_mul8x16au(REF_S2, CONST_256, TMP12);
        vis_pmerge(ZERO, REF_S2_1, TMP14);

        vis_alignaddr_g0((void *)off);

        vis_ld64_2(ref, stride, TMP4);
        vis_faligndata(TMP0, TMP2, REF_S4);

        vis_ld64_2(ref, stride_8, TMP6);
        ref += stride;

        vis_ld64(dest[0], DST_0);
        vis_faligndata(TMP4, TMP6, REF_S0);

        vis_ld64_2(dest, stride, DST_2);

        if (off != 0x7) {
            vis_alignaddr_g0((void *)off_plus_1);
            vis_faligndata(TMP0, TMP2, REF_S6);
            vis_faligndata(TMP4, TMP6, REF_S2);
        } else {
            vis_src1(TMP2, REF_S6);
            vis_src1(TMP6, REF_S2);
        }

        vis_mul8x16al(DST_0, CONST_1024, TMP30);
        vis_pmerge(ZERO, REF_S4, TMP22);

        vis_mul8x16al(DST_1, CONST_1024, TMP32);
        vis_pmerge(ZERO, REF_S4_1, TMP24);

        vis_mul8x16au(REF_S6, CONST_256, TMP26);
        vis_pmerge(ZERO, REF_S6_1, TMP28);

        vis_mul8x16au(REF_S0, CONST_256, REF_S4);
        vis_padd16(TMP22, CONST_6, TMP22);

        vis_mul8x16au(REF_S0_1, CONST_256, REF_S6);
        vis_padd16(TMP24, CONST_6, TMP24);

        vis_mul8x16al(DST_2, CONST_1024, REF_0);
        vis_padd16(TMP22, TMP26, TMP22);

        vis_mul8x16al(DST_3, CONST_1024, REF_2);
        vis_padd16(TMP24, TMP28, TMP24);

        vis_mul8x16au(REF_S2, CONST_256, TMP26);
        vis_padd16(TMP8, TMP22, TMP8);

        vis_mul8x16au(REF_S2_1, CONST_256, TMP28);
        vis_padd16(TMP10, TMP24, TMP10);

        vis_padd16(TMP8, TMP12, TMP8);

        vis_padd16(TMP10, TMP14, TMP10);

        vis_padd16(TMP8, TMP30, TMP8);

        vis_padd16(TMP10, TMP32, TMP10);
        vis_pack16(TMP8, DST_0);

        vis_pack16(TMP10, DST_1);
        vis_st64(DST_0, dest[0]);
        dest += stride;

        vis_padd16(REF_S4, TMP22, TMP12);

        vis_padd16(REF_S6, TMP24, TMP14);

        vis_padd16(TMP12, TMP26, TMP12);

        vis_padd16(TMP14, TMP28, TMP14);

        vis_padd16(TMP12, REF_0, TMP12);

        vis_padd16(TMP14, REF_2, TMP14);
        vis_pack16(TMP12, DST_2);

        vis_pack16(TMP14, DST_3);
        vis_st64(DST_2, dest[0]);
        dest += stride;
    } while (--height);
}
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3984 |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3985 /* End of no rounding code */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3986 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
/* Acceleration feature bits reported by vis_level(). */
#define ACCEL_SPARC_VIS 1
#define ACCEL_SPARC_VIS2 2

/* Report the available VIS acceleration level.
 * Both VIS and VIS2 bits are set unconditionally — there is no runtime
 * CPU probing here; configure-time selection is assumed. */
static int vis_level(void)
{
    return ACCEL_SPARC_VIS | ACCEL_SPARC_VIS2;
}
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
3997 |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3998 /* libavcodec initialization code */ |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
3999 void dsputil_init_vis(DSPContext* c, AVCodecContext *avctx) |
|
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
4000 { |
|
5967
15ed47af1838
Misc spelling fixes, prefer American over British English.
diego
parents:
5963
diff
changeset
|
4001 /* VIS-specific optimizations */ |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4002 int accel = vis_level (); |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4003 |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4004 if (accel & ACCEL_SPARC_VIS) { |
| 5618 | 4005 if(avctx->idct_algo==FF_IDCT_SIMPLEVIS){ |
| 4006 c->idct_put = ff_simple_idct_put_vis; | |
| 4007 c->idct_add = ff_simple_idct_add_vis; | |
| 4008 c->idct = ff_simple_idct_vis; | |
| 4009 c->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; | |
| 4010 } | |
| 4011 | |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4012 c->put_pixels_tab[0][0] = MC_put_o_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4013 c->put_pixels_tab[0][1] = MC_put_x_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4014 c->put_pixels_tab[0][2] = MC_put_y_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4015 c->put_pixels_tab[0][3] = MC_put_xy_16_vis; |
| 2967 | 4016 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4017 c->put_pixels_tab[1][0] = MC_put_o_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4018 c->put_pixels_tab[1][1] = MC_put_x_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4019 c->put_pixels_tab[1][2] = MC_put_y_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4020 c->put_pixels_tab[1][3] = MC_put_xy_8_vis; |
| 2967 | 4021 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4022 c->avg_pixels_tab[0][0] = MC_avg_o_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4023 c->avg_pixels_tab[0][1] = MC_avg_x_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4024 c->avg_pixels_tab[0][2] = MC_avg_y_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4025 c->avg_pixels_tab[0][3] = MC_avg_xy_16_vis; |
| 2967 | 4026 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4027 c->avg_pixels_tab[1][0] = MC_avg_o_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4028 c->avg_pixels_tab[1][1] = MC_avg_x_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4029 c->avg_pixels_tab[1][2] = MC_avg_y_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4030 c->avg_pixels_tab[1][3] = MC_avg_xy_8_vis; |
| 2967 | 4031 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4032 c->put_no_rnd_pixels_tab[0][0] = MC_put_no_round_o_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4033 c->put_no_rnd_pixels_tab[0][1] = MC_put_no_round_x_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4034 c->put_no_rnd_pixels_tab[0][2] = MC_put_no_round_y_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4035 c->put_no_rnd_pixels_tab[0][3] = MC_put_no_round_xy_16_vis; |
| 2967 | 4036 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4037 c->put_no_rnd_pixels_tab[1][0] = MC_put_no_round_o_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4038 c->put_no_rnd_pixels_tab[1][1] = MC_put_no_round_x_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4039 c->put_no_rnd_pixels_tab[1][2] = MC_put_no_round_y_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4040 c->put_no_rnd_pixels_tab[1][3] = MC_put_no_round_xy_8_vis; |
| 2967 | 4041 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4042 c->avg_no_rnd_pixels_tab[0][0] = MC_avg_no_round_o_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4043 c->avg_no_rnd_pixels_tab[0][1] = MC_avg_no_round_x_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4044 c->avg_no_rnd_pixels_tab[0][2] = MC_avg_no_round_y_16_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4045 c->avg_no_rnd_pixels_tab[0][3] = MC_avg_no_round_xy_16_vis; |
| 2967 | 4046 |
|
1966
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4047 c->avg_no_rnd_pixels_tab[1][0] = MC_avg_no_round_o_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4048 c->avg_no_rnd_pixels_tab[1][1] = MC_avg_no_round_x_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4049 c->avg_no_rnd_pixels_tab[1][2] = MC_avg_no_round_y_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4050 c->avg_no_rnd_pixels_tab[1][3] = MC_avg_no_round_xy_8_vis; |
|
e1fc7c598558
License change and cpu detection patch by (James Morrison <ja2morri at csclub dot uwaterloo dot ca>)
michael
parents:
1959
diff
changeset
|
4051 } |
|
1959
55b7435c59b8
VIS optimized motion compensation code. by (David S. Miller <davem at redhat dot com>)
michael
parents:
diff
changeset
|
4052 } |
