ppc/snow_altivec.c @ 4057:ba767c63a07f (libavcodec.hg)

changeset summary: remove unused variables

| field | value |
|---|---|
| author | bcoudurier |
| date | Sun, 22 Oct 2006 15:15:15 +0000 |
| parents | c8c591fe26f8 |
| children | d5ba514e3f4a |
/*
 * Altivec optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#include "../dsputil.h"

#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "../snow.h"

#undef NDEBUG
#include <assert.h>


//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))

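/* Local copy of the slice-buffer helper duplicated from snow.c (hence the
 * FIXME above): lines are allocated lazily; on a miss a free buffer is popped
 * off the data stack and cached in buf->line[line] for later lookups. */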
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}


//altivec code

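/* Inverse horizontal lifting (compose) of Snow's integer 9/7-style wavelet.
 * The low band sits in b[0..w2), the high band in b[w2..width); four lifting
 * passes (their scalar forms are kept in the #if 0 blocks) update them in
 * place, and a final interleave step restores natural coefficient order.
 * The vector loops process four 32-bit coefficients per vector, using
 * vec_lvsl/vec_perm to realign the unaligned neighbour stream. */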
void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
{
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed int t1, t2, x, y, tmp1, tmp2;
    vector signed int *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf = (vector signed int *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;

        for (i=0; i<w_l-15; i+=16) {
#if 0
            b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
            b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
            b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
            b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+8+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+12+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            tmp1 = vec_ld (0, ref+16+i);

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);

            vbuf++;

            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            *vbuf = vec_sub(*vbuf, y);

            t1=t2;

            vbuf++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }

    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);

        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {
#if 0
            dst[i] = dst[i] - (b[i] + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else
            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub (*vbuf, y);

            vbuf++;

            t1 = t2;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }

    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
        for (; i<w_l-15; i+=16) {
#if 0
            b[i] = b[i] - (((8 -(ref[i] + ref[i+1])) - (b[i] <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+8+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+8+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+12+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+12+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            tmp1 = vec_ld (0, ref+16+i);

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));

            tmp2 = vec_ld (15, ref+16+i);

            *vbuf = vec_sub( *vbuf, y);

            t1 = t2;

            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);

            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);

            t1 = t2;

            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);

            vbuf++;
#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }

    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i] = src[i] - ((-3*(b[i] + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }

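    /* Interleave: merge the updated low band in b[] (even samples) with the
     * high band in temp[] (odd samples) back into b in natural order.  The
     * vector loop walks backwards in blocks of 16 coefficients so that the
     * merge can be done in place with vec_mergeh/vec_mergel. */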
    {
        //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            a=i/4;

            v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
            v[a]=vec_mergeh(v[a>>1],t[a>>1]);
        }
    }
}

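/* Inverse vertical lifting: applies the same four lifting steps across six
 * consecutive lines b0..b5 at one vertical position.  The code assumes the
 * line pointers are 16-byte aligned (no vec_perm realignment is done); the
 * scalar loop after the vector loop handles the last width%4 columns. */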
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1, *v2, *v3, *v4, *v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4;i++)
    {
#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));
        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
#endif
    }

    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}

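/* OBMC helpers for inner_add_yblock: LOAD_BLOCKS fetches one row of each of
 * the four predicted blocks (with vec_lvsl/vec_perm alignment fixup) and
 * LOAD_OBMCS the matching rows of the four OBMC window quadrants.  STEPS_0_1
 * (and STEPS_2_3 further down) interleave weights and pixels so that a single
 * vec_msum per group of four pixels accumulates
 *     obmc1*block[3] + obmc2*block[2] + obmc3*block[1] + obmc4*block[0]
 * into the 32-bit sums held in v[]. */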
#define LOAD_BLOCKS \
    tmp1 = vec_ld(0, &block[3][y*src_stride]);\
    align = vec_lvsl(0, &block[3][y*src_stride]);\
    tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
    b3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[2][y*src_stride]);\
    align = vec_lvsl(0, &block[2][y*src_stride]);\
    tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
    b2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[1][y*src_stride]);\
    align = vec_lvsl(0, &block[1][y*src_stride]);\
    tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
    b1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[0][y*src_stride]);\
    align = vec_lvsl(0, &block[0][y*src_stride]);\
    tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
    b0 = vec_perm(tmp1,tmp2,align);

#define LOAD_OBMCS \
    tmp1 = vec_ld(0, obmc1);\
    align = vec_lvsl(0, obmc1);\
    tmp2 = vec_ld(15, obmc1);\
\
    ob1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc2);\
    align = vec_lvsl(0, obmc2);\
    tmp2 = vec_ld(15, obmc2);\
\
    ob2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc3);\
    align = vec_lvsl(0, obmc3);\
    tmp2 = vec_ld(15, obmc3);\
\
    ob3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc4);\
    align = vec_lvsl(0, obmc4);\
    tmp2 = vec_ld(15, obmc4);\
\
    ob4 = vec_perm(tmp1,tmp2,align);

/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */

#define STEPS_0_1\
    h1 = (vector unsigned short)\
         vec_mergeh(ob1, ob2);\
\
    h2 = (vector unsigned short)\
         vec_mergeh(ob3, ob4);\
\
    ih = (vector unsigned char)\
         vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
    l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
    il = (vector unsigned char) vec_mergeh(l1, l2);\
\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

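/* FINAL_STEP_SCALAR: once the weighted sums for one row are in vbuf[], either
 * add them to the corresponding slice-buffer line, round by FRAC_BITS, clamp
 * to 0..255 and store the row to dst8 (add != 0), or subtract the prediction
 * from the slice buffer (add == 0). */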
#define FINAL_STEP_SCALAR\
    for(x=0; x<b_w; x++)\
        if(add){\
            vbuf[x] += dst[x + src_x];\
            vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
            if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
            dst8[x + y*src_stride] = vbuf[x];\
        }else{\
            dst[x + src_x] -= vbuf[x];\
        }

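/* b_w == 8 with a 16-wide OBMC window: one row needs only STEPS_0_1 (eight
 * sums in v[0..1]); the final combine is scalar, so this variant works for
 * any src_x alignment. */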
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME I could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR

    }

}

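/* STEPS_2_3 mirrors STEPS_0_1 but uses vec_mergel to pick up the upper eight
 * weights/pixels of each row, producing the sums for v[2] and v[3] (only
 * needed by the b_w == 16 variants). */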
#define STEPS_2_3\
    h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
    h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
    ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
    l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
    il = (vector unsigned char) vec_mergeh(l1,l2);\
\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));

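/* b_w == 16 with a 32-wide OBMC window: same as the 8-wide variant above, but
 * runs STEPS_0_1 and STEPS_2_3 to produce all sixteen sums per row before the
 * scalar final step. */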
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_SCALAR

    }
}

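/* FINAL_STEP_VEC: AltiVec replacement for FINAL_STEP_SCALAR, used by the
 * "_a_" variants when dst + src_x is 16-byte aligned.  For add != 0 it adds
 * d[] to v[], rounds (the shift amount is hard-coded to 8 here rather than
 * taken from FRAC_BITS) and applies a mask/select intended to mirror the
 * scalar clamp "if(v&(~255)) v= ~(v>>31)" before the bytes are stored through
 * vbuf; of the three consecutive assignments to vs only the last (>>15) takes
 * effect.  For add == 0 the prediction is simply subtracted from d[]. */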
#define FINAL_STEP_VEC \
\
    if(add)\
    {\
        for(x=0; x<b_w/4; x++)\
        {\
            v[x] = vec_add(v[x], d[x]);\
            v[x] = vec_sra(vec_add(v[x],\
                                   vec_sl( vec_splat_s32(1),\
                                           vec_splat_u32(7))),\
                           vec_splat_u32(8));\
\
            mask = (vector bool int) vec_sl((vector signed int)\
                    vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
            mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
\
            mask = (vector bool int)\
                    vec_cmpeq((vector signed int)mask,\
                              (vector signed int)vec_splat_u32(0));\
\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(v[x],vec_splat_u32(15));\
\
            vs = vec_nor(vs,vs);\
\
            v[x]= vec_sel(v[x],vs,mask);\
        }\
\
        for(x=0; x<b_w; x++)\
            dst8[x + y*src_stride] = vbuf[x];\
\
    }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);

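/* Aligned ("_a_") variants: identical to the two functions above except that
 * the per-row combine uses FINAL_STEP_VEC, which requires dst + src_x to be
 * 16-byte aligned.  First the 8-wide case, then the 16-wide one. */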
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                    const int obmc_stride,
                                                    uint8_t * * block, int b_w,
                                                    int b_h, int src_x, int src_y,
                                                    int src_stride, slice_buffer * sb,
                                                    int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        //FIXME I could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC

    }

}

static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                     const int obmc_stride,
                                                     uint8_t * * block, int b_w,
                                                     int b_h, int src_x, int src_y,
                                                     int src_stride, slice_buffer * sb,
                                                     int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;
    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride

        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1

        STEPS_2_3

        FINAL_STEP_VEC

    }
}

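/* Entry point registered in the DSPContext: dispatch on the alignment of
 * src_x (only the aligned "_a_" variants may use FINAL_STEP_VEC) and on the
 * block width; anything other than b_w == 8 or 16 falls back to the C
 * ff_snow_inner_add_yblock(). */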
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (src_x&15) {
        if (b_w == 16)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    } else {
        if (b_w == 16)
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    }
}

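/* Install the AltiVec Snow routines into the DSPContext. */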
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
}
