libavcodec.hg: comparison of libpostproc/postprocess_altivec_template.c @ 2041:b996fbe0a7e7
Newer version, using a vectorized version of the new organisation of the code in doVertLowPass; it seems to be faster with AltiVec as well.
Also includes a compile fix for the new do_a_deblock when using AltiVec.
Patch by Romain Dolbeau <dolbeau at irisa dot fr>
| author | michael |
|---|---|
| date | Fri, 28 May 2004 13:31:38 +0000 |
| parents | 6a6c678517b3 |
| children | 703b80c99891 |
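
The reorganisation the commit message refers to replaces, for each output line of doVertLowPass, a tree of freshly computed pairwise sums with a single running window sum that is updated as the filter slides down the column (the `v_sumsB*` / `COMPUTE_VR` code in the hunk below). A minimal scalar sketch of that structure, with `b[0..9]` standing for the ten input lines of one column (`b[0]` and `b[9]` are `v_first`/`v_last`, already clipped against QP); the helper name is illustrative, not from the file:

```c
/* Scalar sketch of the reorganised doVertLowPass column filter:
 * seed one window sum, then slide it (one value leaves, one enters)
 * instead of rebuilding the pairwise sums for every output line. */
void vert_lowpass_column(const int b[10], int out[8])
{
    static const int leave[9] = { 0, 0, 0, 0, 1, 2, 3, 4, 5 };
    static const int enter[9] = { 4, 5, 6, 7, 8, 9, 9, 9, 9 };
    int sums[10];

    sums[0] = 4 * b[0] + b[1] + b[2] + b[3] + 4;   /* includes the rounding bias */
    for (int i = 1; i < 10; i++)                   /* the temp*1 / v_sumsB* chain */
        sums[i] = sums[i - 1] - b[leave[i - 1]] + b[enter[i - 1]];

    for (int j = 1; j <= 8; j++)                   /* COMPUTE_VR(j-1, j, j+1) */
        out[j - 1] = (sums[j - 1] + sums[j + 1] + 2 * b[j]) >> 4;
}
```

In the vector code the `2 * b[j]` term is folded in with `vec_mladd` and the final `>> 4` is `vec_sra(..., v_4)`.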
| 2040:5de466b3360e | 2041:b996fbe0a7e7 |
|---|---|
| 22 #ifdef CONFIG_DARWIN | 22 #ifdef CONFIG_DARWIN |
| 23 #define AVV(x...) (x) | 23 #define AVV(x...) (x) |
| 24 #else | 24 #else |
| 25 #define AVV(x...) {x} | 25 #define AVV(x...) {x} |
| 26 #endif | 26 #endif |
| 27 | |
| 28 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \ | |
| 29 do { \ | |
| 30 __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \ | |
| 31 __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \ | |
| 32 __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \ | |
| 33 __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \ | |
| 34 tempA1 = vec_mergeh (src_a, src_e); \ | |
| 35 tempB1 = vec_mergel (src_a, src_e); \ | |
| 36 tempC1 = vec_mergeh (src_b, src_f); \ | |
| 37 tempD1 = vec_mergel (src_b, src_f); \ | |
| 38 tempE1 = vec_mergeh (src_c, src_g); \ | |
| 39 tempF1 = vec_mergel (src_c, src_g); \ | |
| 40 tempG1 = vec_mergeh (src_d, src_h); \ | |
| 41 tempH1 = vec_mergel (src_d, src_h); \ | |
| 42 tempA2 = vec_mergeh (tempA1, tempE1); \ | |
| 43 tempB2 = vec_mergel (tempA1, tempE1); \ | |
| 44 tempC2 = vec_mergeh (tempB1, tempF1); \ | |
| 45 tempD2 = vec_mergel (tempB1, tempF1); \ | |
| 46 tempE2 = vec_mergeh (tempC1, tempG1); \ | |
| 47 tempF2 = vec_mergel (tempC1, tempG1); \ | |
| 48 tempG2 = vec_mergeh (tempD1, tempH1); \ | |
| 49 tempH2 = vec_mergel (tempD1, tempH1); \ | |
| 50 src_a = vec_mergeh (tempA2, tempE2); \ | |
| 51 src_b = vec_mergel (tempA2, tempE2); \ | |
| 52 src_c = vec_mergeh (tempB2, tempF2); \ | |
| 53 src_d = vec_mergel (tempB2, tempF2); \ | |
| 54 src_e = vec_mergeh (tempC2, tempG2); \ | |
| 55 src_f = vec_mergel (tempC2, tempG2); \ | |
| 56 src_g = vec_mergeh (tempD2, tempH2); \ | |
| 57 src_h = vec_mergel (tempD2, tempH2); \ | |
| 58 } while (0) | |
| 59 | |
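
The new ALTIVEC_TRANSPOSE_8x8_SHORT macro above transposes eight vectors of eight shorts in three rounds of vec_mergeh/vec_mergel: each round interleaves row k with row k+4, and after log2(8) = 3 rounds every element sits at its transposed position. A minimal scalar model of the same pairing (helper names are illustrative, not from the file):

```c
#include <stdio.h>

/* Scalar model of ALTIVEC_TRANSPOSE_8x8_SHORT: vec_mergeh/vec_mergel
 * interleave the low/high halves of two rows; three rounds of the
 * same row pairing transpose the 8x8 block. */
static void mergeh(const short a[8], const short b[8], short r[8])
{
    for (int i = 0; i < 4; i++) { r[2*i] = a[i];   r[2*i+1] = b[i];   }
}
static void mergel(const short a[8], const short b[8], short r[8])
{
    for (int i = 0; i < 4; i++) { r[2*i] = a[4+i]; r[2*i+1] = b[4+i]; }
}
static void merge_round(short in[8][8], short out[8][8])
{
    for (int k = 0; k < 4; k++) {            /* pair row k with row k+4 */
        mergeh(in[k], in[k + 4], out[2*k]);
        mergel(in[k], in[k + 4], out[2*k + 1]);
    }
}

int main(void)
{
    short m[8][8], t[8][8];
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            m[i][j] = (short)(8 * i + j);
    merge_round(m, t);    /* round 1: the temp*1 values */
    merge_round(t, m);    /* round 2: the temp*2 values */
    merge_round(m, t);    /* round 3: back into src_a..src_h */
    printf("t[3][5] = %d (expect 43 = original m[5][3])\n", t[3][5]);
    return 0;
}
```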
| 27 | 60 |
| 28 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) { | 61 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) { |
| 29 /* | 62 /* |
| 30 this code makes no assumption on src or stride. | 63 this code makes no assumption on src or stride. |
| 31 One could remove the recomputation of the perm | 64 One could remove the recomputation of the perm |
| 131 return 1; | 164 return 1; |
| 132 } | 165 } |
| 133 else return 2; | 166 else return 2; |
| 134 } | 167 } |
| 135 | 168 |
| 169 /* this is the same as vertClassify_altivec, | |
| 170 with an added 8x8 transpose after the loading, | |
| 171 and w/o the stride*4 offset */ | |
| 172 static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) { | |
| 173 /* | |
| 174 this code makes no assumption on src or stride. | |
| 175 One could remove the recomputation of the perm | |
| 176 vector by assuming (stride % 16) == 0, unfortunately | |
| 177 this is not always true. | |
| 178 */ | |
| 179 register int y; | |
| 180 short __attribute__ ((aligned(16))) data[8]; | |
| 181 int numEq; | |
| 182 uint8_t *src2 = src; | |
| 183 vector signed short v_dcOffset; | |
| 184 vector signed short v2QP; | |
| 185 vector unsigned short v4QP; | |
| 186 vector unsigned short v_dcThreshold; | |
| 187 int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0; | |
| 188 const vector signed int zero = vec_splat_s32(0); | |
| 189 const vector signed short mask = vec_splat_s16(1); | |
| 190 vector signed int v_numEq = vec_splat_s32(0); | |
| 191 | |
| 192 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; | |
| 193 data[1] = data[0] * 2 + 1; | |
| 194 data[2] = c->QP * 2; | |
| 195 data[3] = c->QP * 4; | |
| 196 vector signed short v_data = vec_ld(0, data); | |
| 197 v_dcOffset = vec_splat(v_data, 0); | |
| 198 v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1); | |
| 199 v2QP = vec_splat(v_data, 2); | |
| 200 v4QP = (vector unsigned short)vec_splat(v_data, 3); | |
| 201 | |
| 202 // src2 += stride * 4; | |
| 203 | |
| 204 #define LOAD_LINE(i) \ | |
| 205 register int j##i = i * stride; \ | |
| 206 vector unsigned char perm##i = vec_lvsl(j##i, src2); \ | |
| 207 const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \ | |
| 208 vector unsigned char v_srcA2##i; \ | |
| 209 if (two_vectors) \ | |
| 210 v_srcA2##i = vec_ld(j##i + 16, src2); \ | |
| 211 const vector unsigned char v_srcA##i = \ | |
| 212 vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \ | |
| 213 vector signed short v_srcAss##i = \ | |
| 214 (vector signed short)vec_mergeh((vector signed char)zero, \ | |
| 215 (vector signed char)v_srcA##i) | |
| 216 | |
| 217 LOAD_LINE(0); | |
| 218 LOAD_LINE(1); | |
| 219 LOAD_LINE(2); | |
| 220 LOAD_LINE(3); | |
| 221 LOAD_LINE(4); | |
| 222 LOAD_LINE(5); | |
| 223 LOAD_LINE(6); | |
| 224 LOAD_LINE(7); | |
| 225 #undef LOAD_LINE | |
| 226 | |
| 227 ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0, | |
| 228 v_srcAss1, | |
| 229 v_srcAss2, | |
| 230 v_srcAss3, | |
| 231 v_srcAss4, | |
| 232 v_srcAss5, | |
| 233 v_srcAss6, | |
| 234 v_srcAss7); | |
| 235 | |
| 236 #define ITER(i, j) \ | |
| 237 const vector signed short v_diff##i = \ | |
| 238 vec_sub(v_srcAss##i, v_srcAss##j); \ | |
| 239 const vector signed short v_sum##i = \ | |
| 240 vec_add(v_diff##i, v_dcOffset); \ | |
| 241 const vector signed short v_comp##i = \ | |
| 242 (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \ | |
| 243 v_dcThreshold); \ | |
| 244 const vector signed short v_part##i = vec_and(mask, v_comp##i); \ | |
| 245 v_numEq = vec_sum4s(v_part##i, v_numEq); | |
| 246 | |
| 247 ITER(0, 1); | |
| 248 ITER(1, 2); | |
| 249 ITER(2, 3); | |
| 250 ITER(3, 4); | |
| 251 ITER(4, 5); | |
| 252 ITER(5, 6); | |
| 253 ITER(6, 7); | |
| 254 #undef ITER | |
| 255 | |
| 256 v_numEq = vec_sums(v_numEq, zero); | |
| 257 | |
| 258 v_numEq = vec_splat(v_numEq, 3); | |
| 259 vec_ste(v_numEq, 0, &numEq); | |
| 260 | |
| 261 if (numEq > c->ppMode.flatnessThreshold) | |
| 262 { | |
| 263 const vector unsigned char mmoP1 = (const vector unsigned char) | |
| 264 AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, | |
| 265 0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B); | |
| 266 const vector unsigned char mmoP2 = (const vector unsigned char) | |
| 267 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, | |
| 268 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); | |
| 269 const vector unsigned char mmoP = (const vector unsigned char) | |
| 270 vec_lvsl(8, (unsigned char*)0); | |
| 271 | |
| 272 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); | |
| 273 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); | |
| 274 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); | |
| 275 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); | |
| 276 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2); | |
| 277 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); | |
| 278 vector signed short mmoDiff = vec_sub(mmoL, mmoR); | |
| 279 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); | |
| 280 | |
| 281 if (vec_any_gt(mmoSum, v4QP)) | |
| 282 return 0; | |
| 283 else | |
| 284 return 1; | |
| 285 } | |
| 286 else return 2; | |
| 287 } | |
| 288 | |
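
The LOAD_LINE macro above fetches each possibly unaligned line by loading the aligned 16-byte block(s) that cover it and permuting with the vec_lvsl shift vector; the two_vectors test ((src % 16) > 8, or stride not a multiple of 16) decides up front whether a second vec_ld can ever be needed, so the single-block case skips it. A scalar model of that idiom (hypothetical helper; plain memcpy stands in for the vector permute):

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of LOAD_LINE: fetch 8 possibly unaligned bytes via the
 * aligned 16-byte block(s) covering them, as vec_lvsl + vec_ld +
 * vec_perm do.  The second load is issued only when the line can
 * straddle a block boundary. */
static void load_line(const uint8_t *src, uint8_t out[8])
{
    const uint8_t *blockA = (const uint8_t *)((uintptr_t)src & ~(uintptr_t)15);
    size_t off = (size_t)(src - blockA);     /* what vec_lvsl encodes */
    uint8_t cover[32];
    int two_vectors = off > 8;               /* (src % 16) > 8 */

    memcpy(cover, blockA, 16);               /* v_srcA1: always loaded */
    if (two_vectors)
        memcpy(cover + 16, blockA + 16, 16); /* v_srcA2: only if needed */
    memcpy(out, cover + off, 8);             /* vec_perm(v_srcA1, v_srcA2, perm) */
}
```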
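
After the transpose, horizClassify_altivec proceeds exactly like the vertical classifier: the ITER chain counts near-equal neighbour pairs (vec_sum4s and vec_sums fold the per-lane flags into numEq), and flat blocks get a second edge test against 2*QP/4*QP on the differences gathered by the mmo* permutes. A scalar sketch of the two-stage decision (hypothetical helper; which elements mmoP1/mmoP2 actually gather is not reproduced here):

```c
/* Scalar sketch of the two-stage classification.  diffs[] are the
 * neighbour differences along one column; mmoDiff[] are the edge
 * differences selected by the mmoP1/mmoP2 permutes.
 * Returns: 2 = not flat (default filter), 1 = flat (low-pass), 0 = skip. */
static int classify_sketch(const short diffs[7], const short mmoDiff[8],
                           int dcOffset, int dcThreshold,
                           int flatnessThreshold, int QP)
{
    int numEq = 0;
    for (int i = 0; i < 7; i++)              /* ITER(i, i+1) */
        if ((unsigned short)(diffs[i] + dcOffset) < (unsigned short)dcThreshold)
            numEq++;
    if (numEq <= flatnessThreshold)
        return 2;
    for (int i = 0; i < 8; i++)              /* mmoSum vs. v4QP */
        if ((unsigned short)(mmoDiff[i] + 2 * QP) > (unsigned short)(4 * QP))
            return 0;                        /* an edge too strong to smooth */
    return 1;
}
```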
| 136 | 289 |
| 137 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { | 290 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { |
| 138 /* | 291 /* |
| 139 this code makes no assumption on src or stride. | 292 this code makes no assumption on src or stride. |
| 140 One could remove the recomputation of the perm | 293 One could remove the recomputation of the perm |
| 178 #undef LOAD_LINE | 331 #undef LOAD_LINE |
| 179 | 332 |
| 180 const vector unsigned short v_1 = vec_splat_u16(1); | 333 const vector unsigned short v_1 = vec_splat_u16(1); |
| 181 const vector unsigned short v_2 = vec_splat_u16(2); | 334 const vector unsigned short v_2 = vec_splat_u16(2); |
| 182 const vector unsigned short v_4 = vec_splat_u16(4); | 335 const vector unsigned short v_4 = vec_splat_u16(4); |
| 183 const vector signed short v_8 = vec_splat_s16(8); | 336 |
| 184 | 337 const vector signed short v_diff01 = vec_sub(vb0, vb1); |
| 185 const vector signed short v_first = vec_sel(vb1, vb0, | 338 const vector unsigned short v_cmp01 = |
| 186 vec_cmplt(vec_abs(vec_sub(vb0, vb1)), | 339 (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp); |
| 187 vqp)); | 340 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01); |
| 188 const vector signed short v_last = vec_sel(vb8, vb9, | 341 const vector signed short v_diff89 = vec_sub(vb8, vb9); |
| 189 vec_cmplt(vec_abs(vec_sub(vb8, vb9)), | 342 const vector unsigned short v_cmp89 = |
| 190 vqp)); | 343 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp); |
| 191 | 344 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89); |
| 192 const vector signed short v_sums0 = vec_add(v_first, vb1); | 345 |
| 193 const vector signed short v_sums1 = vec_add(vb1, vb2); | 346 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1); |
| 194 const vector signed short v_sums2 = vec_add(vb2, vb3); | 347 const vector signed short temp02 = vec_add(vb2, vb3); |
| 195 const vector signed short v_sums3 = vec_add(vb3, vb4); | 348 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4); |
| 196 const vector signed short v_sums4 = vec_add(vb4, vb5); | 349 const vector signed short v_sumsB0 = vec_add(temp02, temp03); |
| 197 const vector signed short v_sums5 = vec_add(vb5, vb6); | 350 |
| 198 const vector signed short v_sums6 = vec_add(vb6, vb7); | 351 const vector signed short temp11 = vec_sub(v_sumsB0, v_first); |
| 199 const vector signed short v_sums7 = vec_add(vb7, vb8); | 352 const vector signed short v_sumsB1 = vec_add(temp11, vb4); |
| 200 const vector signed short v_sums8 = vec_add(vb8, v_last); | 353 |
| 201 | 354 const vector signed short temp21 = vec_sub(v_sumsB1, v_first); |
| 202 const vector signed short vr1 = vec_sra(vec_add(vec_add(vec_sl(v_sums0, v_2), | 355 const vector signed short v_sumsB2 = vec_add(temp21, vb5); |
| 203 vec_sl(vec_add(v_first, v_sums2), v_1)), | 356 |
| 204 vec_add(v_sums4, v_8)), | 357 const vector signed short temp31 = vec_sub(v_sumsB2, v_first); |
| 205 v_4); | 358 const vector signed short v_sumsB3 = vec_add(temp31, vb6); |
| 206 const vector signed short vr2 = vec_sra(vec_add(vec_add(vec_sl(vb2, v_2), | 359 |
| 207 v_sums5), | 360 const vector signed short temp41 = vec_sub(v_sumsB3, v_first); |
| 208 vec_add(v_8, | 361 const vector signed short v_sumsB4 = vec_add(temp41, vb7); |
| 209 vec_sl(vec_add(v_first, | 362 |
| 210 vec_add(v_sums0, v_sums3)), | 363 const vector signed short temp51 = vec_sub(v_sumsB4, vb1); |
| 211 v_1))), | 364 const vector signed short v_sumsB5 = vec_add(temp51, vb8); |
| 212 v_4); | 365 |
| 213 const vector signed short vr3 = vec_sra(vec_add(vec_add(vec_sl(vb3, v_2), | 366 const vector signed short temp61 = vec_sub(v_sumsB5, vb2); |
| 214 v_sums6), | 367 const vector signed short v_sumsB6 = vec_add(temp61, v_last); |
| 215 vec_add(v_8, | 368 |
| 216 vec_sl(vec_add(v_first, | 369 const vector signed short temp71 = vec_sub(v_sumsB6, vb3); |
| 217 vec_add(v_sums1, v_sums4)), | 370 const vector signed short v_sumsB7 = vec_add(temp71, v_last); |
| 218 v_1))), | 371 |
| 219 v_4); | 372 const vector signed short temp81 = vec_sub(v_sumsB7, vb4); |
| 220 const vector signed short vr4 = vec_sra(vec_add(vec_add(vec_sl(vb4, v_2), | 373 const vector signed short v_sumsB8 = vec_add(temp81, v_last); |
| 221 v_sums7), | 374 |
| 222 vec_add(v_8, | 375 const vector signed short temp91 = vec_sub(v_sumsB8, vb5); |
| 223 vec_add(v_sums0, | 376 const vector signed short v_sumsB9 = vec_add(temp91, v_last); |
| 224 vec_sl(vec_add(v_sums2, v_sums5), | 377 |
| 225 v_1)))), | 378 #define COMPUTE_VR(i, j, k) \ |
| 226 v_4); | 379 const vector signed short temps1##i = \ |
| 227 const vector signed short vr5 = vec_sra(vec_add(vec_add(vec_sl(vb5, v_2), | 380 vec_add(v_sumsB##i, v_sumsB##k); \ |
| 228 v_sums8), | 381 const vector signed short temps2##i = \ |
| 229 vec_add(v_8, | 382 vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \ |
| 230 vec_add(v_sums1, | 383 const vector signed short vr##j = vec_sra(temps2##i, v_4) |
| 231 vec_sl(vec_add(v_sums3, v_sums6), | 384 |
| 232 v_1)))), | 385 COMPUTE_VR(0, 1, 2); |
| 233 v_4); | 386 COMPUTE_VR(1, 2, 3); |
| 234 const vector signed short vr6 = vec_sra(vec_add(vec_add(vec_sl(vb6, v_2), | 387 COMPUTE_VR(2, 3, 4); |
| 235 v_sums2), | 388 COMPUTE_VR(3, 4, 5); |
| 236 vec_add(v_8, | 389 COMPUTE_VR(4, 5, 6); |
| 237 vec_sl(vec_add(v_last, | 390 COMPUTE_VR(5, 6, 7); |
| 238 vec_add(v_sums7, v_sums4)), | 391 COMPUTE_VR(6, 7, 8); |
| 239 v_1))), | 392 COMPUTE_VR(7, 8, 9); |
| 240 v_4); | 393 |
| 241 const vector signed short vr7 = vec_sra(vec_add(vec_add(vec_sl(vec_add(v_last, vb7), v_2), | 394 const vector signed char neg1 = vec_splat_s8(-1); |
| 242 vec_sl(vec_add(vb8, v_sums5), v_1)), | 395 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 243 vec_add(v_8, v_sums3)), | 396 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 244 v_4); | |
| 245 const vector signed short vr8 = vec_sra(vec_add(vec_add(vec_sl(v_sums8, v_2), | |
| 246 vec_sl(vec_add(v_last, v_sums6), v_1)), | |
| 247 vec_add(v_sums4, v_8)), | |
| 248 v_4); | |
| 249 | |
| 250 const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1, | |
| 251 -1, -1, -1, -1, -1, -1, -1, -1); | |
| 252 const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 253 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
| 254 | 397 |
| 255 #define PACK_AND_STORE(i) \ | 398 #define PACK_AND_STORE(i) \ |
| 256 const vector unsigned char perms##i = \ | 399 const vector unsigned char perms##i = \ |
| 257 vec_lvsr(i * stride, src2); \ | 400 vec_lvsr(i * stride, src2); \ |
| 258 const vector unsigned char vf##i = \ | 401 const vector unsigned char vf##i = \ |
| 259 vec_packsu(vr##i, (vector signed short)zero); \ | 402 vec_packsu(vr##i, (vector signed short)zero); \ |
| 260 const vector unsigned char vg##i = \ | 403 const vector unsigned char vg##i = \ |
| 261 vec_perm(vf##i, vbT##i, permHH); \ | 404 vec_perm(vf##i, vbT##i, permHH); \ |
| 262 const vector unsigned char mask##i = \ | 405 const vector unsigned char mask##i = \ |
| 263 vec_perm((vector unsigned char)zero, neg1, perms##i); \ | 406 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
| 264 const vector unsigned char vg2##i = \ | 407 const vector unsigned char vg2##i = \ |
| 265 vec_perm(vg##i, vg##i, perms##i); \ | 408 vec_perm(vg##i, vg##i, perms##i); \ |
| 266 const vector unsigned char svA##i = \ | 409 const vector unsigned char svA##i = \ |
| 267 vec_sel(vbA##i, vg2##i, mask##i); \ | 410 vec_sel(vbA##i, vg2##i, mask##i); \ |
| 268 const vector unsigned char svB##i = \ | 411 const vector unsigned char svB##i = \ |
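
PACK_AND_STORE is the store-side mirror of LOAD_LINE: vec_lvsr gives the inverse rotation for the misalignment, a 0x00/0xFF byte mask is built by permuting zero against neg1, and vec_sel splices the rotated result into the previously loaded originals (vbA/vbT) so that only the eight output bytes change when both covering blocks are written back. A scalar model of that read-modify-write splice (hypothetical helper; the vector code selects lanes rather than copying bytes):

```c
#include <stdint.h>
#include <string.h>

/* Scalar model of PACK_AND_STORE's read-modify-write: splice 8 new
 * bytes into memory at an arbitrary offset while preserving every
 * other byte of the two aligned 16-byte blocks, as vec_lvsr + the
 * zero/neg1 mask + vec_sel + two vec_st do. */
static void masked_store8(uint8_t *dst, const uint8_t newbytes[8])
{
    uint8_t *blockA = (uint8_t *)((uintptr_t)dst & ~(uintptr_t)15);
    size_t off = (size_t)(dst - blockA);
    uint8_t both[32];

    memcpy(both, blockA, 32);            /* vbA/vbT: the original contents */
    memcpy(both + off, newbytes, 8);     /* vec_sel keeps only masked bytes */
    memcpy(blockA, both, 32);            /* vec_st svA, then svB */
}
```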
| 381 const vector signed short vb4minusd = vec_sub(vb4, dornotd); | 524 const vector signed short vb4minusd = vec_sub(vb4, dornotd); |
| 382 const vector signed short vb5plusd = vec_add(vb5, dornotd); | 525 const vector signed short vb5plusd = vec_add(vb5, dornotd); |
| 383 /* finally, stores */ | 526 /* finally, stores */ |
| 384 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); | 527 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); |
| 385 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); | 528 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); |
| 386 | 529 |
| 387 const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1, | 530 const vector signed char neg1 = vec_splat_s8(-1); |
| 388 -1, -1, -1, -1, -1, -1, -1, -1); | 531 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 389 | 532 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 390 const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 391 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
| 392 | 533 |
| 393 #define STORE(i) \ | 534 #define STORE(i) \ |
| 394 const vector unsigned char perms##i = \ | 535 const vector unsigned char perms##i = \ |
| 395 vec_lvsr(i * stride, src2); \ | 536 vec_lvsr(i * stride, src2); \ |
| 396 const vector unsigned char vg##i = \ | 537 const vector unsigned char vg##i = \ |
| 397 vec_perm(st##i, vbT##i, permHH); \ | 538 vec_perm(st##i, vbT##i, permHH); \ |
| 398 const vector unsigned char mask##i = \ | 539 const vector unsigned char mask##i = \ |
| 399 vec_perm((vector unsigned char)zero, neg1, perms##i); \ | 540 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
| 400 const vector unsigned char vg2##i = \ | 541 const vector unsigned char vg2##i = \ |
| 401 vec_perm(vg##i, vg##i, perms##i); \ | 542 vec_perm(vg##i, vg##i, perms##i); \ |
| 402 const vector unsigned char svA##i = \ | 543 const vector unsigned char svA##i = \ |
| 403 vec_sel(vbA##i, vg2##i, mask##i); \ | 544 vec_sel(vbA##i, vg2##i, mask##i); \ |
| 404 const vector unsigned char svB##i = \ | 545 const vector unsigned char svB##i = \ |
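
This hunk also shows a constant cleanup that recurs through the revision: the all-ones select mask is now generated in-register with vec_splat_s8(-1) instead of being loaded from a 16-byte AVV(-1, ...) literal. The stores themselves touch only the two lines flanking the block edge; a scalar sketch of that last step (hypothetical helper; how the correction d is derived lies in the elided part of the diff):

```c
/* Scalar sketch of the vertical default filter's store step: only the
 * two lines adjacent to the 8x8 block boundary change, by -d/+d
 * (vb4minusd / vb5plusd above).  d is taken as given here. */
static void apply_edge_correction(short vb4[8], short vb5[8], const short d[8])
{
    for (int i = 0; i < 8; i++) {
        vb4[i] -= d[i];   /* last line above the edge */
        vb5[i] += d[i];   /* first line below the edge */
    }
}
```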
| 678 ITER(4, 5, 6); | 819 ITER(4, 5, 6); |
| 679 ITER(5, 6, 7); | 820 ITER(5, 6, 7); |
| 680 ITER(6, 7, 8); | 821 ITER(6, 7, 8); |
| 681 ITER(7, 8, 9); | 822 ITER(7, 8, 9); |
| 682 | 823 |
| 683 const vector signed char neg1 = vec_splat_s8( -1 ); | 824 const vector signed char neg1 = vec_splat_s8(-1); |
| 684 | 825 |
| 685 #define STORE_LINE(i) \ | 826 #define STORE_LINE(i) \ |
| 686 const vector unsigned char permST##i = \ | 827 const vector unsigned char permST##i = \ |
| 687 vec_lvsr(i * stride, srcCopy); \ | 828 vec_lvsr(i * stride, srcCopy); \ |
| 688 const vector unsigned char maskST##i = \ | 829 const vector unsigned char maskST##i = \ |
| 706 #undef STORE_LINE | 847 #undef STORE_LINE |
| 707 #undef ITER | 848 #undef ITER |
| 708 #undef F2 | 849 #undef F2 |
| 709 } | 850 } |
| 710 | 851 |
| 711 #define horizClassify_altivec(a...) horizClassify_C(a) | |
| 712 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) | 852 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) |
| 713 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) | 853 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) |
| 854 #define do_a_deblock_altivec(a...) do_a_deblock_C(a) | |
| 855 | |
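
The trailing defines map the entry points that still lack AltiVec implementations onto the C versions so the template instantiates cleanly: horizClassify_altivec drops out because a real AltiVec version now exists above, and the new do_a_deblock_altivec line is the compile fix named in the commit message. The pattern uses GNU C named variadic macros; a minimal illustration with hypothetical names:

```c
#include <stdio.h>

/* GNU named-variadic-macro forwarding, as used by the *_altivec -> *_C
 * fallbacks above (filter_C / filter_altivec are hypothetical names). */
static void filter_C(int a, int b) { printf("C fallback: %d %d\n", a, b); }
#define filter_altivec(a...) filter_C(a)

int main(void)
{
    filter_altivec(1, 2);   /* expands to filter_C(1, 2) */
    return 0;
}
```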
| 856 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, | |
| 857 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | |
| 858 { | |
| 859 const vector signed int zero = vec_splat_s32(0); | |
| 860 const vector signed short vsint16_1 = vec_splat_s16(1); | |
| 861 vector signed int v_dp = zero; | |
| 862 vector signed int v_sysdp = zero; | |
| 863 int d, sysd, i; | |
| 864 | |
| 865 tempBluredPast[127]= maxNoise[0]; | |
| 866 tempBluredPast[128]= maxNoise[1]; | |
| 867 tempBluredPast[129]= maxNoise[2]; | |
| 868 | |
| 869 #define LOAD_LINE(src, i) \ | |
| 870 register int j##src##i = i * stride; \ | |
| 871 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \ | |
| 872 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \ | |
| 873 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \ | |
| 874 const vector unsigned char v_##src##A##i = \ | |
| 875 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ | |
| 876 vector signed short v_##src##Ass##i = \ | |
| 877 (vector signed short)vec_mergeh((vector signed char)zero, \ | |
| 878 (vector signed char)v_##src##A##i) | |
| 879 | |
| 880 LOAD_LINE(src, 0); | |
| 881 LOAD_LINE(src, 1); | |
| 882 LOAD_LINE(src, 2); | |
| 883 LOAD_LINE(src, 3); | |
| 884 LOAD_LINE(src, 4); | |
| 885 LOAD_LINE(src, 5); | |
| 886 LOAD_LINE(src, 6); | |
| 887 LOAD_LINE(src, 7); | |
| 888 | |
| 889 LOAD_LINE(tempBlured, 0); | |
| 890 LOAD_LINE(tempBlured, 1); | |
| 891 LOAD_LINE(tempBlured, 2); | |
| 892 LOAD_LINE(tempBlured, 3); | |
| 893 LOAD_LINE(tempBlured, 4); | |
| 894 LOAD_LINE(tempBlured, 5); | |
| 895 LOAD_LINE(tempBlured, 6); | |
| 896 LOAD_LINE(tempBlured, 7); | |
| 897 #undef LOAD_LINE | |
| 898 | |
| 899 #define ACCUMULATE_DIFFS(i) \ | |
| 900 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \ | |
| 901 v_srcAss##i); \ | |
| 902 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \ | |
| 903 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp) | |
| 904 | |
| 905 ACCUMULATE_DIFFS(0); | |
| 906 ACCUMULATE_DIFFS(1); | |
| 907 ACCUMULATE_DIFFS(2); | |
| 908 ACCUMULATE_DIFFS(3); | |
| 909 ACCUMULATE_DIFFS(4); | |
| 910 ACCUMULATE_DIFFS(5); | |
| 911 ACCUMULATE_DIFFS(6); | |
| 912 ACCUMULATE_DIFFS(7); | |
| 913 #undef ACCUMULATE_DIFFS | |
| 914 | |
| 915 v_dp = vec_sums(v_dp, zero); | |
| 916 v_sysdp = vec_sums(v_sysdp, zero); | |
| 917 | |
| 918 v_dp = vec_splat(v_dp, 3); | |
| 919 v_sysdp = vec_splat(v_sysdp, 3); | |
| 920 | |
| 921 vec_ste(v_dp, 0, &d); | |
| 922 vec_ste(v_sysdp, 0, &sysd); | |
| 923 | |
| 924 i = d; | |
| 925 d = (4*d | |
| 926 +(*(tempBluredPast-256)) | |
| 927 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 928 +(*(tempBluredPast+256)) | |
| 929 +4)>>3; | |
| 930 | |
| 931 *tempBluredPast=i; | |
| 932 | |
| 933 if (d > maxNoise[1]) { | |
| 934 if (d < maxNoise[2]) { | |
| 935 #define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i); | |
| 936 | |
| 937 OP(0); | |
| 938 OP(1); | |
| 939 OP(2); | |
| 940 OP(3); | |
| 941 OP(4); | |
| 942 OP(5); | |
| 943 OP(6); | |
| 944 OP(7); | |
| 945 #undef OP | |
| 946 } else { | |
| 947 #define OP(i) v_tempBluredAss##i = v_srcAss##i; | |
| 948 | |
| 949 OP(0); | |
| 950 OP(1); | |
| 951 OP(2); | |
| 952 OP(3); | |
| 953 OP(4); | |
| 954 OP(5); | |
| 955 OP(6); | |
| 956 OP(7); | |
| 957 #undef OP | |
| 958 } | |
| 959 } else { | |
| 960 if (d < maxNoise[0]) { | |
| 961 const vector signed short vsint16_7 = vec_splat_s16(7); | |
| 962 const vector signed short vsint16_4 = vec_splat_s16(4); | |
| 963 const vector unsigned short vuint16_3 = vec_splat_u16(3); | |
| 964 | |
| 965 #define OP(i) \ | |
| 966 const vector signed short v_temp##i = \ | |
| 967 vec_mladd(v_tempBluredAss##i, \ | |
| 968 vsint16_7, v_srcAss##i); \ | |
| 969 const vector signed short v_temp2##i = \ | |
| 970 vec_add(v_temp##i, vsint16_4); \ | |
| 971 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3) | |
| 972 | |
| 973 OP(0); | |
| 974 OP(1); | |
| 975 OP(2); | |
| 976 OP(3); | |
| 977 OP(4); | |
| 978 OP(5); | |
| 979 OP(6); | |
| 980 OP(7); | |
| 981 #undef OP | |
| 982 } else { | |
| 983 const vector signed short vsint16_3 = vec_splat_s16(3); | |
| 984 const vector signed short vsint16_2 = vec_splat_s16(2); | |
| 985 | |
| 986 #define OP(i) \ | |
| 987 const vector signed short v_temp##i = \ | |
| 988 vec_mladd(v_tempBluredAss##i, \ | |
| 989 vsint16_3, v_srcAss##i); \ | |
| 990 const vector signed short v_temp2##i = \ | |
| 991 vec_add(v_temp##i, vsint16_2); \ | |
| 992 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2) | |
| 993 | |
| 994 OP(0); | |
| 995 OP(1); | |
| 996 OP(2); | |
| 997 OP(3); | |
| 998 OP(4); | |
| 999 OP(5); | |
| 1000 OP(6); | |
| 1001 OP(7); | |
| 1002 #undef OP | |
| 1003 } | |
| 1004 } | |
| 1005 | |
| 1006 const vector signed char neg1 = vec_splat_s8(-1); | |
| 1007 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 1008 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
| 1009 | |
| 1010 #define PACK_AND_STORE(src, i) \ | |
| 1011 const vector unsigned char perms##src##i = \ | |
| 1012 vec_lvsr(i * stride, src); \ | |
| 1013 const vector unsigned char vf##src##i = \ | |
| 1014 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \ | |
| 1015 const vector unsigned char vg##src##i = \ | |
| 1016 vec_perm(vf##src##i, v_##src##A##i, permHH); \ | |
| 1017 const vector unsigned char mask##src##i = \ | |
| 1018 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \ | |
| 1019 const vector unsigned char vg2##src##i = \ | |
| 1020 vec_perm(vg##src##i, vg##src##i, perms##src##i); \ | |
| 1021 const vector unsigned char svA##src##i = \ | |
| 1022 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \ | |
| 1023 const vector unsigned char svB##src##i = \ | |
| 1024 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \ | |
| 1025 vec_st(svA##src##i, i * stride, src); \ | |
| 1026 vec_st(svB##src##i, i * stride + 16, src) | |
| 1027 | |
| 1028 PACK_AND_STORE(src, 0); | |
| 1029 PACK_AND_STORE(src, 1); | |
| 1030 PACK_AND_STORE(src, 2); | |
| 1031 PACK_AND_STORE(src, 3); | |
| 1032 PACK_AND_STORE(src, 4); | |
| 1033 PACK_AND_STORE(src, 5); | |
| 1034 PACK_AND_STORE(src, 6); | |
| 1035 PACK_AND_STORE(src, 7); | |
| 1036 PACK_AND_STORE(tempBlured, 0); | |
| 1037 PACK_AND_STORE(tempBlured, 1); | |
| 1038 PACK_AND_STORE(tempBlured, 2); | |
| 1039 PACK_AND_STORE(tempBlured, 3); | |
| 1040 PACK_AND_STORE(tempBlured, 4); | |
| 1041 PACK_AND_STORE(tempBlured, 5); | |
| 1042 PACK_AND_STORE(tempBlured, 6); | |
| 1043 PACK_AND_STORE(tempBlured, 7); | |
| 1044 #undef PACK_AND_STORE | |
| 1045 } |
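
tempNoiseReducer scores how much an 8x8 block changed: ACCUMULATE_DIFFS builds the sum of squared differences between src and tempBlured with vec_msums (v_dp; v_sysdp keeps the plain signed sum as well), the score is smoothed with the neighbouring blocks' past scores via tempBluredPast ((4*d + neighbours + 4) >> 3), and the smoothed value selects one of four blend strengths. A scalar sketch of the selection, matching the four OP variants above (hypothetical helper):

```c
#include <stdint.h>

/* Scalar sketch of the per-block blend choice in tempNoiseReducer.
 * d is the smoothed squared-difference score; maxNoise[] holds the
 * three thresholds. */
static void blend_block(uint8_t blurred[64], const uint8_t src[64],
                        int d, const int maxNoise[3])
{
    for (int i = 0; i < 64; i++) {
        if (d > maxNoise[1])                     /* strong change */
            blurred[i] = (d < maxNoise[2])
                ? (uint8_t)((blurred[i] + src[i] + 1) >> 1)      /* vec_avg  */
                : src[i];                                        /* reset    */
        else                                     /* weak change: IIR blend */
            blurred[i] = (d < maxNoise[0])
                ? (uint8_t)((7 * blurred[i] + src[i] + 4) >> 3)  /* keep 7/8 */
                : (uint8_t)((3 * blurred[i] + src[i] + 2) >> 2); /* keep 3/4 */
    }
}
```

The two weighted blends are IIR updates: keep 7/8 (or 3/4) of the running blur and admit 1/8 (or 1/4) of the new frame; vec_mladd plus vec_sr implement them per line.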
