comparison libpostproc/postprocess_altivec_template.c @ 2041:b996fbe0a7e7 libavcodec

Newer version, using a vectorized implementation of the new organisation of the code in doVertLowPass; it seems to be faster with AltiVec as well. Also includes a compile fix for the new do_a_deblock when AltiVec is used. Patch by Romain Dolbeau <dolbeau at irisa dot fr>.
author michael
date Fri, 28 May 2004 13:31:38 +0000
parents 6a6c678517b3
children 703b80c99891
comparison: 2040:5de466b3360e vs. 2041:b996fbe0a7e7
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif
+
+#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
+  do {                                                \
+    __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
+    __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
+    __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
+    __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
+    tempA1 = vec_mergeh (src_a, src_e);               \
+    tempB1 = vec_mergel (src_a, src_e);               \
+    tempC1 = vec_mergeh (src_b, src_f);               \
+    tempD1 = vec_mergel (src_b, src_f);               \
+    tempE1 = vec_mergeh (src_c, src_g);               \
+    tempF1 = vec_mergel (src_c, src_g);               \
+    tempG1 = vec_mergeh (src_d, src_h);               \
+    tempH1 = vec_mergel (src_d, src_h);               \
+    tempA2 = vec_mergeh (tempA1, tempE1);             \
+    tempB2 = vec_mergel (tempA1, tempE1);             \
+    tempC2 = vec_mergeh (tempB1, tempF1);             \
+    tempD2 = vec_mergel (tempB1, tempF1);             \
+    tempE2 = vec_mergeh (tempC1, tempG1);             \
+    tempF2 = vec_mergel (tempC1, tempG1);             \
+    tempG2 = vec_mergeh (tempD1, tempH1);             \
+    tempH2 = vec_mergel (tempD1, tempH1);             \
+    src_a = vec_mergeh (tempA2, tempE2);              \
+    src_b = vec_mergel (tempA2, tempE2);              \
+    src_c = vec_mergeh (tempB2, tempF2);              \
+    src_d = vec_mergel (tempB2, tempF2);              \
+    src_e = vec_mergeh (tempC2, tempG2);              \
+    src_f = vec_mergel (tempC2, tempG2);              \
+    src_g = vec_mergeh (tempD2, tempH2);              \
+    src_h = vec_mergel (tempD2, tempH2);              \
+  } while (0)
+
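The macro above is the standard three-pass merge transpose: each pass interleaves pairs of rows with vec_mergeh/vec_mergel, and after three passes the 8x8 block of 16-bit elements is transposed in registers. The following scalar model (purely illustrative, not part of the patch; all names are made up) mimics the same pairing pattern and checks that it really is a transpose:

#include <assert.h>

typedef short row8[8];

/* scalar stand-ins for vec_mergeh / vec_mergel on vectors of eight shorts */
static void mergeh(const short *a, const short *b, short *out) {
  for (int i = 0; i < 4; i++) { out[2*i] = a[i];   out[2*i+1] = b[i];   }
}
static void mergel(const short *a, const short *b, short *out) {
  for (int i = 0; i < 4; i++) { out[2*i] = a[i+4]; out[2*i+1] = b[i+4]; }
}

/* one pass of the macro: pair row i with row i+4, keep high halves then low halves */
static void merge_pass(row8 in[8], row8 out[8]) {
  for (int i = 0; i < 4; i++) {
    mergeh(in[i], in[i+4], out[2*i]);
    mergel(in[i], in[i+4], out[2*i+1]);
  }
}

int main(void) {
  row8 m[8], t1[8], t2[8], r[8];
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      m[i][j] = (short)(8*i + j);
  merge_pass(m,  t1);   /* tempA1..tempH1 */
  merge_pass(t1, t2);   /* tempA2..tempH2 */
  merge_pass(t2, r);    /* back into src_a..src_h */
  for (int i = 0; i < 8; i++)
    for (int j = 0; j < 8; j++)
      assert(r[i][j] == m[j][i]);   /* the result is the transpose */
  return 0;
}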

static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
...
      return 1;
    }
  else return 2;
}

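The classifier below (like vertClassify_altivec above) fetches each line with the LOAD_LINE idiom: vec_ld can only load from 16-byte-aligned addresses, so the code loads the two aligned quadwords the line may straddle and stitches the wanted bytes together with vec_perm, using the shift pattern produced by vec_lvsl (the second load is skipped when the eight bytes of interest cannot cross a quadword boundary). A scalar sketch of what that achieves, purely for illustration:

#include <stdint.h>
#include <string.h>

/* Illustrative only: emulate an "unaligned 16-byte load" the way
 * vec_ld + vec_ld(+16) + vec_perm(..., vec_lvsl(offset, src)) do on AltiVec. */
static void load_unaligned_16(const uint8_t *p, uint8_t out[16]) {
  const uint8_t *aligned = (const uint8_t *)((uintptr_t)p & ~(uintptr_t)15);
  unsigned shift = (unsigned)((uintptr_t)p & 15);   /* what vec_lvsl encodes */
  uint8_t lo[16], hi[16];
  memcpy(lo, aligned,      16);                     /* first aligned quadword  */
  memcpy(hi, aligned + 16, 16);                     /* second aligned quadword */
  for (int i = 0; i < 16; i++)                      /* vec_perm: bytes shift..shift+15 */
    out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
}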
+/* this is the same as vertClassify_altivec,
+   with an added 8x8 transpose after the loading,
+   and w/o the stride*4 offset */
+static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) {
+  /*
+    this code makes no assumption on src or stride.
+    One could remove the recomputation of the perm
+    vector by assuming (stride % 16) == 0, unfortunately
+    this is not always true.
+  */
+  register int y;
+  short __attribute__ ((aligned(16))) data[8];
+  int numEq;
+  uint8_t *src2 = src;
+  vector signed short v_dcOffset;
+  vector signed short v2QP;
+  vector unsigned short v4QP;
+  vector unsigned short v_dcThreshold;
+  int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
+  const vector signed int zero = vec_splat_s32(0);
+  const vector signed short mask = vec_splat_s16(1);
+  vector signed int v_numEq = vec_splat_s32(0);
+
+  data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
+  data[1] = data[0] * 2 + 1;
+  data[2] = c->QP * 2;
+  data[3] = c->QP * 4;
+  vector signed short v_data = vec_ld(0, data);
+  v_dcOffset = vec_splat(v_data, 0);
+  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
+  v2QP = vec_splat(v_data, 2);
+  v4QP = (vector unsigned short)vec_splat(v_data, 3);
+
+  // src2 += stride * 4;
+
+#define LOAD_LINE(i)                                            \
+  register int j##i = i * stride;                               \
+  vector unsigned char perm##i = vec_lvsl(j##i, src2);          \
+  const vector unsigned char v_srcA1##i = vec_ld(j##i, src2);   \
+  vector unsigned char v_srcA2##i;                              \
+  if (two_vectors)                                              \
+    v_srcA2##i = vec_ld(j##i + 16, src2);                       \
+  const vector unsigned char v_srcA##i =                        \
+    vec_perm(v_srcA1##i, v_srcA2##i, perm##i);                  \
+  vector signed short v_srcAss##i =                             \
+    (vector signed short)vec_mergeh((vector signed char)zero,   \
+                                    (vector signed char)v_srcA##i)
+
+  LOAD_LINE(0);
+  LOAD_LINE(1);
+  LOAD_LINE(2);
+  LOAD_LINE(3);
+  LOAD_LINE(4);
+  LOAD_LINE(5);
+  LOAD_LINE(6);
+  LOAD_LINE(7);
+#undef LOAD_LINE
+
+  ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0,
+                              v_srcAss1,
+                              v_srcAss2,
+                              v_srcAss3,
+                              v_srcAss4,
+                              v_srcAss5,
+                              v_srcAss6,
+                              v_srcAss7);
+
+#define ITER(i, j)                                                   \
+  const vector signed short v_diff##i =                              \
+    vec_sub(v_srcAss##i, v_srcAss##j);                               \
+  const vector signed short v_sum##i =                               \
+    vec_add(v_diff##i, v_dcOffset);                                  \
+  const vector signed short v_comp##i =                              \
+    (vector signed short)vec_cmplt((vector unsigned short)v_sum##i,  \
+                                   v_dcThreshold);                   \
+  const vector signed short v_part##i = vec_and(mask, v_comp##i);    \
+  v_numEq = vec_sum4s(v_part##i, v_numEq);
+
+  ITER(0, 1);
+  ITER(1, 2);
+  ITER(2, 3);
+  ITER(3, 4);
+  ITER(4, 5);
+  ITER(5, 6);
+  ITER(6, 7);
+#undef ITER
+
+  v_numEq = vec_sums(v_numEq, zero);
+
+  v_numEq = vec_splat(v_numEq, 3);
+  vec_ste(v_numEq, 0, &numEq);
+
+  if (numEq > c->ppMode.flatnessThreshold)
+    {
+      const vector unsigned char mmoP1 = (const vector unsigned char)
+        AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
+            0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
+      const vector unsigned char mmoP2 = (const vector unsigned char)
+        AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
+            0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
+      const vector unsigned char mmoP = (const vector unsigned char)
+        vec_lvsl(8, (unsigned char*)0);
+
+      vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
+      vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
+      vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
+      vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
+      vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
+      vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
+      vector signed short mmoDiff = vec_sub(mmoL, mmoR);
+      vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
+
+      if (vec_any_gt(mmoSum, v4QP))
+        return 0;
+      else
+        return 1;
+    }
+  else return 2;
+}
+
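The classification is essentially the same flatness test used by the C code path: count how many neighbouring pixel pairs differ by no more than a small DC offset, and only if the block is flat enough check its dynamic range against the quantiser. A rough scalar rendering of the per-line counting step (illustrative only; the vector code accumulates it over all eight lines at once and then applies the 2*QP/4*QP range check before returning 0, 1 or 2):

#include <stdint.h>

/* count, for one 8-pixel line p[0..7], how many neighbour pairs are "equal
 * enough"; dcOffset/dcThreshold correspond to data[0]/data[1] above */
static int count_flat_pairs(const int16_t p[8], int nonBQP, int baseDcDiff) {
  int dcOffset    = ((nonBQP * baseDcDiff) >> 8) + 1;
  int dcThreshold = dcOffset * 2 + 1;
  int numEq = 0;
  for (int i = 0; i < 7; i++) {
    int diff = p[i] - p[i + 1];
    /* the unsigned compare is a branch-free |diff| <= dcOffset test,
     * exactly what ITER() does with vec_cmplt on unsigned shorts */
    if ((unsigned)(diff + dcOffset) < (unsigned)dcThreshold)
      numEq++;
  }
  return numEq;
}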

static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
  /*
    this code makes no assumption on src or stride.
    One could remove the recomputation of the perm
...
#undef LOAD_LINE

  const vector unsigned short v_1 = vec_splat_u16(1);
  const vector unsigned short v_2 = vec_splat_u16(2);
  const vector unsigned short v_4 = vec_splat_u16(4);
-  const vector signed short v_8 = vec_splat_s16(8);
-
-  const vector signed short v_first = vec_sel(vb1, vb0,
-                                              vec_cmplt(vec_abs(vec_sub(vb0, vb1)),
-                                                        vqp));
-  const vector signed short v_last = vec_sel(vb8, vb9,
-                                             vec_cmplt(vec_abs(vec_sub(vb8, vb9)),
-                                                       vqp));
-
-  const vector signed short v_sums0 = vec_add(v_first, vb1);
-  const vector signed short v_sums1 = vec_add(vb1, vb2);
-  const vector signed short v_sums2 = vec_add(vb2, vb3);
-  const vector signed short v_sums3 = vec_add(vb3, vb4);
-  const vector signed short v_sums4 = vec_add(vb4, vb5);
-  const vector signed short v_sums5 = vec_add(vb5, vb6);
-  const vector signed short v_sums6 = vec_add(vb6, vb7);
-  const vector signed short v_sums7 = vec_add(vb7, vb8);
-  const vector signed short v_sums8 = vec_add(vb8, v_last);
-
-  const vector signed short vr1 = vec_sra(vec_add(vec_add(vec_sl(v_sums0, v_2),
-                                                          vec_sl(vec_add(v_first, v_sums2), v_1)),
-                                                  vec_add(v_sums4, v_8)),
-                                          v_4);
-  const vector signed short vr2 = vec_sra(vec_add(vec_add(vec_sl(vb2, v_2),
-                                                          v_sums5),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_first,
-                                                                         vec_add(v_sums0, v_sums3)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr3 = vec_sra(vec_add(vec_add(vec_sl(vb3, v_2),
-                                                          v_sums6),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_first,
-                                                                         vec_add(v_sums1, v_sums4)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr4 = vec_sra(vec_add(vec_add(vec_sl(vb4, v_2),
-                                                          v_sums7),
-                                                  vec_add(v_8,
-                                                          vec_add(v_sums0,
-                                                                  vec_sl(vec_add(v_sums2, v_sums5),
-                                                                         v_1)))),
-                                          v_4);
-  const vector signed short vr5 = vec_sra(vec_add(vec_add(vec_sl(vb5, v_2),
-                                                          v_sums8),
-                                                  vec_add(v_8,
-                                                          vec_add(v_sums1,
-                                                                  vec_sl(vec_add(v_sums3, v_sums6),
-                                                                         v_1)))),
-                                          v_4);
-  const vector signed short vr6 = vec_sra(vec_add(vec_add(vec_sl(vb6, v_2),
-                                                          v_sums2),
-                                                  vec_add(v_8,
-                                                          vec_sl(vec_add(v_last,
-                                                                         vec_add(v_sums7, v_sums4)),
-                                                                 v_1))),
-                                          v_4);
-  const vector signed short vr7 = vec_sra(vec_add(vec_add(vec_sl(vec_add(v_last, vb7), v_2),
-                                                          vec_sl(vec_add(vb8, v_sums5), v_1)),
-                                                  vec_add(v_8, v_sums3)),
-                                          v_4);
-  const vector signed short vr8 = vec_sra(vec_add(vec_add(vec_sl(v_sums8, v_2),
-                                                          vec_sl(vec_add(v_last, v_sums6), v_1)),
-                                                  vec_add(v_sums4, v_8)),
-                                          v_4);
-
-  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
-                                                              -1, -1, -1, -1, -1, -1, -1, -1);
-  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+  const vector signed short v_diff01 = vec_sub(vb0, vb1);
+  const vector unsigned short v_cmp01 =
+    (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
+  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
+  const vector signed short v_diff89 = vec_sub(vb8, vb9);
+  const vector unsigned short v_cmp89 =
+    (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
+  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
+
+  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
+  const vector signed short temp02 = vec_add(vb2, vb3);
+  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
+  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
+
+  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
+  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
+
+  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
+  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
+
+  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
+  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
+
+  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
+  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
+
+  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
+  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
+
+  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
+  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
+
+  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
+  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
+
+  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
+  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
+
+  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
+  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
+
+#define COMPUTE_VR(i, j, k)                                     \
+  const vector signed short temps1##i =                         \
+    vec_add(v_sumsB##i, v_sumsB##k);                            \
+  const vector signed short temps2##i =                         \
+    vec_mladd(vb##j, (vector signed short)v_2, temps1##i);      \
+  const vector signed short vr##j = vec_sra(temps2##i, v_4)
+
+  COMPUTE_VR(0, 1, 2);
+  COMPUTE_VR(1, 2, 3);
+  COMPUTE_VR(2, 3, 4);
+  COMPUTE_VR(3, 4, 5);
+  COMPUTE_VR(4, 5, 6);
+  COMPUTE_VR(5, 6, 7);
+  COMPUTE_VR(6, 7, 8);
+  COMPUTE_VR(7, 8, 9);
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

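This is the reorganisation mentioned in the commit message: instead of rebuilding every 16-tap weighted sum from scratch (the deleted vr1..vr8 expressions above), the new code keeps a sliding window sum v_sumsB0..v_sumsB9 and derives each filtered row from two of those sums plus twice the centre row. A scalar sketch of the same organisation for one column p[0..9] of the ten lines involved (illustrative only, not part of the patch):

#include <stdlib.h>

/* p[1]..p[8] are the rows being filtered; p[0] and p[9] are the guard rows.
 * qp is the per-block quantiser (vqp in the vector code). */
static void vert_lowpass_column(const int p[10], int out[10], int qp) {
  int first = (abs(p[0] - p[1]) < qp) ? p[0] : p[1];   /* v_first */
  int last  = (abs(p[8] - p[9]) < qp) ? p[9] : p[8];   /* v_last  */
  int sums[10];

  sums[0] = 4*first + p[1] + p[2] + p[3] + 4;          /* rounding folded in      */
  sums[1] = sums[0] - first + p[4];                    /* slide the window by one */
  sums[2] = sums[1] - first + p[5];
  sums[3] = sums[2] - first + p[6];
  sums[4] = sums[3] - first + p[7];
  sums[5] = sums[4] - p[1]  + p[8];
  sums[6] = sums[5] - p[2]  + last;
  sums[7] = sums[6] - p[3]  + last;
  sums[8] = sums[7] - p[4]  + last;
  sums[9] = sums[8] - p[5]  + last;

  for (int j = 1; j <= 8; j++)                         /* COMPUTE_VR(j-1, j, j+1) */
    out[j] = (sums[j - 1] + sums[j + 1] + 2*p[j]) >> 4;
}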
#define PACK_AND_STORE(i)                                       \
  const vector unsigned char perms##i =                         \
    vec_lvsr(i * stride, src2);                                 \
  const vector unsigned char vf##i =                            \
    vec_packsu(vr##i, (vector signed short)zero);               \
  const vector unsigned char vg##i =                            \
    vec_perm(vf##i, vbT##i, permHH);                            \
  const vector unsigned char mask##i =                          \
-    vec_perm((vector unsigned char)zero, neg1, perms##i);       \
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
  const vector unsigned char vg2##i =                           \
    vec_perm(vg##i, vg##i, perms##i);                           \
  const vector unsigned char svA##i =                           \
    vec_sel(vbA##i, vg2##i, mask##i);                           \
  const vector unsigned char svB##i =                           \
...
  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
  const vector signed short vb5plusd = vec_add(vb5, dornotd);
  /* finally, stores */
  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

-  const vector unsigned char neg1 = (vector unsigned char)AVV(-1, -1, -1, -1, -1, -1, -1, -1,
-                                                              -1, -1, -1, -1, -1, -1, -1, -1);
-
-  const vector unsigned char permHH = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-                                                                0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

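PACK_AND_STORE above and the STORE macro below both use the mirror image of the load idiom: the packed result bytes are rotated with the pattern from vec_lvsr, a 0x00/0xFF byte mask is built by permuting a zero vector against neg1, and vec_sel then merges the new bytes into the two aligned quadwords the destination straddles, leaving the surrounding bytes untouched. A scalar sketch of that read-modify-write, purely for illustration:

#include <stdint.h>

/* Illustrative only: write 16 result bytes to a possibly unaligned address by
 * masked read-modify-write of the two aligned quadwords it straddles. */
static void store_unaligned_16(uint8_t *dst, const uint8_t res[16]) {
  uint8_t *aligned = (uint8_t *)((uintptr_t)dst & ~(uintptr_t)15);
  unsigned shift = (unsigned)((uintptr_t)dst & 15);        /* what vec_lvsr encodes */
  uint8_t rot[32], mask[32];
  for (unsigned i = 0; i < 32; i++) {
    rot[i]  = res[(i + 32 - shift) & 15];                  /* vec_perm(vg, vg, perms)   */
    mask[i] = (i >= shift && i < shift + 16) ? 0xFF : 0x00; /* vec_perm(zero, neg1, perms) */
  }
  for (unsigned i = 0; i < 32; i++)                        /* vec_sel + two vec_st      */
    aligned[i] = (uint8_t)((aligned[i] & ~mask[i]) | (rot[i] & mask[i]));
}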
#define STORE(i)                                                \
  const vector unsigned char perms##i =                         \
    vec_lvsr(i * stride, src2);                                 \
  const vector unsigned char vg##i =                            \
    vec_perm(st##i, vbT##i, permHH);                            \
  const vector unsigned char mask##i =                          \
-    vec_perm((vector unsigned char)zero, neg1, perms##i);       \
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
  const vector unsigned char vg2##i =                           \
    vec_perm(vg##i, vg##i, perms##i);                           \
  const vector unsigned char svA##i =                           \
    vec_sel(vbA##i, vg2##i, mask##i);                           \
  const vector unsigned char svB##i =                           \
...
  ITER(4, 5, 6);
  ITER(5, 6, 7);
  ITER(6, 7, 8);
  ITER(7, 8, 9);

-  const vector signed char neg1 = vec_splat_s8( -1 );
+  const vector signed char neg1 = vec_splat_s8(-1);

#define STORE_LINE(i)                                           \
  const vector unsigned char permST##i =                        \
    vec_lvsr(i * stride, srcCopy);                              \
  const vector unsigned char maskST##i =                        \
...
#undef STORE_LINE
#undef ITER
#undef F2
}

-#define horizClassify_altivec(a...) horizClassify_C(a)
#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
+#define do_a_deblock_altivec(a...) do_a_deblock_C(a)
+
+static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
+                                            uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
+{
+  const vector signed int zero = vec_splat_s32(0);
+  const vector signed short vsint16_1 = vec_splat_s16(1);
+  vector signed int v_dp = zero;
+  vector signed int v_sysdp = zero;
+  int d, sysd, i;
+
+  tempBluredPast[127]= maxNoise[0];
+  tempBluredPast[128]= maxNoise[1];
+  tempBluredPast[129]= maxNoise[2];
+
+#define LOAD_LINE(src, i)                                                  \
+  register int j##src##i = i * stride;                                     \
+  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src);            \
+  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src);      \
+  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
+  const vector unsigned char v_##src##A##i =                               \
+    vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i);                \
+  vector signed short v_##src##Ass##i =                                    \
+    (vector signed short)vec_mergeh((vector signed char)zero,              \
+                                    (vector signed char)v_##src##A##i)
+
+  LOAD_LINE(src, 0);
+  LOAD_LINE(src, 1);
+  LOAD_LINE(src, 2);
+  LOAD_LINE(src, 3);
+  LOAD_LINE(src, 4);
+  LOAD_LINE(src, 5);
+  LOAD_LINE(src, 6);
+  LOAD_LINE(src, 7);
+
+  LOAD_LINE(tempBlured, 0);
+  LOAD_LINE(tempBlured, 1);
+  LOAD_LINE(tempBlured, 2);
+  LOAD_LINE(tempBlured, 3);
+  LOAD_LINE(tempBlured, 4);
+  LOAD_LINE(tempBlured, 5);
+  LOAD_LINE(tempBlured, 6);
+  LOAD_LINE(tempBlured, 7);
+#undef LOAD_LINE
+
+#define ACCUMULATE_DIFFS(i)                                     \
+  vector signed short v_d##i = vec_sub(v_tempBluredAss##i,      \
+                                       v_srcAss##i);            \
+  v_dp = vec_msums(v_d##i, v_d##i, v_dp);                       \
+  v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)
+
+  ACCUMULATE_DIFFS(0);
+  ACCUMULATE_DIFFS(1);
+  ACCUMULATE_DIFFS(2);
+  ACCUMULATE_DIFFS(3);
+  ACCUMULATE_DIFFS(4);
+  ACCUMULATE_DIFFS(5);
+  ACCUMULATE_DIFFS(6);
+  ACCUMULATE_DIFFS(7);
+#undef ACCUMULATE_DIFFS
+
+  v_dp = vec_sums(v_dp, zero);
+  v_sysdp = vec_sums(v_sysdp, zero);
+
+  v_dp = vec_splat(v_dp, 3);
+  v_sysdp = vec_splat(v_sysdp, 3);
+
+  vec_ste(v_dp, 0, &d);
+  vec_ste(v_sysdp, 0, &sysd);
+
+  i = d;
+  d = (4*d
+       +(*(tempBluredPast-256))
+       +(*(tempBluredPast-1))+ (*(tempBluredPast+1))
+       +(*(tempBluredPast+256))
+       +4)>>3;
+
+  *tempBluredPast=i;
+
+  if (d > maxNoise[1]) {
+    if (d < maxNoise[2]) {
+#define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i);
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    } else {
+#define OP(i) v_tempBluredAss##i = v_srcAss##i;
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    }
+  } else {
+    if (d < maxNoise[0]) {
+      const vector signed short vsint16_7 = vec_splat_s16(7);
+      const vector signed short vsint16_4 = vec_splat_s16(4);
+      const vector unsigned short vuint16_3 = vec_splat_u16(3);
+
+#define OP(i)                                                   \
+      const vector signed short v_temp##i =                     \
+        vec_mladd(v_tempBluredAss##i,                           \
+                  vsint16_7, v_srcAss##i);                      \
+      const vector signed short v_temp2##i =                    \
+        vec_add(v_temp##i, vsint16_4);                          \
+      v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    } else {
+      const vector signed short vsint16_3 = vec_splat_s16(3);
+      const vector signed short vsint16_2 = vec_splat_s16(2);
+
+#define OP(i)                                                   \
+      const vector signed short v_temp##i =                     \
+        vec_mladd(v_tempBluredAss##i,                           \
+                  vsint16_3, v_srcAss##i);                      \
+      const vector signed short v_temp2##i =                    \
+        vec_add(v_temp##i, vsint16_2);                          \
+      v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)
+
+      OP(0);
+      OP(1);
+      OP(2);
+      OP(3);
+      OP(4);
+      OP(5);
+      OP(6);
+      OP(7);
+#undef OP
+    }
+  }
+
+  const vector signed char neg1 = vec_splat_s8(-1);
+  const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                                                      0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
+
+#define PACK_AND_STORE(src, i)                                  \
+  const vector unsigned char perms##src##i =                    \
+    vec_lvsr(i * stride, src);                                  \
+  const vector unsigned char vf##src##i =                       \
+    vec_packsu(v_tempBluredAss##i, (vector signed short)zero);  \
+  const vector unsigned char vg##src##i =                       \
+    vec_perm(vf##src##i, v_##src##A##i, permHH);                \
+  const vector unsigned char mask##src##i =                     \
+    vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
+  const vector unsigned char vg2##src##i =                      \
+    vec_perm(vg##src##i, vg##src##i, perms##src##i);            \
+  const vector unsigned char svA##src##i =                      \
+    vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i);         \
+  const vector unsigned char svB##src##i =                      \
+    vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i);         \
+  vec_st(svA##src##i, i * stride, src);                         \
+  vec_st(svB##src##i, i * stride + 16, src)
+
+  PACK_AND_STORE(src, 0);
+  PACK_AND_STORE(src, 1);
+  PACK_AND_STORE(src, 2);
+  PACK_AND_STORE(src, 3);
+  PACK_AND_STORE(src, 4);
+  PACK_AND_STORE(src, 5);
+  PACK_AND_STORE(src, 6);
+  PACK_AND_STORE(src, 7);
+  PACK_AND_STORE(tempBlured, 0);
+  PACK_AND_STORE(tempBlured, 1);
+  PACK_AND_STORE(tempBlured, 2);
+  PACK_AND_STORE(tempBlured, 3);
+  PACK_AND_STORE(tempBlured, 4);
+  PACK_AND_STORE(tempBlured, 5);
+  PACK_AND_STORE(tempBlured, 6);
+  PACK_AND_STORE(tempBlured, 7);
+#undef PACK_AND_STORE
+}
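
In scalar terms the noise reducer does the following per 8x8 block (an illustrative sketch, not part of the patch; the function and parameter names are made up): compute the sum of squared differences between the incoming block and the stored blurred block, smooth that score with the scores of the neighbouring blocks kept in tempBluredPast, then copy, average, or IIR-blend depending on where the smoothed score falls relative to the maxNoise thresholds; the result is written back to both src and tempBlured.

#include <stdint.h>

static void temp_noise_reduce_block(uint8_t *src, uint8_t *blurred, int stride,
                                    uint32_t *past, const int *maxNoise) {
  int d = 0;
  for (int y = 0; y < 8; y++)
    for (int x = 0; x < 8; x++) {
      int diff = blurred[y*stride + x] - src[y*stride + x];
      d += diff * diff;                          /* vec_msums(v_d, v_d, v_dp) */
    }

  int raw = d;                                   /* remembered for the next frame */
  d = (4*d + past[-256] + past[-1] + past[+1] + past[+256] + 4) >> 3;
  past[0] = raw;

  for (int y = 0; y < 8; y++)
    for (int x = 0; x < 8; x++) {
      int i = y*stride + x;
      if (d > maxNoise[1])                            /* too much change:         */
        blurred[i] = (d < maxNoise[2])                /* average just above the   */
                   ? (blurred[i] + src[i] + 1) >> 1   /* threshold (vec_avg),     */
                   : src[i];                          /* plain copy beyond it     */
      else if (d < maxNoise[0])
        blurred[i] = (7*blurred[i] + src[i] + 4) >> 3;  /* strong temporal blend  */
      else
        blurred[i] = (3*blurred[i] + src[i] + 2) >> 2;  /* weaker temporal blend  */
      src[i] = blurred[i];                       /* PACK_AND_STORE writes both    */
    }
}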