libavcodec: comparison of libpostproc/postprocess_altivec_template.c @ 2979:bfabfdf9ce55
COSMETICS: tabs --> spaces, some prettyprinting
| author | diego |
|---|---|
| date | Thu, 22 Dec 2005 01:10:11 +0000 |
| parents | ef2149182f1c |
| children | 0b546eab515d |
| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|
| 24 #else | 24 #else |
| 25 #define AVV(x...) {x} | 25 #define AVV(x...) {x} |
| 26 #endif | 26 #endif |
| 27 | 27 |
| 28 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \ | 28 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \ |
| 29 do { \ | 29 do { \ |
| 30 __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \ | 30 __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \ |
| 31 __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \ | 31 __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \ |
| 32 __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \ | 32 __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \ |
| 33 __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \ | 33 __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \ |
| 34 tempA1 = vec_mergeh (src_a, src_e); \ | 34 tempA1 = vec_mergeh (src_a, src_e); \ |
| 35 tempB1 = vec_mergel (src_a, src_e); \ | 35 tempB1 = vec_mergel (src_a, src_e); \ |
| 36 tempC1 = vec_mergeh (src_b, src_f); \ | 36 tempC1 = vec_mergeh (src_b, src_f); \ |
| 37 tempD1 = vec_mergel (src_b, src_f); \ | 37 tempD1 = vec_mergel (src_b, src_f); \ |
| 38 tempE1 = vec_mergeh (src_c, src_g); \ | 38 tempE1 = vec_mergeh (src_c, src_g); \ |
| 39 tempF1 = vec_mergel (src_c, src_g); \ | 39 tempF1 = vec_mergel (src_c, src_g); \ |
| 40 tempG1 = vec_mergeh (src_d, src_h); \ | 40 tempG1 = vec_mergeh (src_d, src_h); \ |
| 41 tempH1 = vec_mergel (src_d, src_h); \ | 41 tempH1 = vec_mergel (src_d, src_h); \ |
| 42 tempA2 = vec_mergeh (tempA1, tempE1); \ | 42 tempA2 = vec_mergeh (tempA1, tempE1); \ |
| 43 tempB2 = vec_mergel (tempA1, tempE1); \ | 43 tempB2 = vec_mergel (tempA1, tempE1); \ |
| 44 tempC2 = vec_mergeh (tempB1, tempF1); \ | 44 tempC2 = vec_mergeh (tempB1, tempF1); \ |
| 45 tempD2 = vec_mergel (tempB1, tempF1); \ | 45 tempD2 = vec_mergel (tempB1, tempF1); \ |
| 46 tempE2 = vec_mergeh (tempC1, tempG1); \ | 46 tempE2 = vec_mergeh (tempC1, tempG1); \ |
| 47 tempF2 = vec_mergel (tempC1, tempG1); \ | 47 tempF2 = vec_mergel (tempC1, tempG1); \ |
| 48 tempG2 = vec_mergeh (tempD1, tempH1); \ | 48 tempG2 = vec_mergeh (tempD1, tempH1); \ |
| 49 tempH2 = vec_mergel (tempD1, tempH1); \ | 49 tempH2 = vec_mergel (tempD1, tempH1); \ |
| 50 src_a = vec_mergeh (tempA2, tempE2); \ | 50 src_a = vec_mergeh (tempA2, tempE2); \ |
| 51 src_b = vec_mergel (tempA2, tempE2); \ | 51 src_b = vec_mergel (tempA2, tempE2); \ |
| 52 src_c = vec_mergeh (tempB2, tempF2); \ | 52 src_c = vec_mergeh (tempB2, tempF2); \ |
| 53 src_d = vec_mergel (tempB2, tempF2); \ | 53 src_d = vec_mergel (tempB2, tempF2); \ |
| 54 src_e = vec_mergeh (tempC2, tempG2); \ | 54 src_e = vec_mergeh (tempC2, tempG2); \ |
| 55 src_f = vec_mergel (tempC2, tempG2); \ | 55 src_f = vec_mergel (tempC2, tempG2); \ |
| 56 src_g = vec_mergeh (tempD2, tempH2); \ | 56 src_g = vec_mergeh (tempD2, tempH2); \ |
| 57 src_h = vec_mergel (tempD2, tempH2); \ | 57 src_h = vec_mergel (tempD2, tempH2); \ |
| 58 } while (0) | 58 } while (0) |
| 59 | 59 |
| 60 | 60 |
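
The ALTIVEC_TRANSPOSE_8x8_SHORT macro above transposes an 8x8 block of 16-bit elements using nothing but three rounds of vec_mergeh/vec_mergel interleaves, with the rows paired 4 apart in every round. A minimal scalar model of the same merge network, assuming plain C arrays in place of vector registers:

```c
#include <stdio.h>

/* Scalar model of vec_mergeh/vec_mergel on rows of 8 shorts:
 * merge_hi interleaves the first halves of a and b,
 * merge_lo interleaves the second halves. */
static void merge_hi(const short a[8], const short b[8], short out[8]) {
    for (int k = 0; k < 4; k++) { out[2 * k] = a[k]; out[2 * k + 1] = b[k]; }
}
static void merge_lo(const short a[8], const short b[8], short out[8]) {
    for (int k = 0; k < 4; k++) { out[2 * k] = a[4 + k]; out[2 * k + 1] = b[4 + k]; }
}
/* One round of the macro's pairing: rows (i, i+4) -> rows (2i, 2i+1). */
static void round8(short in[8][8], short out[8][8]) {
    for (int i = 0; i < 4; i++) {
        merge_hi(in[i], in[i + 4], out[2 * i]);
        merge_lo(in[i], in[i + 4], out[2 * i + 1]);
    }
}

int main(void) {
    short m[8][8], t1[8][8], t2[8][8], r[8][8];
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            m[i][j] = (short)(10 * i + j);
    round8(m, t1);   /* tempA1..tempH1 */
    round8(t1, t2);  /* tempA2..tempH2 */
    round8(t2, r);   /* final src_a..src_h */
    printf("r[2][5]=%d m[5][2]=%d\n", r[2][5], m[5][2]);  /* both 52 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|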
| 61 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) { | 61 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) { |
| 62 /* | 62 /* |
| 92 | 92 |
| 93 src2 += stride * 4; | 93 src2 += stride * 4; |
| 94 | 94 |
| 95 vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7; | 95 vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7; |
| 96 | 96 |
| 97 #define LOAD_LINE(i) \ | 97 #define LOAD_LINE(i) \ |
| 98 register int j##i = i * stride; \ | 98 register int j##i = i * stride; \ |
| 99 vector unsigned char perm##i = vec_lvsl(j##i, src2); \ | 99 vector unsigned char perm##i = vec_lvsl(j##i, src2); \ |
| 100 const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \ | 100 const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \ |
| 101 vector unsigned char v_srcA2##i; \ | 101 vector unsigned char v_srcA2##i; \ |
| 102 if (two_vectors) \ | 102 if (two_vectors) \ |
| 103 v_srcA2##i = vec_ld(j##i + 16, src2); \ | 103 v_srcA2##i = vec_ld(j##i + 16, src2); \ |
| 104 const vector unsigned char v_srcA##i = \ | 104 const vector unsigned char v_srcA##i = \ |
| 105 vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \ | 105 vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \ |
| 106 v_srcAss##i = \ | 106 v_srcAss##i = \ |
| 107 (vector signed short)vec_mergeh((vector signed char)zero, \ | 107 (vector signed short)vec_mergeh((vector signed char)zero, \ |
| 108 (vector signed char)v_srcA##i) | 108 (vector signed char)v_srcA##i) |
| 109 | 109 |
| 110 #define LOAD_LINE_ALIGNED(i) \ | 110 #define LOAD_LINE_ALIGNED(i) \ |
| 111 register int j##i = i * stride; \ | 111 register int j##i = i * stride; \ |
| 112 const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \ | 112 const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \ |
| 113 v_srcAss##i = \ | 113 v_srcAss##i = \ |
| 114 (vector signed short)vec_mergeh((vector signed char)zero, \ | 114 (vector signed short)vec_mergeh((vector signed char)zero, \ |
| 115 (vector signed char)v_srcA##i) | 115 (vector signed char)v_srcA##i) |
| 116 | 116 |
| 117 // special casing the aligned case is worthwhile, as all calls from | 117 // special casing the aligned case is worthwhile, as all calls from |
| 118 // the (transposed) horizontal deblocks will be aligned, in addition | 118 // the (transposed) horizontal deblocks will be aligned, in addition |
| 119 // to the naturally aligned vertical deblocks. | 119 // to the naturally aligned vertical deblocks. |
| 120 if (properStride && srcAlign) { | 120 if (properStride && srcAlign) { |
| 137 LOAD_LINE(7); | 137 LOAD_LINE(7); |
| 138 } | 138 } |
| 139 #undef LOAD_LINE | 139 #undef LOAD_LINE |
| 140 #undef LOAD_LINE_ALIGNED | 140 #undef LOAD_LINE_ALIGNED |
| 141 | 141 |
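
Both LOAD_LINE variants zero-extend eight pixels to shorts via vec_mergeh with a zero vector; the unaligned variant first has to stitch the wanted 16 bytes together from the two enclosing aligned blocks, because pre-VSX vec_ld silently rounds the address down to a 16-byte boundary. The two_vectors guard presumably skips the second load when the row is known to fit in one block. A scalar sketch of the two-load-and-permute idiom, where a plain byte shift plays the role of the vec_lvsl permute:

```c
#include <stdint.h>
#include <stdio.h>

/* Scalar model of the pre-VSX unaligned load: vec_ld(0, p) really loads
 * the aligned 16-byte block containing p, so an unaligned 16-byte read
 * is assembled from the two enclosing blocks plus a byte rotation by
 * p & 15 -- the rotation vec_lvsl encodes as a permute vector. */
static void load_unaligned16(const uint8_t *p, uint8_t out[16]) {
    uintptr_t addr = (uintptr_t)p;
    const uint8_t *lo = (const uint8_t *)(addr & ~(uintptr_t)15); /* vec_ld(0, p)  */
    const uint8_t *hi = lo + 16;                                  /* vec_ld(16, p) */
    unsigned shift = (unsigned)(addr & 15);                       /* vec_lvsl      */
    for (int i = 0; i < 16; i++)                                  /* vec_perm      */
        out[i] = (shift + i < 16) ? lo[shift + i] : hi[shift + i - 16];
}

int main(void) {
    static uint8_t buf[48] __attribute__((aligned(16)));
    for (int i = 0; i < 48; i++) buf[i] = (uint8_t)i;
    uint8_t v[16];
    load_unaligned16(buf + 5, v);       /* misaligned source pointer */
    printf("%d %d\n", v[0], v[15]);     /* 5 20 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|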
| 142 #define ITER(i, j) \ | 142 #define ITER(i, j) \ |
| 143 const vector signed short v_diff##i = \ | 143 const vector signed short v_diff##i = \ |
| 144 vec_sub(v_srcAss##i, v_srcAss##j); \ | 144 vec_sub(v_srcAss##i, v_srcAss##j); \ |
| 145 const vector signed short v_sum##i = \ | 145 const vector signed short v_sum##i = \ |
| 146 vec_add(v_diff##i, v_dcOffset); \ | 146 vec_add(v_diff##i, v_dcOffset); \ |
| 147 const vector signed short v_comp##i = \ | 147 const vector signed short v_comp##i = \ |
| 148 (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \ | 148 (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \ |
| 149 v_dcThreshold); \ | 149 v_dcThreshold); \ |
| 150 const vector signed short v_part##i = vec_and(mask, v_comp##i); \ | 150 const vector signed short v_part##i = vec_and(mask, v_comp##i); \ |
| 151 v_numEq = vec_sum4s(v_part##i, v_numEq); | 151 v_numEq = vec_sum4s(v_part##i, v_numEq); |
| 152 | 152 |
| 153 ITER(0, 1); | 153 ITER(0, 1); |
| 154 ITER(1, 2); | 154 ITER(1, 2); |
| 155 ITER(2, 3); | 155 ITER(2, 3); |
| 165 vec_ste(v_numEq, 0, &numEq); | 165 vec_ste(v_numEq, 0, &numEq); |
| 166 | 166 |
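
ITER counts "flat" neighbour pairs without computing an absolute value: it adds dcOffset to the signed difference and performs a single unsigned compare against dcThreshold, and vec_sum4s folds the per-column hits into numEq. A scalar sketch of the wrap-around trick, using one stand-in radius K (the real dcOffset/dcThreshold pair is derived from the quantizer and lives in the PPContext):

```c
#include <stdint.h>
#include <stdio.h>

/* Branchless "difference within threshold" test as done by ITER:
 * with offset K and threshold 2*K, (uint16_t)((a - b) + K) < 2*K holds
 * exactly when -K <= a-b < K, so no abs() is needed. */
static int near_equal(int16_t a, int16_t b, unsigned K) {
    uint16_t sum = (uint16_t)((a - b) + (int)K);  /* vec_add(v_diff, v_dcOffset) */
    return sum < 2 * K;                           /* unsigned vec_cmplt          */
}

int main(void) {
    /* One column of 8 pixels; vertClassify does all eight columns at
     * once and accumulates the hits with vec_sum4s. */
    int16_t col[8] = {100, 101, 99, 100, 140, 100, 100, 100};
    int numEq = 0;
    for (int i = 0; i < 7; i++)
        numEq += near_equal(col[i], col[i + 1], 8);
    printf("numEq = %d\n", numEq);  /* 5: the jumps to and from 140 fail */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|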
| 167 if (numEq > c->ppMode.flatnessThreshold) | 167 if (numEq > c->ppMode.flatnessThreshold) |
| 168 { | 168 { |
| 169 const vector unsigned char mmoP1 = (const vector unsigned char) | 169 const vector unsigned char mmoP1 = (const vector unsigned char) |
| 170 AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, | 170 AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, |
| 171 0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B); | 171 0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B); |
| 172 const vector unsigned char mmoP2 = (const vector unsigned char) | 172 const vector unsigned char mmoP2 = (const vector unsigned char) |
| 173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, | 173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, |
| 174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); | 174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); |
| 175 const vector unsigned char mmoP = (const vector unsigned char) | 175 const vector unsigned char mmoP = (const vector unsigned char) |
| 176 vec_lvsl(8, (unsigned char*)0); | 176 vec_lvsl(8, (unsigned char*)0); |
| 177 | 177 |
| 178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); | 178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); |
| 179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); | 179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); |
| 180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); | 180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); |
| 181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); | 181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); |
| 183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); | 183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); |
| 184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); | 184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); |
| 185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); | 185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); |
| 186 | 186 |
| 187 if (vec_any_gt(mmoSum, v4QP)) | 187 if (vec_any_gt(mmoSum, v4QP)) |
| 188 return 0; | 188 return 0; |
| 189 else | 189 else |
| 190 return 1; | 190 return 1; |
| 191 } | 191 } |
| 192 else return 2; | 192 else return 2; |
| 193 } | 193 } |
| 194 | 194 |
| 195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { | 195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { |
| 216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; | 216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; |
| 217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; | 217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; |
| 218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; | 218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; |
| 219 | 219 |
| 220 #define LOAD_LINE(i) \ | 220 #define LOAD_LINE(i) \ |
| 221 const vector unsigned char perml##i = \ | 221 const vector unsigned char perml##i = \ |
| 222 vec_lvsl(i * stride, src2); \ | 222 vec_lvsl(i * stride, src2); \ |
| 223 vbA##i = vec_ld(i * stride, src2); \ | 223 vbA##i = vec_ld(i * stride, src2); \ |
| 224 vbB##i = vec_ld(i * stride + 16, src2); \ | 224 vbB##i = vec_ld(i * stride + 16, src2); \ |
| 225 vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \ | 225 vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \ |
| 226 vb##i = \ | 226 vb##i = \ |
| 227 (vector signed short)vec_mergeh((vector unsigned char)zero, \ | 227 (vector signed short)vec_mergeh((vector unsigned char)zero, \ |
| 228 (vector unsigned char)vbT##i) | 228 (vector unsigned char)vbT##i) |
| 229 | 229 |
| 230 #define LOAD_LINE_ALIGNED(i) \ | 230 #define LOAD_LINE_ALIGNED(i) \ |
| 231 register int j##i = i * stride; \ | 231 register int j##i = i * stride; \ |
| 232 vbT##i = vec_ld(j##i, src2); \ | 232 vbT##i = vec_ld(j##i, src2); \ |
| 233 vb##i = \ | 233 vb##i = \ |
| 234 (vector signed short)vec_mergeh((vector signed char)zero, \ | 234 (vector signed short)vec_mergeh((vector signed char)zero, \ |
| 235 (vector signed char)vbT##i) | 235 (vector signed char)vbT##i) |
| 236 | 236 |
| 237 // special casing the aligned case is worthwhile, as all calls from | 237 // special casing the aligned case is worthwhile, as all calls from |
| 238 // the (transposed) horizontal deblocks will be aligned, in addition | 238 // the (transposed) horizontal deblocks will be aligned, in addition |
| 239 // to the naturally aligned vertical deblocks. | 239 // to the naturally aligned vertical deblocks. |
| 240 if (properStride && srcAlign) { | 240 if (properStride && srcAlign) { |
| 306 const vector signed short v_sumsB8 = vec_add(temp81, v_last); | 306 const vector signed short v_sumsB8 = vec_add(temp81, v_last); |
| 307 | 307 |
| 308 const vector signed short temp91 = vec_sub(v_sumsB8, vb5); | 308 const vector signed short temp91 = vec_sub(v_sumsB8, vb5); |
| 309 const vector signed short v_sumsB9 = vec_add(temp91, v_last); | 309 const vector signed short v_sumsB9 = vec_add(temp91, v_last); |
| 310 | 310 |
| 311 #define COMPUTE_VR(i, j, k) \ | 311 #define COMPUTE_VR(i, j, k) \ |
| 312 const vector signed short temps1##i = \ | 312 const vector signed short temps1##i = \ |
| 313 vec_add(v_sumsB##i, v_sumsB##k); \ | 313 vec_add(v_sumsB##i, v_sumsB##k); \ |
| 314 const vector signed short temps2##i = \ | 314 const vector signed short temps2##i = \ |
| 315 vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \ | 315 vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \ |
| 316 const vector signed short vr##j = vec_sra(temps2##i, v_4) | 316 const vector signed short vr##j = vec_sra(temps2##i, v_4) |
| 317 | 317 |
| 318 COMPUTE_VR(0, 1, 2); | 318 COMPUTE_VR(0, 1, 2); |
| 319 COMPUTE_VR(1, 2, 3); | 319 COMPUTE_VR(1, 2, 3); |
| 320 COMPUTE_VR(2, 3, 4); | 320 COMPUTE_VR(2, 3, 4); |
| 324 COMPUTE_VR(6, 7, 8); | 324 COMPUTE_VR(6, 7, 8); |
| 325 COMPUTE_VR(7, 8, 9); | 325 COMPUTE_VR(7, 8, 9); |
| 326 | 326 |
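
COMPUTE_VR produces each output row as (sums[j-1] + sums[j+1] + 2*p[j]) >> 4, a smoothing kernel whose weights total 16; the v_sumsB0..v_sumsB9 sums are maintained incrementally (see the temp81/temp91 rows above) with the border rows replicated. A toy scalar version, assuming 7-tap windows so the weights add up to 16 exactly; the real bookkeeping differs in detail:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int16_t p[10], sums[10];
    for (int i = 0; i < 10; i++) p[i] = (int16_t)(10 * i);
    /* Stand-in 7-tap sliding sums with edge replication; the real code
     * keeps these incrementally: sums[i] = sums[i-1] - leaving + entering. */
    for (int j = 0; j < 10; j++) {
        int s = 0;
        for (int k = j - 3; k <= j + 3; k++) {
            int kk = k < 0 ? 0 : (k > 9 ? 9 : k);
            s += p[kk];
        }
        sums[j] = (int16_t)s;
    }
    /* COMPUTE_VR(i, j, k): vr_j = (sums_i + sums_k + 2*p_j) >> 4 */
    for (int j = 1; j <= 8; j++)
        printf("vr[%d] = %d\n", j, (sums[j - 1] + sums[j + 1] + 2 * p[j]) >> 4);
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|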
| 327 const vector signed char neg1 = vec_splat_s8(-1); | 327 const vector signed char neg1 = vec_splat_s8(-1); |
| 328 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 328 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 329 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 329 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 330 | 330 |
| 331 #define PACK_AND_STORE(i) \ | 331 #define PACK_AND_STORE(i) \ |
| 332 const vector unsigned char perms##i = \ | 332 const vector unsigned char perms##i = \ |
| 333 vec_lvsr(i * stride, src2); \ | 333 vec_lvsr(i * stride, src2); \ |
| 334 const vector unsigned char vf##i = \ | 334 const vector unsigned char vf##i = \ |
| 335 vec_packsu(vr##i, (vector signed short)zero); \ | 335 vec_packsu(vr##i, (vector signed short)zero); \ |
| 336 const vector unsigned char vg##i = \ | 336 const vector unsigned char vg##i = \ |
| 337 vec_perm(vf##i, vbT##i, permHH); \ | 337 vec_perm(vf##i, vbT##i, permHH); \ |
| 338 const vector unsigned char mask##i = \ | 338 const vector unsigned char mask##i = \ |
| 339 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ | 339 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
| 340 const vector unsigned char vg2##i = \ | 340 const vector unsigned char vg2##i = \ |
| 341 vec_perm(vg##i, vg##i, perms##i); \ | 341 vec_perm(vg##i, vg##i, perms##i); \ |
| 342 const vector unsigned char svA##i = \ | 342 const vector unsigned char svA##i = \ |
| 343 vec_sel(vbA##i, vg2##i, mask##i); \ | 343 vec_sel(vbA##i, vg2##i, mask##i); \ |
| 344 const vector unsigned char svB##i = \ | 344 const vector unsigned char svB##i = \ |
| 345 vec_sel(vg2##i, vbB##i, mask##i); \ | 345 vec_sel(vg2##i, vbB##i, mask##i); \ |
| 346 vec_st(svA##i, i * stride, src2); \ | 346 vec_st(svA##i, i * stride, src2); \ |
| 347 vec_st(svB##i, i * stride + 16, src2) | 347 vec_st(svB##i, i * stride + 16, src2) |
| 348 | 348 |
| 349 #define PACK_AND_STORE_ALIGNED(i) \ | 349 #define PACK_AND_STORE_ALIGNED(i) \ |
| 350 const vector unsigned char vf##i = \ | 350 const vector unsigned char vf##i = \ |
| 351 vec_packsu(vr##i, (vector signed short)zero); \ | 351 vec_packsu(vr##i, (vector signed short)zero); \ |
| 352 const vector unsigned char vg##i = \ | 352 const vector unsigned char vg##i = \ |
| 353 vec_perm(vf##i, vbT##i, permHH); \ | 353 vec_perm(vf##i, vbT##i, permHH); \ |
| 354 vec_st(vg##i, i * stride, src2) | 354 vec_st(vg##i, i * stride, src2) |
| 355 | 355 |
| 356 // special casing the aligned case is worthwhile, as all calls from | 356 // special casing the aligned case is worthwhile, as all calls from |
| 357 // the (transposed) horizontal deblocks will be aligned, in addition | 357 // the (transposed) horizontal deblocks will be aligned, in addition |
| 358 // to the naturally aligned vertical deblocks. | 358 // to the naturally aligned vertical deblocks. |
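
PACK_AND_STORE is the store-side mirror of the unaligned load: vec_st can only hit an aligned 16-byte block, so the code rotates the packed result into position with a vec_lvsr permute, builds a 0x00/0xFF byte mask, selects between new and original bytes with vec_sel, and writes both enclosing blocks back. A scalar model of this read-modify-write idiom:

```c
#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Load the two enclosing aligned blocks (vbA/vbB), splice the payload
 * in at the right offset (vec_lvsr rotation + mask + vec_sel), then
 * store both blocks back. */
static void store_unaligned(uint8_t *dst, const uint8_t *payload, int n) {
    uintptr_t addr = (uintptr_t)dst;
    uint8_t *lo = (uint8_t *)(addr & ~(uintptr_t)15);
    unsigned shift = (unsigned)(addr & 15);  /* vec_lvsr offset      */
    uint8_t block[32];
    memcpy(block, lo, 32);                   /* vec_ld x2            */
    for (int i = 0; i < n; i++)              /* mask + vec_sel       */
        block[shift + i] = payload[i];
    memcpy(lo, block, 32);                   /* vec_st x2 (svA, svB) */
}

int main(void) {
    static uint8_t buf[64] __attribute__((aligned(16)));
    uint8_t eight[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    store_unaligned(buf + 21, eight, 8);     /* misaligned destination */
    printf("%d %d\n", buf[21], buf[28]);     /* 1 8 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|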
| 396 qp[0] = 8*c->QP; | 396 qp[0] = 8*c->QP; |
| 397 vector signed short vqp = vec_ld(0, qp); | 397 vector signed short vqp = vec_ld(0, qp); |
| 398 vqp = vec_splat(vqp, 0); | 398 vqp = vec_splat(vqp, 0); |
| 399 | 399 |
| 400 #define LOAD_LINE(i) \ | 400 #define LOAD_LINE(i) \ |
| 401 const vector unsigned char perm##i = \ | 401 const vector unsigned char perm##i = \ |
| 402 vec_lvsl(i * stride, src2); \ | 402 vec_lvsl(i * stride, src2); \ |
| 403 const vector unsigned char vbA##i = \ | 403 const vector unsigned char vbA##i = \ |
| 404 vec_ld(i * stride, src2); \ | 404 vec_ld(i * stride, src2); \ |
| 405 const vector unsigned char vbB##i = \ | 405 const vector unsigned char vbB##i = \ |
| 406 vec_ld(i * stride + 16, src2); \ | 406 vec_ld(i * stride + 16, src2); \ |
| 407 const vector unsigned char vbT##i = \ | 407 const vector unsigned char vbT##i = \ |
| 408 vec_perm(vbA##i, vbB##i, perm##i); \ | 408 vec_perm(vbA##i, vbB##i, perm##i); \ |
| 409 const vector signed short vb##i = \ | 409 const vector signed short vb##i = \ |
| 410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ | 410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ |
| 411 (vector unsigned char)vbT##i) | 411 (vector unsigned char)vbT##i) |
| 412 | 412 |
| 413 src2 += stride*3; | 413 src2 += stride*3; |
| 414 | 414 |
| 415 LOAD_LINE(1); | 415 LOAD_LINE(1); |
| 416 LOAD_LINE(2); | 416 LOAD_LINE(2); |
| 424 | 424 |
| 425 const vector signed short v_1 = vec_splat_s16(1); | 425 const vector signed short v_1 = vec_splat_s16(1); |
| 426 const vector signed short v_2 = vec_splat_s16(2); | 426 const vector signed short v_2 = vec_splat_s16(2); |
| 427 const vector signed short v_5 = vec_splat_s16(5); | 427 const vector signed short v_5 = vec_splat_s16(5); |
| 428 const vector signed short v_32 = vec_sl(v_1, | 428 const vector signed short v_32 = vec_sl(v_1, |
| 429 (vector unsigned short)v_5); | 429 (vector unsigned short)v_5); |
| 430 /* middle energy */ | 430 /* middle energy */ |
| 431 const vector signed short l3minusl6 = vec_sub(vb3, vb6); | 431 const vector signed short l3minusl6 = vec_sub(vb3, vb6); |
| 432 const vector signed short l5minusl4 = vec_sub(vb5, vb4); | 432 const vector signed short l5minusl4 = vec_sub(vb5, vb4); |
| 433 const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero); | 433 const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero); |
| 434 const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6); | 434 const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6); |
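
The "middle energy" mE = 5*(l5 - l4) + 2*(l3 - l6) measures the signal across the block boundary between rows 4 and 5; the filter only touches those two rows when |mE| stays below 8*QP, which is why the function splats 8*c->QP into vqp above. A scalar illustration of the gate:

```c
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    /* Rows 3..6 around the block boundary; a sharp step between rows
     * 4 and 5 suggests a real edge rather than a blocking artifact. */
    int l3 = 100, l4 = 102, l5 = 130, l6 = 131;
    int QP = 3;
    int mE = 2 * (l3 - l6) + 5 * (l5 - l4);   /* the vec_mladd pair above */
    printf("mE = %d, gate = %d -> %s\n", mE, 8 * QP,
           abs(mE) < 8 * QP ? "filter" : "leave the edge alone");
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|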
| 481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); | 481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); |
| 482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); | 482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); |
| 483 | 483 |
| 484 const vector signed char neg1 = vec_splat_s8(-1); | 484 const vector signed char neg1 = vec_splat_s8(-1); |
| 485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 487 | 487 |
| 488 #define STORE(i) \ | 488 #define STORE(i) \ |
| 489 const vector unsigned char perms##i = \ | 489 const vector unsigned char perms##i = \ |
| 490 vec_lvsr(i * stride, src2); \ | 490 vec_lvsr(i * stride, src2); \ |
| 491 const vector unsigned char vg##i = \ | 491 const vector unsigned char vg##i = \ |
| 492 vec_perm(st##i, vbT##i, permHH); \ | 492 vec_perm(st##i, vbT##i, permHH); \ |
| 493 const vector unsigned char mask##i = \ | 493 const vector unsigned char mask##i = \ |
| 494 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ | 494 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
| 495 const vector unsigned char vg2##i = \ | 495 const vector unsigned char vg2##i = \ |
| 496 vec_perm(vg##i, vg##i, perms##i); \ | 496 vec_perm(vg##i, vg##i, perms##i); \ |
| 497 const vector unsigned char svA##i = \ | 497 const vector unsigned char svA##i = \ |
| 498 vec_sel(vbA##i, vg2##i, mask##i); \ | 498 vec_sel(vbA##i, vg2##i, mask##i); \ |
| 499 const vector unsigned char svB##i = \ | 499 const vector unsigned char svB##i = \ |
| 500 vec_sel(vg2##i, vbB##i, mask##i); \ | 500 vec_sel(vg2##i, vbB##i, mask##i); \ |
| 501 vec_st(svA##i, i * stride, src2); \ | 501 vec_st(svA##i, i * stride, src2); \ |
| 502 vec_st(svB##i, i * stride + 16, src2) | 502 vec_st(svB##i, i * stride + 16, src2) |
| 503 | 503 |
| 504 STORE(4); | 504 STORE(4); |
| 505 STORE(5); | 505 STORE(5); |
| 506 } | 506 } |
| 520 const vector signed int zero = vec_splat_s32(0); | 520 const vector signed int zero = vec_splat_s32(0); |
| 521 vector unsigned char v_dt; | 521 vector unsigned char v_dt; |
| 522 dt[0] = deringThreshold; | 522 dt[0] = deringThreshold; |
| 523 v_dt = vec_splat(vec_ld(0, dt), 0); | 523 v_dt = vec_splat(vec_ld(0, dt), 0); |
| 524 | 524 |
| 525 #define LOAD_LINE(i) \ | 525 #define LOAD_LINE(i) \ |
| 526 const vector unsigned char perm##i = \ | 526 const vector unsigned char perm##i = \ |
| 527 vec_lvsl(i * stride, srcCopy); \ | 527 vec_lvsl(i * stride, srcCopy); \ |
| 528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ | 528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ |
| 529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ | 529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ |
| 530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) | 530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) |
| 531 | 531 |
| 532 LOAD_LINE(0); | 532 LOAD_LINE(0); |
| 533 LOAD_LINE(1); | 533 LOAD_LINE(1); |
| 534 LOAD_LINE(2); | 534 LOAD_LINE(2); |
| 543 | 543 |
| 544 vector unsigned char v_avg; | 544 vector unsigned char v_avg; |
| 545 { | 545 { |
| 546 const vector unsigned char trunc_perm = (vector unsigned char) | 546 const vector unsigned char trunc_perm = (vector unsigned char) |
| 547 AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, | 547 AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, |
| 548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); | 548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); |
| 549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); | 549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); |
| 550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); | 550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); |
| 551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); | 551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); |
| 552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); | 552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); |
| 553 | 553 |
| 554 #define EXTRACT(op) do { \ | 554 #define EXTRACT(op) do { \ |
| 555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ | 555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ |
| 556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ | 556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ |
| 557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ | 557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ |
| 558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ | 558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ |
| 559 const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \ | 559 const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \ |
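
EXTRACT is instantiated with min and max to reduce the 8x8 block to its two extreme values, the mergeh/mergel steps broadcasting the result to every lane; the elided lines presumably derive v_avg as the mid-range of the block, as the C dering filter does. A scalar sketch (the rounding choice here is illustrative):

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint8_t block[8][8];
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++)
            block[i][j] = (uint8_t)(100 + (i * 13 + j * 7) % 40);
    /* The reduction EXTRACT performs with a vec_min/vec_max tree. */
    uint8_t mn = 255, mx = 0;
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++) {
            if (block[i][j] < mn) mn = block[i][j];
            if (block[i][j] > mx) mx = block[i][j];
        }
    uint8_t avg = (uint8_t)((mn + mx + 1) >> 1);  /* mid-range as v_avg */
    printf("min=%d max=%d avg=%d\n", mn, mx, avg);
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|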
| 582 | 582 |
| 583 signed int __attribute__((aligned(16))) S[8]; | 583 signed int __attribute__((aligned(16))) S[8]; |
| 584 { | 584 { |
| 585 const vector unsigned short mask1 = (vector unsigned short) | 585 const vector unsigned short mask1 = (vector unsigned short) |
| 586 AVV(0x0001, 0x0002, 0x0004, 0x0008, | 586 AVV(0x0001, 0x0002, 0x0004, 0x0008, |
| 587 0x0010, 0x0020, 0x0040, 0x0080); | 587 0x0010, 0x0020, 0x0040, 0x0080); |
| 588 const vector unsigned short mask2 = (vector unsigned short) | 588 const vector unsigned short mask2 = (vector unsigned short) |
| 589 AVV(0x0100, 0x0200, 0x0000, 0x0000, | 589 AVV(0x0100, 0x0200, 0x0000, 0x0000, |
| 590 0x0000, 0x0000, 0x0000, 0x0000); | 590 0x0000, 0x0000, 0x0000, 0x0000); |
| 591 | 591 |
| 592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); | 592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); |
| 593 const vector unsigned int vuint32_1 = vec_splat_u32(1); | 593 const vector unsigned int vuint32_1 = vec_splat_u32(1); |
| 594 | 594 |
| 595 #define COMPARE(i) \ | 595 #define COMPARE(i) \ |
| 596 vector signed int sum##i; \ | 596 vector signed int sum##i; \ |
| 597 do { \ | 597 do { \ |
| 598 const vector unsigned char cmp##i = \ | 598 const vector unsigned char cmp##i = \ |
| 599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ | 599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ |
| 600 const vector unsigned short cmpHi##i = \ | 600 const vector unsigned short cmpHi##i = \ |
| 601 (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \ | 601 (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \ |
| 602 const vector unsigned short cmpLi##i = \ | 602 const vector unsigned short cmpLi##i = \ |
| 603 (vector unsigned short)vec_mergel(cmp##i, cmp##i); \ | 603 (vector unsigned short)vec_mergel(cmp##i, cmp##i); \ |
| 604 const vector signed short cmpHf##i = \ | 604 const vector signed short cmpHf##i = \ |
| 605 (vector signed short)vec_and(cmpHi##i, mask1); \ | 605 (vector signed short)vec_and(cmpHi##i, mask1); \ |
| 606 const vector signed short cmpLf##i = \ | 606 const vector signed short cmpLf##i = \ |
| 607 (vector signed short)vec_and(cmpLi##i, mask2); \ | 607 (vector signed short)vec_and(cmpLi##i, mask2); \ |
| 608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ | 608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ |
| 609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ | 609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ |
| 610 sum##i = vec_sums(sumq##i, zero); } while (0) | 610 sum##i = vec_sums(sumq##i, zero); } while (0) |
| 611 | 611 |
| 612 COMPARE(0); | 612 COMPARE(0); |
| 613 COMPARE(1); | 613 COMPARE(1); |
| 614 COMPARE(2); | 614 COMPARE(2); |
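
COMPARE turns each line of ten pixels into a ten-bit mask of which pixels exceed v_avg: vec_cmpgt yields all-ones lanes, ANDing with the mask1/mask2 weights (0x0001..0x0200) converts them into bit values, and vec_sum4s/vec_sums fold them into one integer. A scalar equivalent for a single line:

```c
#include <stdint.h>
#include <stdio.h>

/* Per-line mask: bit k set when pixel k is brighter than the average;
 * the weights 1<<k are exactly the mask1/mask2 constants above. */
static int compare_line(const uint8_t line[10], uint8_t avg) {
    int s = 0;
    for (int k = 0; k < 10; k++)
        if (line[k] > avg)      /* vec_cmpgt + vec_and(mask) + sums */
            s |= 1 << k;
    return s;
}

int main(void) {
    uint8_t line[10] = {90, 120, 121, 119, 90, 91, 130, 89, 90, 90};
    printf("S = 0x%03x\n", compare_line(line, 100));  /* bits 1,2,3,6 -> 0x04e */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|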
| 641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); | 641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); |
| 642 const vector signed int t2A = vec_or(sumA, tA); | 642 const vector signed int t2A = vec_or(sumA, tA); |
| 643 const vector signed int t2B = vec_or(sumB, tB); | 643 const vector signed int t2B = vec_or(sumB, tB); |
| 644 const vector signed int t2C = vec_or(sumC, tC); | 644 const vector signed int t2C = vec_or(sumC, tC); |
| 645 const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1), | 645 const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1), |
| 646 vec_sl(t2A, vuint32_1)); | 646 vec_sl(t2A, vuint32_1)); |
| 647 const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1), | 647 const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1), |
| 648 vec_sl(t2B, vuint32_1)); | 648 vec_sl(t2B, vuint32_1)); |
| 649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), | 649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), |
| 650 vec_sl(t2C, vuint32_1)); | 650 vec_sl(t2C, vuint32_1)); |
| 651 const vector signed int yA = vec_and(t2A, t3A); | 651 const vector signed int yA = vec_and(t2A, t3A); |
| 652 const vector signed int yB = vec_and(t2B, t3B); | 652 const vector signed int yB = vec_and(t2B, t3B); |
| 653 const vector signed int yC = vec_and(t2C, t3C); | 653 const vector signed int yC = vec_and(t2C, t3C); |
| 654 | 654 |
| 655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); | 655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); |
| 657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); | 657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); |
| 658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); | 658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); |
| 659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); | 659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); |
| 660 const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2); | 660 const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2); |
| 661 const vector signed int sumAp = vec_and(yA, | 661 const vector signed int sumAp = vec_and(yA, |
| 662 vec_and(sumAd4,sumAd8)); | 662 vec_and(sumAd4,sumAd8)); |
| 663 const vector signed int sumBp = vec_and(yB, | 663 const vector signed int sumBp = vec_and(yB, |
| 664 vec_and(sumBd4,sumBd8)); | 664 vec_and(sumBd4,sumBd8)); |
| 665 sumA2 = vec_or(sumAp, | 665 sumA2 = vec_or(sumAp, |
| 666 vec_sra(sumAp, | 666 vec_sra(sumAp, |
| 667 vuint32_16)); | 667 vuint32_16)); |
| 668 sumB2 = vec_or(sumBp, | 668 sumB2 = vec_or(sumBp, |
| 669 vec_sra(sumBp, | 669 vec_sra(sumBp, |
| 670 vuint32_16)); | 670 vuint32_16)); |
| 671 } | 671 } |
| 672 vec_st(sumA2, 0, S); | 672 vec_st(sumA2, 0, S); |
| 673 vec_st(sumB2, 16, S); | 673 vec_st(sumB2, 16, S); |
| 674 } | 674 } |
| 675 | 675 |
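
The t2/t3/y step above cleans those masks: t3 = (t2 >> 1) & (t2 << 1) and y = t2 & t3 keep a bit only when both horizontal neighbours are set, wiping out runs shorter than three, and the nor/shift-by-16 packing lets the same shifts process the "below average" complement in the upper half-word at once; the sumAd4/sumAd8 permutes then demand agreement with the vertically neighbouring lines as well. A scalar model of the horizontal part:

```c
#include <stdio.h>

/* Keep a mask bit only when both horizontal neighbours are set, so runs
 * shorter than three pixels vanish: y = t2 & (t2 >> 1) & (t2 << 1). */
static unsigned keep_runs_of_3(unsigned m) {
    return m & (m >> 1) & (m << 1);
}

int main(void) {
    unsigned m = 0x0F3;  /* 0b00_1111_0011: a 2-run and a 4-run */
    printf("0x%03x -> 0x%03x\n", m, keep_runs_of_3(m) & 0x3FF);  /* -> 0x060 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|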
| 684 const vector signed int vsint32_8 = vec_splat_s32(8); | 684 const vector signed int vsint32_8 = vec_splat_s32(8); |
| 685 const vector unsigned int vuint32_4 = vec_splat_u32(4); | 685 const vector unsigned int vuint32_4 = vec_splat_u32(4); |
| 686 | 686 |
| 687 const vector unsigned char permA1 = (vector unsigned char) | 687 const vector unsigned char permA1 = (vector unsigned char) |
| 688 AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F, | 688 AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F, |
| 689 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); | 689 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); |
| 690 const vector unsigned char permA2 = (vector unsigned char) | 690 const vector unsigned char permA2 = (vector unsigned char) |
| 691 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, | 691 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, |
| 692 0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); | 692 0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); |
| 693 const vector unsigned char permA1inc = (vector unsigned char) | 693 const vector unsigned char permA1inc = (vector unsigned char) |
| 694 AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, | 694 AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, |
| 695 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); | 695 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 696 const vector unsigned char permA2inc = (vector unsigned char) | 696 const vector unsigned char permA2inc = (vector unsigned char) |
| 697 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, | 697 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, |
| 698 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); | 698 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 699 const vector unsigned char magic = (vector unsigned char) | 699 const vector unsigned char magic = (vector unsigned char) |
| 700 AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02, | 700 AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02, |
| 701 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); | 701 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 702 const vector unsigned char extractPerm = (vector unsigned char) | 702 const vector unsigned char extractPerm = (vector unsigned char) |
| 703 AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01, | 703 AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01, |
| 704 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01); | 704 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01); |
| 705 const vector unsigned char extractPermInc = (vector unsigned char) | 705 const vector unsigned char extractPermInc = (vector unsigned char) |
| 706 AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, | 706 AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, |
| 707 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01); | 707 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01); |
| 708 const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0); | 708 const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0); |
| 709 const vector unsigned char tenRight = (vector unsigned char) | 709 const vector unsigned char tenRight = (vector unsigned char) |
| 710 AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | 710 AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 711 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); | 711 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
| 712 const vector unsigned char eightLeft = (vector unsigned char) | 712 const vector unsigned char eightLeft = (vector unsigned char) |
| 713 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, | 713 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
| 714 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08); | 714 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08); |
| 715 | 715 |
| 716 | 716 |
| 717 #define F_INIT(i) \ | 717 #define F_INIT(i) \ |
| 718 vector unsigned char tenRightM##i = tenRight; \ | 718 vector unsigned char tenRightM##i = tenRight; \ |
| 719 vector unsigned char permA1M##i = permA1; \ | 719 vector unsigned char permA1M##i = permA1; \ |
| 720 vector unsigned char permA2M##i = permA2; \ | 720 vector unsigned char permA2M##i = permA2; \ |
| 721 vector unsigned char extractPermM##i = extractPerm | 721 vector unsigned char extractPermM##i = extractPerm |
| 722 | 722 |
| 723 #define F2(i, j, k, l) \ | 723 #define F2(i, j, k, l) \ |
| 724 if (S[i] & (1 << (l+1))) { \ | 724 if (S[i] & (1 << (l+1))) { \ |
| 725 const vector unsigned char a_##j##_A##l = \ | 725 const vector unsigned char a_##j##_A##l = \ |
| 726 vec_perm(src##i, src##j, permA1M##i); \ | 726 vec_perm(src##i, src##j, permA1M##i); \ |
| 727 const vector unsigned char a_##j##_B##l = \ | 727 const vector unsigned char a_##j##_B##l = \ |
| 728 vec_perm(a_##j##_A##l, src##k, permA2M##i); \ | 728 vec_perm(a_##j##_A##l, src##k, permA2M##i); \ |
| 729 const vector signed int a_##j##_sump##l = \ | 729 const vector signed int a_##j##_sump##l = \ |
| 730 (vector signed int)vec_msum(a_##j##_B##l, magic, \ | 730 (vector signed int)vec_msum(a_##j##_B##l, magic, \ |
| 731 (vector unsigned int)zero); \ | 731 (vector unsigned int)zero); \ |
| 732 vector signed int F_##j##_##l = \ | 732 vector signed int F_##j##_##l = \ |
| 733 vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \ | 733 vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \ |
| 734 F_##j##_##l = vec_splat(F_##j##_##l, 3); \ | 734 F_##j##_##l = vec_splat(F_##j##_##l, 3); \ |
| 735 const vector signed int p_##j##_##l = \ | 735 const vector signed int p_##j##_##l = \ |
| 736 (vector signed int)vec_perm(src##j, \ | 736 (vector signed int)vec_perm(src##j, \ |
| 737 (vector unsigned char)zero, \ | 737 (vector unsigned char)zero, \ |
| 738 extractPermM##i); \ | 738 extractPermM##i); \ |
| 739 const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2); \ | 739 const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2);\ |
| 740 const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2); \ | 740 const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2);\ |
| 741 vector signed int newpm_##j##_##l; \ | 741 vector signed int newpm_##j##_##l; \ |
| 742 if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \ | 742 if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \ |
| 743 newpm_##j##_##l = sum_##j##_##l; \ | 743 newpm_##j##_##l = sum_##j##_##l; \ |
| 744 else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \ | 744 else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \ |
| 745 newpm_##j##_##l = diff_##j##_##l; \ | 745 newpm_##j##_##l = diff_##j##_##l; \ |
| 746 else newpm_##j##_##l = F_##j##_##l; \ | 746 else newpm_##j##_##l = F_##j##_##l; \ |
| 747 const vector unsigned char newpm2_##j##_##l = \ | 747 const vector unsigned char newpm2_##j##_##l = \ |
| 748 vec_splat((vector unsigned char)newpm_##j##_##l, 15); \ | 748 vec_splat((vector unsigned char)newpm_##j##_##l, 15); \ |
| 749 const vector unsigned char mask##j##l = vec_add(identity, \ | 749 const vector unsigned char mask##j##l = vec_add(identity, \ |
| 750 tenRightM##i); \ | 750 tenRightM##i); \ |
| 751 src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \ | 751 src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \ |
| 752 } \ | 752 } \ |
| 753 permA1M##i = vec_add(permA1M##i, permA1inc); \ | 753 permA1M##i = vec_add(permA1M##i, permA1inc); \ |
| 754 permA2M##i = vec_add(permA2M##i, permA2inc); \ | 754 permA2M##i = vec_add(permA2M##i, permA2inc); \ |
| 755 tenRightM##i = vec_sro(tenRightM##i, eightLeft); \ | 755 tenRightM##i = vec_sro(tenRightM##i, eightLeft); \ |
| 756 extractPermM##i = vec_add(extractPermM##i, extractPermInc) | 756 extractPermM##i = vec_add(extractPermM##i, extractPermInc) |
| 757 | 757 |
| 758 #define ITER(i, j, k) \ | 758 #define ITER(i, j, k) \ |
| 759 F_INIT(i); \ | 759 F_INIT(i); \ |
| 760 F2(i, j, k, 0); \ | 760 F2(i, j, k, 0); \ |
| 761 F2(i, j, k, 1); \ | 761 F2(i, j, k, 1); \ |
| 762 F2(i, j, k, 2); \ | 762 F2(i, j, k, 2); \ |
| 763 F2(i, j, k, 3); \ | 763 F2(i, j, k, 3); \ |
| 764 F2(i, j, k, 4); \ | 764 F2(i, j, k, 4); \ |
| 765 F2(i, j, k, 5); \ | 765 F2(i, j, k, 5); \ |
| 766 F2(i, j, k, 6); \ | 766 F2(i, j, k, 6); \ |
| 767 F2(i, j, k, 7) | 767 F2(i, j, k, 7) |
| 768 | 768 |
| 769 ITER(0, 1, 2); | 769 ITER(0, 1, 2); |
| 770 ITER(1, 2, 3); | 770 ITER(1, 2, 3); |
| 771 ITER(2, 3, 4); | 771 ITER(2, 3, 4); |
| 775 ITER(6, 7, 8); | 775 ITER(6, 7, 8); |
| 776 ITER(7, 8, 9); | 776 ITER(7, 8, 9); |
| 777 | 777 |
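
F2 replaces each flagged pixel with a 3x3 weighted average using the "magic" weights 1 2 1 / 2 4 2 / 1 2 1 (total 16, with vsint32_8 providing the +8 rounding before the shift by 4), then clamps the result to within vQP2 of the original pixel so deringing cannot manufacture a new edge. A scalar version for one pixel, with qp2 standing in for the quantizer-derived clamp radius:

```c
#include <stdint.h>
#include <stdio.h>

static uint8_t dering_pixel(uint8_t n[3][3], int qp2) {
    int F = (    n[0][0] + 2 * n[0][1] +     n[0][2]
             + 2 * n[1][0] + 4 * n[1][1] + 2 * n[1][2]
             +     n[2][0] + 2 * n[2][1] +     n[2][2] + 8) >> 4;
    int p = n[1][1];
    if (F > p + qp2) F = p + qp2;  /* the vec_all_lt(sum, F) branch  */
    if (F < p - qp2) F = p - qp2;  /* the vec_all_gt(diff, F) branch */
    return (uint8_t)F;
}

int main(void) {
    /* A lone ringing spike over a flat background. */
    uint8_t n[3][3] = {{100, 100, 100}, {100, 160, 100}, {100, 100, 100}};
    printf("%d\n", dering_pixel(n, 8));  /* average is 115, clamped to 152 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|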
| 778 const vector signed char neg1 = vec_splat_s8(-1); | 778 const vector signed char neg1 = vec_splat_s8(-1); |
| 779 | 779 |
| 780 #define STORE_LINE(i) \ | 780 #define STORE_LINE(i) \ |
| 781 const vector unsigned char permST##i = \ | 781 const vector unsigned char permST##i = \ |
| 782 vec_lvsr(i * stride, srcCopy); \ | 782 vec_lvsr(i * stride, srcCopy); \ |
| 783 const vector unsigned char maskST##i = \ | 783 const vector unsigned char maskST##i = \ |
| 784 vec_perm((vector unsigned char)zero, \ | 784 vec_perm((vector unsigned char)zero, \ |
| 785 (vector unsigned char)neg1, permST##i); \ | 785 (vector unsigned char)neg1, permST##i); \ |
| 786 src##i = vec_perm(src##i, src##i, permST##i); \ | 786 src##i = vec_perm(src##i, src##i, permST##i); \ |
| 787 sA##i = vec_sel(sA##i, src##i, maskST##i); \ | 787 sA##i = vec_sel(sA##i, src##i, maskST##i); \ |
| 788 sB##i = vec_sel(src##i, sB##i, maskST##i); \ | 788 sB##i = vec_sel(src##i, sB##i, maskST##i); \ |
| 789 vec_st(sA##i, i * stride, srcCopy); \ | 789 vec_st(sA##i, i * stride, srcCopy); \ |
| 790 vec_st(sB##i, i * stride + 16, srcCopy) | 790 vec_st(sB##i, i * stride + 16, srcCopy) |
| 791 | 791 |
| 792 STORE_LINE(1); | 792 STORE_LINE(1); |
| 793 STORE_LINE(2); | 793 STORE_LINE(2); |
| 794 STORE_LINE(3); | 794 STORE_LINE(3); |
| 806 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) | 806 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) |
| 807 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) | 807 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) |
| 808 #define do_a_deblock_altivec(a...) do_a_deblock_C(a) | 808 #define do_a_deblock_altivec(a...) do_a_deblock_C(a) |
| 809 | 809 |
| 810 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, | 810 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
| 811 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | 811 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) |
| 812 { | 812 { |
| 813 const vector signed int zero = vec_splat_s32(0); | 813 const vector signed int zero = vec_splat_s32(0); |
| 814 const vector signed short vsint16_1 = vec_splat_s16(1); | 814 const vector signed short vsint16_1 = vec_splat_s16(1); |
| 815 vector signed int v_dp = zero; | 815 vector signed int v_dp = zero; |
| 816 vector signed int v_sysdp = zero; | 816 vector signed int v_sysdp = zero; |
| 818 | 818 |
| 819 tempBluredPast[127]= maxNoise[0]; | 819 tempBluredPast[127]= maxNoise[0]; |
| 820 tempBluredPast[128]= maxNoise[1]; | 820 tempBluredPast[128]= maxNoise[1]; |
| 821 tempBluredPast[129]= maxNoise[2]; | 821 tempBluredPast[129]= maxNoise[2]; |
| 822 | 822 |
| 823 #define LOAD_LINE(src, i) \ | 823 #define LOAD_LINE(src, i) \ |
| 824 register int j##src##i = i * stride; \ | 824 register int j##src##i = i * stride; \ |
| 825 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \ | 825 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \ |
| 826 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \ | 826 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \ |
| 827 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \ | 827 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \ |
| 828 const vector unsigned char v_##src##A##i = \ | 828 const vector unsigned char v_##src##A##i = \ |
| 829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ | 829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ |
| 830 vector signed short v_##src##Ass##i = \ | 830 vector signed short v_##src##Ass##i = \ |
| 831 (vector signed short)vec_mergeh((vector signed char)zero, \ | 831 (vector signed short)vec_mergeh((vector signed char)zero, \ |
| 832 (vector signed char)v_##src##A##i) | 832 (vector signed char)v_##src##A##i) |
| 833 | 833 |
| 834 LOAD_LINE(src, 0); | 834 LOAD_LINE(src, 0); |
| 835 LOAD_LINE(src, 1); | 835 LOAD_LINE(src, 1); |
| 836 LOAD_LINE(src, 2); | 836 LOAD_LINE(src, 2); |
| 837 LOAD_LINE(src, 3); | 837 LOAD_LINE(src, 3); |
| 848 LOAD_LINE(tempBlured, 5); | 848 LOAD_LINE(tempBlured, 5); |
| 849 LOAD_LINE(tempBlured, 6); | 849 LOAD_LINE(tempBlured, 6); |
| 850 LOAD_LINE(tempBlured, 7); | 850 LOAD_LINE(tempBlured, 7); |
| 851 #undef LOAD_LINE | 851 #undef LOAD_LINE |
| 852 | 852 |
| 853 #define ACCUMULATE_DIFFS(i) \ | 853 #define ACCUMULATE_DIFFS(i) \ |
| 854 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \ | 854 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \ |
| 855 v_srcAss##i); \ | 855 v_srcAss##i); \ |
| 856 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \ | 856 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \ |
| 857 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp) | 857 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp) |
| 858 | 858 |
| 859 ACCUMULATE_DIFFS(0); | 859 ACCUMULATE_DIFFS(0); |
| 860 ACCUMULATE_DIFFS(1); | 860 ACCUMULATE_DIFFS(1); |
| 861 ACCUMULATE_DIFFS(2); | 861 ACCUMULATE_DIFFS(2); |
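
ACCUMULATE_DIFFS gathers two statistics between the incoming block and its temporally blurred history: v_dp collects the sum of squared differences (vec_msums of v_d with itself) and v_sysdp the plain signed sum; the elided lines turn these into the noise measure d tested against maxNoise[] below. A scalar equivalent:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int16_t srcpix[8][8], blurred[8][8];
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++) {
            srcpix[i][j]  = (int16_t)(100 + ((i + j) & 3));  /* toy frame */
            blurred[i][j] = 100;                             /* history   */
        }
    int dp = 0, sysdp = 0;
    for (int i = 0; i < 8; i++)
        for (int j = 0; j < 8; j++) {
            int d = blurred[i][j] - srcpix[i][j];
            dp    += d * d;  /* vec_msums(v_d, v_d, v_dp)          */
            sysdp += d;      /* vec_msums(v_d, vsint16_1, v_sysdp) */
        }
    printf("dp=%d sysdp=%d\n", dp, sysdp);  /* 224 -96 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|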
| 914 if (d < maxNoise[0]) { | 914 if (d < maxNoise[0]) { |
| 915 const vector signed short vsint16_7 = vec_splat_s16(7); | 915 const vector signed short vsint16_7 = vec_splat_s16(7); |
| 916 const vector signed short vsint16_4 = vec_splat_s16(4); | 916 const vector signed short vsint16_4 = vec_splat_s16(4); |
| 917 const vector unsigned short vuint16_3 = vec_splat_u16(3); | 917 const vector unsigned short vuint16_3 = vec_splat_u16(3); |
| 918 | 918 |
| 919 #define OP(i) \ | 919 #define OP(i) \ |
| 920 const vector signed short v_temp##i = \ | 920 const vector signed short v_temp##i = \ |
| 921 vec_mladd(v_tempBluredAss##i, \ | 921 vec_mladd(v_tempBluredAss##i, \ |
| 922 vsint16_7, v_srcAss##i); \ | 922 vsint16_7, v_srcAss##i); \ |
| 923 const vector signed short v_temp2##i = \ | 923 const vector signed short v_temp2##i = \ |
| 924 vec_add(v_temp##i, vsint16_4); \ | 924 vec_add(v_temp##i, vsint16_4); \ |
| 925 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3) | 925 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3) |
| 926 | 926 |
| 927 OP(0); | 927 OP(0); |
| 928 OP(1); | 928 OP(1); |
| 929 OP(2); | 929 OP(2); |
| 935 #undef OP | 935 #undef OP |
| 936 } else { | 936 } else { |
| 937 const vector signed short vsint16_3 = vec_splat_s16(3); | 937 const vector signed short vsint16_3 = vec_splat_s16(3); |
| 938 const vector signed short vsint16_2 = vec_splat_s16(2); | 938 const vector signed short vsint16_2 = vec_splat_s16(2); |
| 939 | 939 |
| 940 #define OP(i) \ | 940 #define OP(i) \ |
| 941 const vector signed short v_temp##i = \ | 941 const vector signed short v_temp##i = \ |
| 942 vec_mladd(v_tempBluredAss##i, \ | 942 vec_mladd(v_tempBluredAss##i, \ |
| 943 vsint16_3, v_srcAss##i); \ | 943 vsint16_3, v_srcAss##i); \ |
| 944 const vector signed short v_temp2##i = \ | 944 const vector signed short v_temp2##i = \ |
| 945 vec_add(v_temp##i, vsint16_2); \ | 945 vec_add(v_temp##i, vsint16_2); \ |
| 946 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2) | 946 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2) |
| 947 | 947 |
| 948 OP(0); | 948 OP(0); |
| 949 OP(1); | 949 OP(1); |
| 950 OP(2); | 950 OP(2); |
| 957 } | 957 } |
| 958 } | 958 } |
| 959 | 959 |
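
The two OP variants above are the same multiply-add-round-shift blend with different weights: (7*blurred + src + 4) >> 3 when the block is judged static (d below maxNoise[0]) and (3*blurred + src + 2) >> 2 otherwise, so moving content adapts faster. A scalar sketch:

```c
#include <stdint.h>
#include <stdio.h>

/* (w*history + src + round) >> shift, the vec_mladd/vec_add/vec_sr chain. */
static uint8_t blend(uint8_t history, uint8_t src, int w, int shift) {
    return (uint8_t)((history * w + src + (1 << (shift - 1))) >> shift);
}

int main(void) {
    uint8_t history = 100, src = 140;
    printf("static: %d  moving: %d\n",
           blend(history, src, 7, 3),   /* (7*h + s + 4) >> 3 -> 105 */
           blend(history, src, 3, 2));  /* (3*h + s + 2) >> 2 -> 110 */
    return 0;
}
```

| 2978:403183bbb505 | 2979:bfabfdf9ce55 |
|---|---|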
| 960 const vector signed char neg1 = vec_splat_s8(-1); | 960 const vector signed char neg1 = vec_splat_s8(-1); |
| 961 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 961 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 962 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 962 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 963 | 963 |
| 964 #define PACK_AND_STORE(src, i) \ | 964 #define PACK_AND_STORE(src, i) \ |
| 965 const vector unsigned char perms##src##i = \ | 965 const vector unsigned char perms##src##i = \ |
| 966 vec_lvsr(i * stride, src); \ | 966 vec_lvsr(i * stride, src); \ |
| 967 const vector unsigned char vf##src##i = \ | 967 const vector unsigned char vf##src##i = \ |
| 968 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \ | 968 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \ |
| 969 const vector unsigned char vg##src##i = \ | 969 const vector unsigned char vg##src##i = \ |
| 970 vec_perm(vf##src##i, v_##src##A##i, permHH); \ | 970 vec_perm(vf##src##i, v_##src##A##i, permHH); \ |
| 971 const vector unsigned char mask##src##i = \ | 971 const vector unsigned char mask##src##i = \ |
| 972 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \ | 972 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \ |
| 973 const vector unsigned char vg2##src##i = \ | 973 const vector unsigned char vg2##src##i = \ |
| 974 vec_perm(vg##src##i, vg##src##i, perms##src##i); \ | 974 vec_perm(vg##src##i, vg##src##i, perms##src##i); \ |
| 975 const vector unsigned char svA##src##i = \ | 975 const vector unsigned char svA##src##i = \ |
| 976 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \ | 976 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \ |
| 977 const vector unsigned char svB##src##i = \ | 977 const vector unsigned char svB##src##i = \ |
| 978 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \ | 978 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \ |
| 979 vec_st(svA##src##i, i * stride, src); \ | 979 vec_st(svA##src##i, i * stride, src); \ |
| 980 vec_st(svB##src##i, i * stride + 16, src) | 980 vec_st(svB##src##i, i * stride + 16, src) |
| 981 | 981 |
| 982 PACK_AND_STORE(src, 0); | 982 PACK_AND_STORE(src, 0); |
| 983 PACK_AND_STORE(src, 1); | 983 PACK_AND_STORE(src, 1); |
| 984 PACK_AND_STORE(src, 2); | 984 PACK_AND_STORE(src, 2); |
| 999 } | 999 } |
| 1000 | 1000 |
| 1001 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { | 1001 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { |
| 1002 const vector unsigned char zero = vec_splat_u8(0); | 1002 const vector unsigned char zero = vec_splat_u8(0); |
| 1003 | 1003 |
| 1004 #define LOAD_DOUBLE_LINE(i, j) \ | 1004 #define LOAD_DOUBLE_LINE(i, j) \ |
| 1005 vector unsigned char perm1##i = vec_lvsl(i * stride, src); \ | 1005 vector unsigned char perm1##i = vec_lvsl(i * stride, src); \ |
| 1006 vector unsigned char perm2##i = vec_lvsl(j * stride, src); \ | 1006 vector unsigned char perm2##i = vec_lvsl(j * stride, src); \ |
| 1007 vector unsigned char srcA##i = vec_ld(i * stride, src); \ | 1007 vector unsigned char srcA##i = vec_ld(i * stride, src); \ |
| 1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ | 1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ |
| 1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ | 1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ |
| 1010 vector unsigned char srcD##i = vec_ld(j * stride + 16, src); \ | 1010 vector unsigned char srcD##i = vec_ld(j * stride + 16, src); \ |
| 1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ | 1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ |
| 1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) | 1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) |
| 1013 | 1013 |
| 1014 LOAD_DOUBLE_LINE(0, 1); | 1014 LOAD_DOUBLE_LINE(0, 1); |
| 1015 LOAD_DOUBLE_LINE(2, 3); | 1015 LOAD_DOUBLE_LINE(2, 3); |
| 1016 LOAD_DOUBLE_LINE(4, 5); | 1016 LOAD_DOUBLE_LINE(4, 5); |
| 1105 | 1105 |
| 1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { | 1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { |
| 1107 const vector unsigned char zero = vec_splat_u8(0); | 1107 const vector unsigned char zero = vec_splat_u8(0); |
| 1108 const vector unsigned char magic_perm = (const vector unsigned char) | 1108 const vector unsigned char magic_perm = (const vector unsigned char) |
| 1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | 1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, |
| 1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | 1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); |
| 1111 | 1111 |
| 1112 #define LOAD_DOUBLE_LINE(i, j) \ | 1112 #define LOAD_DOUBLE_LINE(i, j) \ |
| 1113 vector unsigned char src##i = vec_ld(i * 16, src); \ | 1113 vector unsigned char src##i = vec_ld(i * 16, src); \ |
| 1114 vector unsigned char src##j = vec_ld(j * 16, src) | 1114 vector unsigned char src##j = vec_ld(j * 16, src) |
| 1115 | 1115 |
| 1116 LOAD_DOUBLE_LINE(0, 1); | 1116 LOAD_DOUBLE_LINE(0, 1); |
| 1117 LOAD_DOUBLE_LINE(2, 3); | 1117 LOAD_DOUBLE_LINE(2, 3); |
| 1118 LOAD_DOUBLE_LINE(4, 5); | 1118 LOAD_DOUBLE_LINE(4, 5); |
| 1167 temp6 = vec_mergeh(tempD, tempL); | 1167 temp6 = vec_mergeh(tempD, tempL); |
| 1168 temp7 = vec_mergel(tempD, tempL); | 1168 temp7 = vec_mergel(tempD, tempL); |
| 1169 | 1169 |
| 1170 | 1170 |
| 1171 const vector signed char neg1 = vec_splat_s8(-1); | 1171 const vector signed char neg1 = vec_splat_s8(-1); |
| 1172 #define STORE_DOUBLE_LINE(i, j) \ | 1172 #define STORE_DOUBLE_LINE(i, j) \ |
| 1173 vector unsigned char dstA##i = vec_ld(i * stride, dst); \ | 1173 vector unsigned char dstA##i = vec_ld(i * stride, dst); \ |
| 1174 vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \ | 1174 vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \ |
| 1175 vector unsigned char dstA##j = vec_ld(j * stride, dst); \ | 1175 vector unsigned char dstA##j = vec_ld(j * stride, dst); \ |
| 1176 vector unsigned char dstB##j = vec_ld(j * stride + 16, dst); \ | 1176 vector unsigned char dstB##j = vec_ld(j * stride + 16, dst); \ |
| 1177 vector unsigned char align##i = vec_lvsr(i * stride, dst); \ | 1177 vector unsigned char align##i = vec_lvsr(i * stride, dst); \ |
| 1178 vector unsigned char align##j = vec_lvsr(j * stride, dst); \ | 1178 vector unsigned char align##j = vec_lvsr(j * stride, dst); \ |
| 1179 vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \ | 1179 vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \ |
| 1180 vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \ | 1180 vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \ |
| 1181 vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \ | 1181 vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \ |
| 1182 vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \ | 1182 vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \ |
| 1183 vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \ | 1183 vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \ |
| 1184 vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \ | 1184 vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \ |
| 1185 vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \ | 1185 vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \ |
| 1186 vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \ | 1186 vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \ |
| 1187 vec_st(dstAF##i, i * stride, dst); \ | 1187 vec_st(dstAF##i, i * stride, dst); \ |
| 1188 vec_st(dstBF##i, i * stride + 16, dst); \ | 1188 vec_st(dstBF##i, i * stride + 16, dst); \ |
| 1189 vec_st(dstAF##j, j * stride, dst); \ | 1189 vec_st(dstAF##j, j * stride, dst); \ |
| 1190 vec_st(dstBF##j, j * stride + 16, dst) | 1190 vec_st(dstBF##j, j * stride + 16, dst) |
| 1191 | 1191 |
| 1192 STORE_DOUBLE_LINE(0,1); | 1192 STORE_DOUBLE_LINE(0,1); |
| 1193 STORE_DOUBLE_LINE(2,3); | 1193 STORE_DOUBLE_LINE(2,3); |
| 1194 STORE_DOUBLE_LINE(4,5); | 1194 STORE_DOUBLE_LINE(4,5); |
