Mercurial > libavcodec.hg
annotate libpostproc/postprocess_altivec_template.c @ 2497:69adfbbdcdeb libavcodec
- samples from mplayer ftp in the "adv" profile seem to have profile=2,
which isn't the advanced one; and indeed, using adv. profile parser fails.
Using normal parser works, and that's what is done
- attempt at taking care of stride for NORM2 bitplane decoding
- duplication of much code from msmpeg4.c; this code isn't yet used, but
goes down as far as the block layer (mainly Transform Type stuff, the
remains are wild editing without checking). Unusable yet, and lacks the AC
decoding (but a step further in bitstream parsing)
patch by anonymous
| author | michael |
|---|---|
| date | Fri, 04 Feb 2005 02:20:38 +0000 |
| parents | 703b80c99891 |
| children | ef2149182f1c |
| rev | line source |
|---|---|
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
1 /* |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
2 AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org> |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
3 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
4 based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
5 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
6 This program is free software; you can redistribute it and/or modify |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
7 it under the terms of the GNU General Public License as published by |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
8 the Free Software Foundation; either version 2 of the License, or |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
9 (at your option) any later version. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
10 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
11 This program is distributed in the hope that it will be useful, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
12 but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
14 GNU General Public License for more details. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
15 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
16 You should have received a copy of the GNU General Public License |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
17 along with this program; if not, write to the Free Software |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
18 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
19 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
20 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
21 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
/*
 * AVV(...) wraps the element list of a vector literal.
 * When CONFIG_DARWIN is defined the list is emitted in parentheses,
 * otherwise it is emitted in braces.
 */
#if defined(CONFIG_DARWIN)
#  define AVV(...) (__VA_ARGS__)
#else
#  define AVV(...) {__VA_ARGS__}
#endif
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
27 |
/*
 * In-place transpose of an 8x8 matrix of 16-bit elements held in the eight
 * vectors src_a .. src_h (one row per vector).
 *
 * Three rounds of vec_mergeh/vec_mergel are applied:
 *   round 1 interleaves row k with row k+4            (temp?1),
 *   round 2 interleaves the round-1 results pairwise  (temp?2),
 *   round 3 interleaves the round-2 results and writes them back to
 *           src_a .. src_h.
 *
 * Every argument is both read and assigned, so each must be a modifiable
 * lvalue; all temporaries take the type of src_a.
 */
#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
  do {                                                                  \
    __typeof__(src_a) tempA1 = vec_mergeh (src_a, src_e);               \
    __typeof__(src_a) tempB1 = vec_mergel (src_a, src_e);               \
    __typeof__(src_a) tempC1 = vec_mergeh (src_b, src_f);               \
    __typeof__(src_a) tempD1 = vec_mergel (src_b, src_f);               \
    __typeof__(src_a) tempE1 = vec_mergeh (src_c, src_g);               \
    __typeof__(src_a) tempF1 = vec_mergel (src_c, src_g);               \
    __typeof__(src_a) tempG1 = vec_mergeh (src_d, src_h);               \
    __typeof__(src_a) tempH1 = vec_mergel (src_d, src_h);               \
    __typeof__(src_a) tempA2 = vec_mergeh (tempA1, tempE1);             \
    __typeof__(src_a) tempB2 = vec_mergel (tempA1, tempE1);             \
    __typeof__(src_a) tempC2 = vec_mergeh (tempB1, tempF1);             \
    __typeof__(src_a) tempD2 = vec_mergel (tempB1, tempF1);             \
    __typeof__(src_a) tempE2 = vec_mergeh (tempC1, tempG1);             \
    __typeof__(src_a) tempF2 = vec_mergel (tempC1, tempG1);             \
    __typeof__(src_a) tempG2 = vec_mergeh (tempD1, tempH1);             \
    __typeof__(src_a) tempH2 = vec_mergel (tempD1, tempH1);             \
    src_a = vec_mergeh (tempA2, tempE2);                                \
    src_b = vec_mergel (tempA2, tempE2);                                \
    src_c = vec_mergeh (tempB2, tempF2);                                \
    src_d = vec_mergel (tempB2, tempF2);                                \
    src_e = vec_mergeh (tempC2, tempG2);                                \
    src_f = vec_mergel (tempC2, tempG2);                                \
    src_g = vec_mergeh (tempD2, tempH2);                                \
    src_h = vec_mergel (tempD2, tempH2);                                \
  } while (0)
| 59 | |
| 60 | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
61 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
62 /* |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
63 this code makes no assumption on src or stride. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
64 One could remove the recomputation of the perm |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
65 vector by assuming (stride % 16) == 0, unfortunately |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
66 this is not always true. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
67 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
68 register int y; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
69 short __attribute__ ((aligned(16))) data[8]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
70 int numEq; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
71 uint8_t *src2 = src; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
72 vector signed short v_dcOffset; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
73 vector signed short v2QP; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
74 vector unsigned short v4QP; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
75 vector unsigned short v_dcThreshold; |
| 2043 | 76 const int properStride = (stride % 16); |
| 77 const int srcAlign = ((unsigned long)src2 % 16); | |
| 78 const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0; | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
79 const vector signed int zero = vec_splat_s32(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
80 const vector signed short mask = vec_splat_s16(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
81 vector signed int v_numEq = vec_splat_s32(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
82 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
83 data[0] = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
84 data[1] = data[0] * 2 + 1; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
85 data[2] = c->QP * 2; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
86 data[3] = c->QP * 4; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
87 vector signed short v_data = vec_ld(0, data); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
88 v_dcOffset = vec_splat(v_data, 0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
89 v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
90 v2QP = vec_splat(v_data, 2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
91 v4QP = (vector unsigned short)vec_splat(v_data, 3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
92 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
93 src2 += stride * 4; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
94 |
| 2043 | 95 vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3, v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7; |
| 96 | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
97 #define LOAD_LINE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
98 register int j##i = i * stride; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
99 vector unsigned char perm##i = vec_lvsl(j##i, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
100 const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
101 vector unsigned char v_srcA2##i; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
102 if (two_vectors) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
103 v_srcA2##i = vec_ld(j##i + 16, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
104 const vector unsigned char v_srcA##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
105 vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \ |
| 2043 | 106 v_srcAss##i = \ |
| 107 (vector signed short)vec_mergeh((vector signed char)zero, \ | |
| 108 (vector signed char)v_srcA##i) | |
| 109 | |
| 110 #define LOAD_LINE_ALIGNED(i) \ | |
| 111 register int j##i = i * stride; \ | |
| 112 const vector unsigned char v_srcA##i = vec_ld(j##i, src2); \ | |
| 113 v_srcAss##i = \ | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
114 (vector signed short)vec_mergeh((vector signed char)zero, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
115 (vector signed char)v_srcA##i) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
116 |
| 2043 | 117 // special casing the aligned case is worthwhile, as all call from |
| 118 // the (transposed) horizontable deblocks will be aligned, i naddition | |
| 119 // to the naturraly aligned vertical deblocks. | |
| 120 if (properStride && srcAlign) { | |
| 121 LOAD_LINE_ALIGNED(0); | |
| 122 LOAD_LINE_ALIGNED(1); | |
| 123 LOAD_LINE_ALIGNED(2); | |
| 124 LOAD_LINE_ALIGNED(3); | |
| 125 LOAD_LINE_ALIGNED(4); | |
| 126 LOAD_LINE_ALIGNED(5); | |
| 127 LOAD_LINE_ALIGNED(6); | |
| 128 LOAD_LINE_ALIGNED(7); | |
| 129 } else { | |
| 130 LOAD_LINE(0); | |
| 131 LOAD_LINE(1); | |
| 132 LOAD_LINE(2); | |
| 133 LOAD_LINE(3); | |
| 134 LOAD_LINE(4); | |
| 135 LOAD_LINE(5); | |
| 136 LOAD_LINE(6); | |
| 137 LOAD_LINE(7); | |
| 138 } | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
139 #undef LOAD_LINE |
| 2043 | 140 #undef LOAD_LINE_ALIGNED |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
141 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
142 #define ITER(i, j) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
143 const vector signed short v_diff##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
144 vec_sub(v_srcAss##i, v_srcAss##j); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
145 const vector signed short v_sum##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
146 vec_add(v_diff##i, v_dcOffset); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
147 const vector signed short v_comp##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
148 (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
149 v_dcThreshold); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
150 const vector signed short v_part##i = vec_and(mask, v_comp##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
151 v_numEq = vec_sum4s(v_part##i, v_numEq); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
152 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
153 ITER(0, 1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
154 ITER(1, 2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
155 ITER(2, 3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
156 ITER(3, 4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
157 ITER(4, 5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
158 ITER(5, 6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
159 ITER(6, 7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
160 #undef ITER |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
161 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
162 v_numEq = vec_sums(v_numEq, zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
163 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
164 v_numEq = vec_splat(v_numEq, 3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
165 vec_ste(v_numEq, 0, &numEq); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
166 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
167 if (numEq > c->ppMode.flatnessThreshold) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
168 { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
169 const vector unsigned char mmoP1 = (const vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
170 AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
171 0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
172 const vector unsigned char mmoP2 = (const vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
173 AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
174 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
175 const vector unsigned char mmoP = (const vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
176 vec_lvsl(8, (unsigned char*)0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
177 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
178 vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
179 vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
180 vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
181 vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
182 vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
183 vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
184 vector signed short mmoDiff = vec_sub(mmoL, mmoR); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
185 vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
186 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
187 if (vec_any_gt(mmoSum, v4QP)) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
188 return 0; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
189 else |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
190 return 1; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
191 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
192 else return 2; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
193 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
194 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
195 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
196 /* |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
197 this code makes no assumption on src or stride. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
198 One could remove the recomputation of the perm |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
199 vector by assuming (stride % 16) == 0, unfortunately |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
200 this is not always true. Quite a lot of load/stores |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
201 can be removed by assuming proper alignement of |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
202 src & stride :-( |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
203 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
204 uint8_t *src2 = src; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
205 const vector signed int zero = vec_splat_s32(0); |
| 2043 | 206 const int properStride = (stride % 16); |
| 207 const int srcAlign = ((unsigned long)src2 % 16); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
208 short __attribute__ ((aligned(16))) qp[8]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
209 qp[0] = c->QP; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
210 vector signed short vqp = vec_ld(0, qp); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
211 vqp = vec_splat(vqp, 0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
212 |
| 2043 | 213 src2 += stride*3; |
| 214 | |
| 215 vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9; | |
| 216 vector unsigned char vbA0, vbA1, vbA2, vbA3, vbA4, vbA5, vbA6, vbA7, vbA8, vbA9; | |
| 217 vector unsigned char vbB0, vbB1, vbB2, vbB3, vbB4, vbB5, vbB6, vbB7, vbB8, vbB9; | |
| 218 vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9; | |
| 219 | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
220 #define LOAD_LINE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
221 const vector unsigned char perml##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
222 vec_lvsl(i * stride, src2); \ |
| 2043 | 223 vbA##i = vec_ld(i * stride, src2); \ |
| 224 vbB##i = vec_ld(i * stride + 16, src2); \ | |
| 225 vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \ | |
| 226 vb##i = \ | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
227 (vector signed short)vec_mergeh((vector unsigned char)zero, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
228 (vector unsigned char)vbT##i) |
| 2043 | 229 |
| 230 #define LOAD_LINE_ALIGNED(i) \ | |
| 231 register int j##i = i * stride; \ | |
| 232 vbT##i = vec_ld(j##i, src2); \ | |
| 233 vb##i = \ | |
| 234 (vector signed short)vec_mergeh((vector signed char)zero, \ | |
| 235 (vector signed char)vbT##i) | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
236 |
| 2043 | 237 // special casing the aligned case is worthwhile, as all call from |
| 238 // the (transposed) horizontable deblocks will be aligned, in addition | |
| 239 // to the naturraly aligned vertical deblocks. | |
| 240 if (properStride && srcAlign) { | |
| 241 LOAD_LINE_ALIGNED(0); | |
| 242 LOAD_LINE_ALIGNED(1); | |
| 243 LOAD_LINE_ALIGNED(2); | |
| 244 LOAD_LINE_ALIGNED(3); | |
| 245 LOAD_LINE_ALIGNED(4); | |
| 246 LOAD_LINE_ALIGNED(5); | |
| 247 LOAD_LINE_ALIGNED(6); | |
| 248 LOAD_LINE_ALIGNED(7); | |
| 249 LOAD_LINE_ALIGNED(8); | |
| 250 LOAD_LINE_ALIGNED(9); | |
| 251 } else { | |
| 252 LOAD_LINE(0); | |
| 253 LOAD_LINE(1); | |
| 254 LOAD_LINE(2); | |
| 255 LOAD_LINE(3); | |
| 256 LOAD_LINE(4); | |
| 257 LOAD_LINE(5); | |
| 258 LOAD_LINE(6); | |
| 259 LOAD_LINE(7); | |
| 260 LOAD_LINE(8); | |
| 261 LOAD_LINE(9); | |
| 262 } | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
263 #undef LOAD_LINE |
| 2043 | 264 #undef LOAD_LINE_ALIGNED |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
265 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
266 const vector unsigned short v_1 = vec_splat_u16(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
267 const vector unsigned short v_2 = vec_splat_u16(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
268 const vector unsigned short v_4 = vec_splat_u16(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
269 |
| 2041 | 270 const vector signed short v_diff01 = vec_sub(vb0, vb1); |
| 271 const vector unsigned short v_cmp01 = | |
| 272 (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp); | |
| 273 const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01); | |
| 274 const vector signed short v_diff89 = vec_sub(vb8, vb9); | |
| 275 const vector unsigned short v_cmp89 = | |
| 276 (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp); | |
| 277 const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89); | |
| 278 | |
| 279 const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1); | |
| 280 const vector signed short temp02 = vec_add(vb2, vb3); | |
| 281 const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4); | |
| 282 const vector signed short v_sumsB0 = vec_add(temp02, temp03); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
283 |
| 2041 | 284 const vector signed short temp11 = vec_sub(v_sumsB0, v_first); |
| 285 const vector signed short v_sumsB1 = vec_add(temp11, vb4); | |
| 286 | |
| 287 const vector signed short temp21 = vec_sub(v_sumsB1, v_first); | |
| 288 const vector signed short v_sumsB2 = vec_add(temp21, vb5); | |
| 289 | |
| 290 const vector signed short temp31 = vec_sub(v_sumsB2, v_first); | |
| 291 const vector signed short v_sumsB3 = vec_add(temp31, vb6); | |
| 292 | |
| 293 const vector signed short temp41 = vec_sub(v_sumsB3, v_first); | |
| 294 const vector signed short v_sumsB4 = vec_add(temp41, vb7); | |
| 295 | |
| 296 const vector signed short temp51 = vec_sub(v_sumsB4, vb1); | |
| 297 const vector signed short v_sumsB5 = vec_add(temp51, vb8); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
298 |
| 2041 | 299 const vector signed short temp61 = vec_sub(v_sumsB5, vb2); |
| 300 const vector signed short v_sumsB6 = vec_add(temp61, v_last); | |
| 301 | |
| 302 const vector signed short temp71 = vec_sub(v_sumsB6, vb3); | |
| 303 const vector signed short v_sumsB7 = vec_add(temp71, v_last); | |
| 304 | |
| 305 const vector signed short temp81 = vec_sub(v_sumsB7, vb4); | |
| 306 const vector signed short v_sumsB8 = vec_add(temp81, v_last); | |
| 307 | |
| 308 const vector signed short temp91 = vec_sub(v_sumsB8, vb5); | |
| 309 const vector signed short v_sumsB9 = vec_add(temp91, v_last); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
310 |
| 2041 | 311 #define COMPUTE_VR(i, j, k) \ |
| 312 const vector signed short temps1##i = \ | |
| 313 vec_add(v_sumsB##i, v_sumsB##k); \ | |
| 314 const vector signed short temps2##i = \ | |
| 315 vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \ | |
| 316 const vector signed short vr##j = vec_sra(temps2##i, v_4) | |
| 317 | |
| 318 COMPUTE_VR(0, 1, 2); | |
| 319 COMPUTE_VR(1, 2, 3); | |
| 320 COMPUTE_VR(2, 3, 4); | |
| 321 COMPUTE_VR(3, 4, 5); | |
| 322 COMPUTE_VR(4, 5, 6); | |
| 323 COMPUTE_VR(5, 6, 7); | |
| 324 COMPUTE_VR(6, 7, 8); | |
| 325 COMPUTE_VR(7, 8, 9); | |
| 326 | |
| 327 const vector signed char neg1 = vec_splat_s8(-1); | |
| 328 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 329 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
330 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
331 #define PACK_AND_STORE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
332 const vector unsigned char perms##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
333 vec_lvsr(i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
334 const vector unsigned char vf##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
335 vec_packsu(vr##i, (vector signed short)zero); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
336 const vector unsigned char vg##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
337 vec_perm(vf##i, vbT##i, permHH); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
338 const vector unsigned char mask##i = \ |
| 2041 | 339 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
340 const vector unsigned char vg2##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
341 vec_perm(vg##i, vg##i, perms##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
342 const vector unsigned char svA##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
343 vec_sel(vbA##i, vg2##i, mask##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
344 const vector unsigned char svB##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
345 vec_sel(vg2##i, vbB##i, mask##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
346 vec_st(svA##i, i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
347 vec_st(svB##i, i * stride + 16, src2) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
348 |
| 2043 | 349 #define PACK_AND_STORE_ALIGNED(i) \ |
| 350 const vector unsigned char vf##i = \ | |
| 351 vec_packsu(vr##i, (vector signed short)zero); \ | |
| 352 const vector unsigned char vg##i = \ | |
| 353 vec_perm(vf##i, vbT##i, permHH); \ | |
| 354 vec_st(vg##i, i * stride, src2) | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
355 |
| 2043 | 356 // special casing the aligned case is worthwhile, as all calls from |
| 357 // the (transposed) horizontal deblocks will be aligned, in addition | |
| 358 // to the naturally aligned vertical deblocks. | |
| 359 if (properStride && srcAlign) { | |
| 360 PACK_AND_STORE_ALIGNED(1); | |
| 361 PACK_AND_STORE_ALIGNED(2); | |
| 362 PACK_AND_STORE_ALIGNED(3); | |
| 363 PACK_AND_STORE_ALIGNED(4); | |
| 364 PACK_AND_STORE_ALIGNED(5); | |
| 365 PACK_AND_STORE_ALIGNED(6); | |
| 366 PACK_AND_STORE_ALIGNED(7); | |
| 367 PACK_AND_STORE_ALIGNED(8); | |
| 368 } else { | |
| 369 PACK_AND_STORE(1); | |
| 370 PACK_AND_STORE(2); | |
| 371 PACK_AND_STORE(3); | |
| 372 PACK_AND_STORE(4); | |
| 373 PACK_AND_STORE(5); | |
| 374 PACK_AND_STORE(6); | |
| 375 PACK_AND_STORE(7); | |
| 376 PACK_AND_STORE(8); | |
| 377 } | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
378 #undef PACK_AND_STORE |
| 2043 | 379 #undef PACK_AND_STORE_ALIGNED |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
380 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
381 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
382 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
383 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
384 static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
385 /* |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
386 this code makes no assumption on src or stride. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
387 One could remove the recomputation of the perm |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
388 vector by assuming (stride % 16) == 0, unfortunately |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
389 this is not always true. Quite a lot of load/stores |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
390 can be removed by assuming proper alignement of |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
391 src & stride :-( |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
392 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
393 uint8_t *src2 = src; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
394 const vector signed int zero = vec_splat_s32(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
395 short __attribute__ ((aligned(16))) qp[8]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
396 qp[0] = 8*c->QP; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
397 vector signed short vqp = vec_ld(0, qp); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
398 vqp = vec_splat(vqp, 0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
399 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
400 #define LOAD_LINE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
401 const vector unsigned char perm##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
402 vec_lvsl(i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
403 const vector unsigned char vbA##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
404 vec_ld(i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
405 const vector unsigned char vbB##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
406 vec_ld(i * stride + 16, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
407 const vector unsigned char vbT##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
408 vec_perm(vbA##i, vbB##i, perm##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
409 const vector signed short vb##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
410 (vector signed short)vec_mergeh((vector unsigned char)zero, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
411 (vector unsigned char)vbT##i) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
412 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
413 src2 += stride*3; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
414 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
415 LOAD_LINE(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
416 LOAD_LINE(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
417 LOAD_LINE(3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
418 LOAD_LINE(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
419 LOAD_LINE(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
420 LOAD_LINE(6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
421 LOAD_LINE(7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
422 LOAD_LINE(8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
423 #undef LOAD_LINE |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
424 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
425 const vector signed short v_1 = vec_splat_s16(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
426 const vector signed short v_2 = vec_splat_s16(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
427 const vector signed short v_5 = vec_splat_s16(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
428 const vector signed short v_32 = vec_sl(v_1, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
429 (vector unsigned short)v_5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
430 /* middle energy */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
431 const vector signed short l3minusl6 = vec_sub(vb3, vb6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
432 const vector signed short l5minusl4 = vec_sub(vb5, vb4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
433 const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
434 const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
435 const vector signed short absmE = vec_abs(mE); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
436 /* left & right energy */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
437 const vector signed short l1minusl4 = vec_sub(vb1, vb4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
438 const vector signed short l3minusl2 = vec_sub(vb3, vb2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
439 const vector signed short l5minusl8 = vec_sub(vb5, vb8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
440 const vector signed short l7minusl6 = vec_sub(vb7, vb6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
441 const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
442 const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
443 const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
444 const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
445 /* d */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
446 const vector signed short ddiff = vec_sub(absmE, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
447 vec_min(vec_abs(lE), |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
448 vec_abs(rE))); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
449 const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
450 const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
451 const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
452 const vector signed short minusd = vec_sub((vector signed short)zero, d); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
453 const vector signed short finald = vec_sel(minusd, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
454 d, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
455 vec_cmpgt(vec_sub((vector signed short)zero, mE), |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
456 (vector signed short)zero)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
457 /* q */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
458 const vector signed short qtimes2 = vec_sub(vb4, vb5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
459 /* for a shift right to behave like /2, we need to add one |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
460 to all negative integer */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
461 const vector signed short rounddown = vec_sel((vector signed short)zero, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
462 v_1, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
463 vec_cmplt(qtimes2, (vector signed short)zero)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
464 const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
465 /* clamp */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
466 const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
467 const vector signed short dclamp_P = vec_min(dclamp_P1, q); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
468 const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
469 const vector signed short dclamp_N = vec_max(dclamp_N1, q); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
470 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
471 const vector signed short dclampedfinal = vec_sel(dclamp_N, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
472 dclamp_P, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
473 vec_cmpgt(q, (vector signed short)zero)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
474 const vector signed short dornotd = vec_sel((vector signed short)zero, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
475 dclampedfinal, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
476 vec_cmplt(absmE, vqp)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
477 /* add/substract to l4 and l5 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
478 const vector signed short vb4minusd = vec_sub(vb4, dornotd); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
479 const vector signed short vb5plusd = vec_add(vb5, dornotd); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
480 /* finally, stores */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
481 const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
482 const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero); |
| 2041 | 483 |
| 484 const vector signed char neg1 = vec_splat_s8(-1); | |
| 485 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 486 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
487 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
488 #define STORE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
489 const vector unsigned char perms##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
490 vec_lvsr(i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
491 const vector unsigned char vg##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
492 vec_perm(st##i, vbT##i, permHH); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
493 const vector unsigned char mask##i = \ |
| 2041 | 494 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \ |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
495 const vector unsigned char vg2##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
496 vec_perm(vg##i, vg##i, perms##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
497 const vector unsigned char svA##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
498 vec_sel(vbA##i, vg2##i, mask##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
499 const vector unsigned char svB##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
500 vec_sel(vg2##i, vbB##i, mask##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
501 vec_st(svA##i, i * stride, src2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
502 vec_st(svB##i, i * stride + 16, src2) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
503 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
504 STORE(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
505 STORE(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
506 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
507 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
508 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
509 /* |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
510 this code makes no assumption on src or stride. |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
511 One could remove the recomputation of the perm |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
512 vector by assuming (stride % 16) == 0, unfortunately |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
513 this is not always true. Quite a lot of load/stores |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
514 can be removed by assuming proper alignment of |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
515 src & stride :-( |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
516 */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
517 uint8_t *srcCopy = src; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
518 uint8_t __attribute__((aligned(16))) dt[16]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
519 const vector unsigned char vuint8_1 = vec_splat_u8(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
520 const vector signed int zero = vec_splat_s32(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
521 vector unsigned char v_dt; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
522 dt[0] = deringThreshold; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
523 v_dt = vec_splat(vec_ld(0, dt), 0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
524 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
525 #define LOAD_LINE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
526 const vector unsigned char perm##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
527 vec_lvsl(i * stride, srcCopy); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
528 vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
529 vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
530 vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
531 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
532 LOAD_LINE(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
533 LOAD_LINE(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
534 LOAD_LINE(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
535 LOAD_LINE(3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
536 LOAD_LINE(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
537 LOAD_LINE(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
538 LOAD_LINE(6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
539 LOAD_LINE(7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
540 LOAD_LINE(8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
541 LOAD_LINE(9); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
542 #undef LOAD_LINE |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
543 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
544 vector unsigned char v_avg; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
545 { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
546 const vector unsigned char trunc_perm = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
547 AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
548 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
549 const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
550 const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
551 const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
552 const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
553 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
554 #define EXTRACT(op) do { \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
555 const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
556 const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
557 const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
558 const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
559 const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
560 const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
561 const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
562 const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
563 const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
564 const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
565 const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
566 const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
567 const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
568 const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
569 v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
570 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
571 vector unsigned char v_min; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
572 vector unsigned char v_max; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
573 EXTRACT(min); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
574 EXTRACT(max); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
575 #undef EXTRACT |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
576 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
577 if (vec_all_lt(vec_sub(v_max, v_min), v_dt)) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
578 return; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
579 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
580 v_avg = vec_avg(v_min, v_max); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
581 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
582 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
583 signed int __attribute__((aligned(16))) S[8]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
584 { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
585 const vector unsigned short mask1 = (vector unsigned short) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
586 AVV(0x0001, 0x0002, 0x0004, 0x0008, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
587 0x0010, 0x0020, 0x0040, 0x0080); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
588 const vector unsigned short mask2 = (vector unsigned short) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
589 AVV(0x0100, 0x0200, 0x0000, 0x0000, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
590 0x0000, 0x0000, 0x0000, 0x0000); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
591 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
592 const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
593 const vector unsigned int vuint32_1 = vec_splat_u32(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
594 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
595 #define COMPARE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
596 vector signed int sum##i; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
597 do { \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
598 const vector unsigned char cmp##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
599 (vector unsigned char)vec_cmpgt(src##i, v_avg); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
600 const vector unsigned short cmpHi##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
601 (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
602 const vector unsigned short cmpLi##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
603 (vector unsigned short)vec_mergel(cmp##i, cmp##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
604 const vector signed short cmpHf##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
605 (vector signed short)vec_and(cmpHi##i, mask1); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
606 const vector signed short cmpLf##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
607 (vector signed short)vec_and(cmpLi##i, mask2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
608 const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
609 const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
610 sum##i = vec_sums(sumq##i, zero); } while (0) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
611 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
612 COMPARE(0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
613 COMPARE(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
614 COMPARE(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
615 COMPARE(3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
616 COMPARE(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
617 COMPARE(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
618 COMPARE(6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
619 COMPARE(7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
620 COMPARE(8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
621 COMPARE(9); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
622 #undef COMPARE |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
623 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
624 vector signed int sumA2; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
625 vector signed int sumB2; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
626 { |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
627 const vector signed int sump02 = vec_mergel(sum0, sum2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
628 const vector signed int sump13 = vec_mergel(sum1, sum3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
629 const vector signed int sumA = vec_mergel(sump02, sump13); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
630 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
631 const vector signed int sump46 = vec_mergel(sum4, sum6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
632 const vector signed int sump57 = vec_mergel(sum5, sum7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
633 const vector signed int sumB = vec_mergel(sump46, sump57); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
634 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
635 const vector signed int sump8A = vec_mergel(sum8, zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
636 const vector signed int sump9B = vec_mergel(sum9, zero); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
637 const vector signed int sumC = vec_mergel(sump8A, sump9B); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
638 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
639 const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
640 const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
641 const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
642 const vector signed int t2A = vec_or(sumA, tA); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
643 const vector signed int t2B = vec_or(sumB, tB); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
644 const vector signed int t2C = vec_or(sumC, tC); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
645 const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1), |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
646 vec_sl(t2A, vuint32_1)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
647 const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1), |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
648 vec_sl(t2B, vuint32_1)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
649 const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1), |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
650 vec_sl(t2C, vuint32_1)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
651 const vector signed int yA = vec_and(t2A, t3A); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
652 const vector signed int yB = vec_and(t2B, t3B); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
653 const vector signed int yC = vec_and(t2C, t3C); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
654 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
655 const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
656 const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
657 const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
658 const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
659 const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
660 const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
661 const vector signed int sumAp = vec_and(yA, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
662 vec_and(sumAd4,sumAd8)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
663 const vector signed int sumBp = vec_and(yB, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
664 vec_and(sumBd4,sumBd8)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
665 sumA2 = vec_or(sumAp, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
666 vec_sra(sumAp, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
667 vuint32_16)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
668 sumB2 = vec_or(sumBp, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
669 vec_sra(sumBp, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
670 vuint32_16)); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
671 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
672 vec_st(sumA2, 0, S); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
673 vec_st(sumB2, 16, S); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
674 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
675 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
676 /* I'm not sure the following is actually faster |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
677 than straight, unvectorized C code :-( */ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
678 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
679 int __attribute__((aligned(16))) tQP2[4]; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
680 tQP2[0]= c->QP/2 + 1; |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
681 vector signed int vQP2 = vec_ld(0, tQP2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
682 vQP2 = vec_splat(vQP2, 0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
683 const vector unsigned char vuint8_2 = vec_splat_u8(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
684 const vector signed int vsint32_8 = vec_splat_s32(8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
685 const vector unsigned int vuint32_4 = vec_splat_u32(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
686 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
687 const vector unsigned char permA1 = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
688 AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
689 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
690 const vector unsigned char permA2 = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
691 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
692 0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
693 const vector unsigned char permA1inc = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
694 AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
695 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
696 const vector unsigned char permA2inc = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
697 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
698 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
699 const vector unsigned char magic = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
700 AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
701 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
702 const vector unsigned char extractPerm = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
703 AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
704 0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
705 const vector unsigned char extractPermInc = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
706 AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
707 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
708 const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
709 const vector unsigned char tenRight = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
710 AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
711 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
712 const vector unsigned char eightLeft = (vector unsigned char) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
713 AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
714 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
715 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
716 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
717 #define F_INIT(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
718 vector unsigned char tenRightM##i = tenRight; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
719 vector unsigned char permA1M##i = permA1; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
720 vector unsigned char permA2M##i = permA2; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
721 vector unsigned char extractPermM##i = extractPerm |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
722 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
723 #define F2(i, j, k, l) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
724 if (S[i] & (1 << (l+1))) { \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
725 const vector unsigned char a_##j##_A##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
726 vec_perm(src##i, src##j, permA1M##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
727 const vector unsigned char a_##j##_B##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
728 vec_perm(a_##j##_A##l, src##k, permA2M##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
729 const vector signed int a_##j##_sump##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
730 (vector signed int)vec_msum(a_##j##_B##l, magic, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
731 (vector unsigned int)zero); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
732 vector signed int F_##j##_##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
733 vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
734 F_##j##_##l = vec_splat(F_##j##_##l, 3); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
735 const vector signed int p_##j##_##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
736 (vector signed int)vec_perm(src##j, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
737 (vector unsigned char)zero, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
738 extractPermM##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
739 const vector signed int sum_##j##_##l = vec_add( p_##j##_##l, vQP2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
740 const vector signed int diff_##j##_##l = vec_sub( p_##j##_##l, vQP2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
741 vector signed int newpm_##j##_##l; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
742 if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
743 newpm_##j##_##l = sum_##j##_##l; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
744 else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
745 newpm_##j##_##l = diff_##j##_##l; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
746 else newpm_##j##_##l = F_##j##_##l; \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
747 const vector unsigned char newpm2_##j##_##l = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
748 vec_splat((vector unsigned char)newpm_##j##_##l, 15); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
749 const vector unsigned char mask##j##l = vec_add(identity, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
750 tenRightM##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
751 src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
752 } \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
753 permA1M##i = vec_add(permA1M##i, permA1inc); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
754 permA2M##i = vec_add(permA2M##i, permA2inc); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
755 tenRightM##i = vec_sro(tenRightM##i, eightLeft); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
756 extractPermM##i = vec_add(extractPermM##i, extractPermInc) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
757 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
758 #define ITER(i, j, k) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
759 F_INIT(i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
760 F2(i, j, k, 0); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
761 F2(i, j, k, 1); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
762 F2(i, j, k, 2); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
763 F2(i, j, k, 3); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
764 F2(i, j, k, 4); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
765 F2(i, j, k, 5); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
766 F2(i, j, k, 6); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
767 F2(i, j, k, 7) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
768 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
769 ITER(0, 1, 2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
770 ITER(1, 2, 3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
771 ITER(2, 3, 4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
772 ITER(3, 4, 5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
773 ITER(4, 5, 6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
774 ITER(5, 6, 7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
775 ITER(6, 7, 8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
776 ITER(7, 8, 9); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
777 |
| 2041 | 778 const vector signed char neg1 = vec_splat_s8(-1); |
|
2036
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
779 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
780 #define STORE_LINE(i) \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
781 const vector unsigned char permST##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
782 vec_lvsr(i * stride, srcCopy); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
783 const vector unsigned char maskST##i = \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
784 vec_perm((vector unsigned char)zero, \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
785 (vector unsigned char)neg1, permST##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
786 src##i = vec_perm(src##i ,src##i, permST##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
787 sA##i= vec_sel(sA##i, src##i, maskST##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
788 sB##i= vec_sel(src##i, sB##i, maskST##i); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
789 vec_st(sA##i, i * stride, srcCopy); \ |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
790 vec_st(sB##i, i * stride + 16, srcCopy) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
791 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
792 STORE_LINE(1); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
793 STORE_LINE(2); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
794 STORE_LINE(3); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
795 STORE_LINE(4); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
796 STORE_LINE(5); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
797 STORE_LINE(6); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
798 STORE_LINE(7); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
799 STORE_LINE(8); |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
800 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
801 #undef STORE_LINE |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
802 #undef ITER |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
803 #undef F2 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
804 } |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
805 |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
/* The _altivec name for the horizontal low-pass filter simply forwards all of
   its arguments (named variadic parameter "a...") to doHorizLowPass_C. */
806 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a) |
|
6a6c678517b3
altivec optimizations and horizontal filter fix by (Romain Dolbeau <dolbeau at irisa dot fr>)
michael
parents:
diff
changeset
|
/* These two _altivec names likewise forward all of their arguments to the
   corresponding _C functions (doHorizDefFilter_C and do_a_deblock_C). */
807 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a) |
| 2041 | 808 #define do_a_deblock_altivec(a...) do_a_deblock_C(a) |
| 809 | |
/*
 * AltiVec temporal noise reducer for one block.
 *
 * Works on 8 rows (row i lives at byte offset i * stride) and on the first
 * 8 bytes of each row of both src and tempBlured:
 *   1. load the rows and widen the first 8 bytes of each to 16-bit lanes,
 *   2. sum the squared differences (tempBlured - src) over the block into d,
 *   3. combine d with four neighbouring entries of tempBluredPast and save the
 *      raw sum into *tempBluredPast,
 *   4. compare the combined d against maxNoise[0..2] to pick one of four
 *      blends of tempBlured and src,
 *   5. write the blended 8 bytes per row back into BOTH src and tempBlured.
 *
 * src            - current picture block, read and overwritten
 * stride         - byte distance between consecutive rows (used for both
 *                  src and tempBlured)
 * tempBlured     - blurred reference block, read and overwritten
 * tempBluredPast - per-block history; entries at -256, -1, 0, +1, +256 and
 *                  127..129 relative to this pointer are accessed
 * maxNoise       - three thresholds, maxNoise[0], [1] and [2]
 */
| 810 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, | |
| 811 uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise) | |
| 812 { | |
| 813 const vector signed int zero = vec_splat_s32(0); | |
| 814 const vector signed short vsint16_1 = vec_splat_s16(1); | |
| 815 vector signed int v_dp = zero; | |
| 816 vector signed int v_sysdp = zero; | |
| 817 int d, sysd, i; | |
| 818 | |
/* The three thresholds are copied into tempBluredPast[127..129].
   NOTE(review): nothing in this function reads them back from there; the
   comparisons below use maxNoise[] directly - confirm whether other code
   depends on this write. */
| 819 tempBluredPast[127]= maxNoise[0]; | |
| 820 tempBluredPast[128]= maxNoise[1]; | |
| 821 tempBluredPast[129]= maxNoise[2]; | |
| 822 | |
/* LOAD_LINE(buf, i): unaligned load of the 16 bytes at buf + i * stride
   (two aligned vec_ld combined by vec_perm with the vec_lvsl permute), then
   vec_mergeh with a zero vector interleaves zero bytes with the first 8
   loaded bytes, giving eight 16-bit lanes in v_<buf>Ass<i>.
   The two aligned halves (v_<buf>A1<i>, v_<buf>A2<i>) and the realigned row
   (v_<buf>A<i>) are kept because PACK_AND_STORE reuses them.
   Note the macro's first parameter is itself named "src"; it is expanded once
   with src and once with tempBlured. */
| 823 #define LOAD_LINE(src, i) \ | |
| 824 register int j##src##i = i * stride; \ | |
| 825 vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \ | |
| 826 const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \ | |
| 827 const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \ | |
| 828 const vector unsigned char v_##src##A##i = \ | |
| 829 vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \ | |
| 830 vector signed short v_##src##Ass##i = \ | |
| 831 (vector signed short)vec_mergeh((vector signed char)zero, \ | |
| 832 (vector signed char)v_##src##A##i) | |
| 833 | |
| 834 LOAD_LINE(src, 0); | |
| 835 LOAD_LINE(src, 1); | |
| 836 LOAD_LINE(src, 2); | |
| 837 LOAD_LINE(src, 3); | |
| 838 LOAD_LINE(src, 4); | |
| 839 LOAD_LINE(src, 5); | |
| 840 LOAD_LINE(src, 6); | |
| 841 LOAD_LINE(src, 7); | |
| 842 | |
| 843 LOAD_LINE(tempBlured, 0); | |
| 844 LOAD_LINE(tempBlured, 1); | |
| 845 LOAD_LINE(tempBlured, 2); | |
| 846 LOAD_LINE(tempBlured, 3); | |
| 847 LOAD_LINE(tempBlured, 4); | |
| 848 LOAD_LINE(tempBlured, 5); | |
| 849 LOAD_LINE(tempBlured, 6); | |
| 850 LOAD_LINE(tempBlured, 7); | |
| 851 #undef LOAD_LINE | |
| 852 | |
/* ACCUMULATE_DIFFS(i): v_d<i> = tempBlured row - src row (16-bit lanes).
   v_dp accumulates v_d * v_d (sum of squared differences) and v_sysdp
   accumulates v_d * 1 (plain signed sum), both through vec_msums. */
| 853 #define ACCUMULATE_DIFFS(i) \ | |
| 854 vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \ | |
| 855 v_srcAss##i); \ | |
| 856 v_dp = vec_msums(v_d##i, v_d##i, v_dp); \ | |
| 857 v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp) | |
| 858 | |
| 859 ACCUMULATE_DIFFS(0); | |
| 860 ACCUMULATE_DIFFS(1); | |
| 861 ACCUMULATE_DIFFS(2); | |
| 862 ACCUMULATE_DIFFS(3); | |
| 863 ACCUMULATE_DIFFS(4); | |
| 864 ACCUMULATE_DIFFS(5); | |
| 865 ACCUMULATE_DIFFS(6); | |
| 866 ACCUMULATE_DIFFS(7); | |
| 867 #undef ACCUMULATE_DIFFS | |
| 868 | |
/* Reduce each accumulator across its four 32-bit lanes (vec_sums leaves the
   total in element 3), then splat that element to all lanes - presumably so
   that vec_ste stores the total whichever element the scalar's address
   selects. */
| 869 v_dp = vec_sums(v_dp, zero); | |
| 870 v_sysdp = vec_sums(v_sysdp, zero); | |
| 871 | |
| 872 v_dp = vec_splat(v_dp, 3); | |
| 873 v_sysdp = vec_splat(v_sysdp, 3); | |
| 874 | |
/* NOTE(review): sysd is stored here but is not read anywhere else in this
   function, so the v_sysdp computation currently has no effect on the
   result. */
| 875 vec_ste(v_dp, 0, &d); | |
| 876 vec_ste(v_sysdp, 0, &sysd); | |
| 877 | |
/* i keeps the raw sum of squared differences. d becomes a rounded weighted
   average: 4 * d plus the tempBluredPast entries at offsets -256, -1, +1 and
   +256, plus 4, shifted right by 3. The raw sum is then saved into
   *tempBluredPast. */
| 878 i = d; | |
| 879 d = (4*d | |
| 880 +(*(tempBluredPast-256)) | |
| 881 +(*(tempBluredPast-1))+ (*(tempBluredPast+1)) | |
| 882 +(*(tempBluredPast+256)) | |
| 883 +4)>>3; | |
| 884 | |
| 885 *tempBluredPast=i; | |
| 886 | |
/* Pick the blend written into v_tempBluredAss<i> from the combined d:
     d >  maxNoise[1] and d <  maxNoise[2] : vec_avg(tempBlured, src)
     d >  maxNoise[1] and d >= maxNoise[2] : src unchanged
     d <= maxNoise[1] and d <  maxNoise[0] : (7 * tempBlured + src + 4) >> 3
     d <= maxNoise[1] and d >= maxNoise[0] : (3 * tempBlured + src + 2) >> 2 */
| 887 if (d > maxNoise[1]) { | |
| 888 if (d < maxNoise[2]) { | |
| 889 #define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i); | |
| 890 | |
| 891 OP(0); | |
| 892 OP(1); | |
| 893 OP(2); | |
| 894 OP(3); | |
| 895 OP(4); | |
| 896 OP(5); | |
| 897 OP(6); | |
| 898 OP(7); | |
| 899 #undef OP | |
| 900 } else { | |
| 901 #define OP(i) v_tempBluredAss##i = v_srcAss##i; | |
| 902 | |
| 903 OP(0); | |
| 904 OP(1); | |
| 905 OP(2); | |
| 906 OP(3); | |
| 907 OP(4); | |
| 908 OP(5); | |
| 909 OP(6); | |
| 910 OP(7); | |
| 911 #undef OP | |
| 912 } | |
| 913 } else { | |
| 914 if (d < maxNoise[0]) { | |
| 915 const vector signed short vsint16_7 = vec_splat_s16(7); | |
| 916 const vector signed short vsint16_4 = vec_splat_s16(4); | |
| 917 const vector unsigned short vuint16_3 = vec_splat_u16(3); | |
| 918 | |
/* vec_mladd(a, b, c) computes a * b + c per 16-bit lane: here
   7 * tempBlured + src; then + 4 and a right shift by 3. */
| 919 #define OP(i) \ | |
| 920 const vector signed short v_temp##i = \ | |
| 921 vec_mladd(v_tempBluredAss##i, \ | |
| 922 vsint16_7, v_srcAss##i); \ | |
| 923 const vector signed short v_temp2##i = \ | |
| 924 vec_add(v_temp##i, vsint16_4); \ | |
| 925 v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3) | |
| 926 | |
| 927 OP(0); | |
| 928 OP(1); | |
| 929 OP(2); | |
| 930 OP(3); | |
| 931 OP(4); | |
| 932 OP(5); | |
| 933 OP(6); | |
| 934 OP(7); | |
| 935 #undef OP | |
| 936 } else { | |
| 937 const vector signed short vsint16_3 = vec_splat_s16(3); | |
| 938 const vector signed short vsint16_2 = vec_splat_s16(2); | |
| 939 | |
/* Same pattern with weights 3 and 1: 3 * tempBlured + src + 2, shifted right
   by 2. vsint16_2 serves both as the rounding constant and, cast to
   unsigned, as the shift count. */
| 940 #define OP(i) \ | |
| 941 const vector signed short v_temp##i = \ | |
| 942 vec_mladd(v_tempBluredAss##i, \ | |
| 943 vsint16_3, v_srcAss##i); \ | |
| 944 const vector signed short v_temp2##i = \ | |
| 945 vec_add(v_temp##i, vsint16_2); \ | |
| 946 v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2) | |
| 947 | |
| 948 OP(0); | |
| 949 OP(1); | |
| 950 OP(2); | |
| 951 OP(3); | |
| 952 OP(4); | |
| 953 OP(5); | |
| 954 OP(6); | |
| 955 OP(7); | |
| 956 #undef OP | |
| 957 } | |
| 958 } | |
| 959 | |
/* neg1 (all bits set) is used to build the store masks. permHH selects bytes
   0-7 of the first vec_perm operand and bytes 8-15 of the second. */
| 960 const vector signed char neg1 = vec_splat_s8(-1); | |
| 961 const vector unsigned char permHH = (const vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 962 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
| 963 | |
/* PACK_AND_STORE(buf, i): write row i of the result back to buf.
   - vec_packsu packs the 16-bit lanes of v_tempBluredAss<i> to unsigned bytes
     (the second operand is zero);
   - permHH keeps those 8 result bytes and takes bytes 8-15 from the row as it
     was originally loaded from buf;
   - the 16-byte row is rotated with the vec_lvsr permute and merged by
     vec_sel into the two aligned vectors loaded by LOAD_LINE, which are then
     stored back at i * stride and i * stride + 16.
   The packed data always comes from v_tempBluredAss<i>, so src and tempBlured
   receive the same 8 result bytes per row. */
| 964 #define PACK_AND_STORE(src, i) \ | |
| 965 const vector unsigned char perms##src##i = \ | |
| 966 vec_lvsr(i * stride, src); \ | |
| 967 const vector unsigned char vf##src##i = \ | |
| 968 vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \ | |
| 969 const vector unsigned char vg##src##i = \ | |
| 970 vec_perm(vf##src##i, v_##src##A##i, permHH); \ | |
| 971 const vector unsigned char mask##src##i = \ | |
| 972 vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \ | |
| 973 const vector unsigned char vg2##src##i = \ | |
| 974 vec_perm(vg##src##i, vg##src##i, perms##src##i); \ | |
| 975 const vector unsigned char svA##src##i = \ | |
| 976 vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \ | |
| 977 const vector unsigned char svB##src##i = \ | |
| 978 vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \ | |
| 979 vec_st(svA##src##i, i * stride, src); \ | |
| 980 vec_st(svB##src##i, i * stride + 16, src) | |
| 981 | |
| 982 PACK_AND_STORE(src, 0); | |
| 983 PACK_AND_STORE(src, 1); | |
| 984 PACK_AND_STORE(src, 2); | |
| 985 PACK_AND_STORE(src, 3); | |
| 986 PACK_AND_STORE(src, 4); | |
| 987 PACK_AND_STORE(src, 5); | |
| 988 PACK_AND_STORE(src, 6); | |
| 989 PACK_AND_STORE(src, 7); | |
| 990 PACK_AND_STORE(tempBlured, 0); | |
| 991 PACK_AND_STORE(tempBlured, 1); | |
| 992 PACK_AND_STORE(tempBlured, 2); | |
| 993 PACK_AND_STORE(tempBlured, 3); | |
| 994 PACK_AND_STORE(tempBlured, 4); | |
| 995 PACK_AND_STORE(tempBlured, 5); | |
| 996 PACK_AND_STORE(tempBlured, 6); | |
| 997 PACK_AND_STORE(tempBlured, 7); | |
| 998 #undef PACK_AND_STORE | |
| 999 } | |
| 2043 | 1000 |
| 1001 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { | |
| 1002 const vector unsigned char zero = vec_splat_u8(0); | |
| 1003 | |
| 1004 #define LOAD_DOUBLE_LINE(i, j) \ | |
| 1005 vector unsigned char perm1##i = vec_lvsl(i * stride, src); \ | |
| 1006 vector unsigned char perm2##i = vec_lvsl(j * stride, src); \ | |
| 1007 vector unsigned char srcA##i = vec_ld(i * stride, src); \ | |
| 1008 vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \ | |
| 1009 vector unsigned char srcC##i = vec_ld(j * stride, src); \ | |
| 1010 vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \ | |
| 1011 vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \ | |
| 1012 vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i) | |
| 1013 | |
| 1014 LOAD_DOUBLE_LINE(0, 1); | |
| 1015 LOAD_DOUBLE_LINE(2, 3); | |
| 1016 LOAD_DOUBLE_LINE(4, 5); | |
| 1017 LOAD_DOUBLE_LINE(6, 7); | |
| 1018 #undef LOAD_DOUBLE_LINE | |
| 1019 | |
| 1020 vector unsigned char tempA = vec_mergeh(src0, zero); | |
| 1021 vector unsigned char tempB = vec_mergel(src0, zero); | |
| 1022 vector unsigned char tempC = vec_mergeh(src1, zero); | |
| 1023 vector unsigned char tempD = vec_mergel(src1, zero); | |
| 1024 vector unsigned char tempE = vec_mergeh(src2, zero); | |
| 1025 vector unsigned char tempF = vec_mergel(src2, zero); | |
| 1026 vector unsigned char tempG = vec_mergeh(src3, zero); | |
| 1027 vector unsigned char tempH = vec_mergel(src3, zero); | |
| 1028 vector unsigned char tempI = vec_mergeh(src4, zero); | |
| 1029 vector unsigned char tempJ = vec_mergel(src4, zero); | |
| 1030 vector unsigned char tempK = vec_mergeh(src5, zero); | |
| 1031 vector unsigned char tempL = vec_mergel(src5, zero); | |
| 1032 vector unsigned char tempM = vec_mergeh(src6, zero); | |
| 1033 vector unsigned char tempN = vec_mergel(src6, zero); | |
| 1034 vector unsigned char tempO = vec_mergeh(src7, zero); | |
| 1035 vector unsigned char tempP = vec_mergel(src7, zero); | |
| 1036 | |
| 1037 vector unsigned char temp0 = vec_mergeh(tempA, tempI); | |
| 1038 vector unsigned char temp1 = vec_mergel(tempA, tempI); | |
| 1039 vector unsigned char temp2 = vec_mergeh(tempB, tempJ); | |
| 1040 vector unsigned char temp3 = vec_mergel(tempB, tempJ); | |
| 1041 vector unsigned char temp4 = vec_mergeh(tempC, tempK); | |
| 1042 vector unsigned char temp5 = vec_mergel(tempC, tempK); | |
| 1043 vector unsigned char temp6 = vec_mergeh(tempD, tempL); | |
| 1044 vector unsigned char temp7 = vec_mergel(tempD, tempL); | |
| 1045 vector unsigned char temp8 = vec_mergeh(tempE, tempM); | |
| 1046 vector unsigned char temp9 = vec_mergel(tempE, tempM); | |
| 1047 vector unsigned char temp10 = vec_mergeh(tempF, tempN); | |
| 1048 vector unsigned char temp11 = vec_mergel(tempF, tempN); | |
| 1049 vector unsigned char temp12 = vec_mergeh(tempG, tempO); | |
| 1050 vector unsigned char temp13 = vec_mergel(tempG, tempO); | |
| 1051 vector unsigned char temp14 = vec_mergeh(tempH, tempP); | |
| 1052 vector unsigned char temp15 = vec_mergel(tempH, tempP); | |
| 1053 | |
| 1054 tempA = vec_mergeh(temp0, temp8); | |
| 1055 tempB = vec_mergel(temp0, temp8); | |
| 1056 tempC = vec_mergeh(temp1, temp9); | |
| 1057 tempD = vec_mergel(temp1, temp9); | |
| 1058 tempE = vec_mergeh(temp2, temp10); | |
| 1059 tempF = vec_mergel(temp2, temp10); | |
| 1060 tempG = vec_mergeh(temp3, temp11); | |
| 1061 tempH = vec_mergel(temp3, temp11); | |
| 1062 tempI = vec_mergeh(temp4, temp12); | |
| 1063 tempJ = vec_mergel(temp4, temp12); | |
| 1064 tempK = vec_mergeh(temp5, temp13); | |
| 1065 tempL = vec_mergel(temp5, temp13); | |
| 1066 tempM = vec_mergeh(temp6, temp14); | |
| 1067 tempN = vec_mergel(temp6, temp14); | |
| 1068 tempO = vec_mergeh(temp7, temp15); | |
| 1069 tempP = vec_mergel(temp7, temp15); | |
| 1070 | |
| 1071 temp0 = vec_mergeh(tempA, tempI); | |
| 1072 temp1 = vec_mergel(tempA, tempI); | |
| 1073 temp2 = vec_mergeh(tempB, tempJ); | |
| 1074 temp3 = vec_mergel(tempB, tempJ); | |
| 1075 temp4 = vec_mergeh(tempC, tempK); | |
| 1076 temp5 = vec_mergel(tempC, tempK); | |
| 1077 temp6 = vec_mergeh(tempD, tempL); | |
| 1078 temp7 = vec_mergel(tempD, tempL); | |
| 1079 temp8 = vec_mergeh(tempE, tempM); | |
| 1080 temp9 = vec_mergel(tempE, tempM); | |
| 1081 temp10 = vec_mergeh(tempF, tempN); | |
| 1082 temp11 = vec_mergel(tempF, tempN); | |
| 1083 temp12 = vec_mergeh(tempG, tempO); | |
| 1084 temp13 = vec_mergel(tempG, tempO); | |
| 1085 temp14 = vec_mergeh(tempH, tempP); | |
| 1086 temp15 = vec_mergel(tempH, tempP); | |
| 1087 | |
| 1088 vec_st(temp0, 0, dst); | |
| 1089 vec_st(temp1, 16, dst); | |
| 1090 vec_st(temp2, 32, dst); | |
| 1091 vec_st(temp3, 48, dst); | |
| 1092 vec_st(temp4, 64, dst); | |
| 1093 vec_st(temp5, 80, dst); | |
| 1094 vec_st(temp6, 96, dst); | |
| 1095 vec_st(temp7, 112, dst); | |
| 1096 vec_st(temp8, 128, dst); | |
| 1097 vec_st(temp9, 144, dst); | |
| 1098 vec_st(temp10, 160, dst); | |
| 1099 vec_st(temp11, 176, dst); | |
| 1100 vec_st(temp12, 192, dst); | |
| 1101 vec_st(temp13, 208, dst); | |
| 1102 vec_st(temp14, 224, dst); | |
| 1103 vec_st(temp15, 240, dst); | |
| 1104 } | |
| 1105 | |
| 1106 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) { | |
| 1107 const vector unsigned char zero = vec_splat_u8(0); | |
| 1108 const vector unsigned char magic_perm = (const vector unsigned char) | |
| 1109 AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, | |
| 1110 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F); | |
| 1111 | |
| 1112 #define LOAD_DOUBLE_LINE(i, j) \ | |
| 1113 vector unsigned char src##i = vec_ld(i * 16, src); \ | |
| 1114 vector unsigned char src##j = vec_ld(j * 16, src) | |
| 1115 | |
| 1116 LOAD_DOUBLE_LINE(0, 1); | |
| 1117 LOAD_DOUBLE_LINE(2, 3); | |
| 1118 LOAD_DOUBLE_LINE(4, 5); | |
| 1119 LOAD_DOUBLE_LINE(6, 7); | |
| 1120 LOAD_DOUBLE_LINE(8, 9); | |
| 1121 LOAD_DOUBLE_LINE(10, 11); | |
| 1122 LOAD_DOUBLE_LINE(12, 13); | |
| 1123 LOAD_DOUBLE_LINE(14, 15); | |
| 1124 #undef LOAD_DOUBLE_LINE | |
| 1125 | |
| 1126 vector unsigned char tempA = vec_mergeh(src0, src8); | |
| 1127 vector unsigned char tempB; | |
| 1128 vector unsigned char tempC = vec_mergeh(src1, src9); | |
| 1129 vector unsigned char tempD; | |
| 1130 vector unsigned char tempE = vec_mergeh(src2, src10); | |
| 1131 vector unsigned char tempG = vec_mergeh(src3, src11); | |
| 1132 vector unsigned char tempI = vec_mergeh(src4, src12); | |
| 1133 vector unsigned char tempJ; | |
| 1134 vector unsigned char tempK = vec_mergeh(src5, src13); | |
| 1135 vector unsigned char tempL; | |
| 1136 vector unsigned char tempM = vec_mergeh(src6, src14); | |
| 1137 vector unsigned char tempO = vec_mergeh(src7, src15); | |
| 1138 | |
| 1139 vector unsigned char temp0 = vec_mergeh(tempA, tempI); | |
| 1140 vector unsigned char temp1 = vec_mergel(tempA, tempI); | |
| 1141 vector unsigned char temp2; | |
| 1142 vector unsigned char temp3; | |
| 1143 vector unsigned char temp4 = vec_mergeh(tempC, tempK); | |
| 1144 vector unsigned char temp5 = vec_mergel(tempC, tempK); | |
| 1145 vector unsigned char temp6; | |
| 1146 vector unsigned char temp7; | |
| 1147 vector unsigned char temp8 = vec_mergeh(tempE, tempM); | |
| 1148 vector unsigned char temp9 = vec_mergel(tempE, tempM); | |
| 1149 vector unsigned char temp12 = vec_mergeh(tempG, tempO); | |
| 1150 vector unsigned char temp13 = vec_mergel(tempG, tempO); | |
| 1151 | |
| 1152 tempA = vec_mergeh(temp0, temp8); | |
| 1153 tempB = vec_mergel(temp0, temp8); | |
| 1154 tempC = vec_mergeh(temp1, temp9); | |
| 1155 tempD = vec_mergel(temp1, temp9); | |
| 1156 tempI = vec_mergeh(temp4, temp12); | |
| 1157 tempJ = vec_mergel(temp4, temp12); | |
| 1158 tempK = vec_mergeh(temp5, temp13); | |
| 1159 tempL = vec_mergel(temp5, temp13); | |
| 1160 | |
| 1161 temp0 = vec_mergeh(tempA, tempI); | |
| 1162 temp1 = vec_mergel(tempA, tempI); | |
| 1163 temp2 = vec_mergeh(tempB, tempJ); | |
| 1164 temp3 = vec_mergel(tempB, tempJ); | |
| 1165 temp4 = vec_mergeh(tempC, tempK); | |
| 1166 temp5 = vec_mergel(tempC, tempK); | |
| 1167 temp6 = vec_mergeh(tempD, tempL); | |
| 1168 temp7 = vec_mergel(tempD, tempL); | |
| 1169 | |
| 1170 | |
| 1171 const vector signed char neg1 = vec_splat_s8(-1); | |
| 1172 #define STORE_DOUBLE_LINE(i, j) \ | |
| 1173 vector unsigned char dstA##i = vec_ld(i * stride, dst); \ | |
| 1174 vector unsigned char dstB##i = vec_ld(i * stride + 16, dst); \ | |
| 1175 vector unsigned char dstA##j = vec_ld(j * stride, dst); \ | |
| 1176 vector unsigned char dstB##j = vec_ld(j * stride+ 16, dst); \ | |
| 1177 vector unsigned char align##i = vec_lvsr(i * stride, dst); \ | |
| 1178 vector unsigned char align##j = vec_lvsr(j * stride, dst); \ | |
| 1179 vector unsigned char mask##i = vec_perm(zero, (vector unsigned char)neg1, align##i); \ | |
| 1180 vector unsigned char mask##j = vec_perm(zero, (vector unsigned char)neg1, align##j); \ | |
| 1181 vector unsigned char dstR##i = vec_perm(temp##i, temp##i, align##i); \ | |
| 1182 vector unsigned char dstR##j = vec_perm(temp##j, temp##j, align##j); \ | |
| 1183 vector unsigned char dstAF##i = vec_sel(dstA##i, dstR##i, mask##i); \ | |
| 1184 vector unsigned char dstBF##i = vec_sel(dstR##i, dstB##i, mask##i); \ | |
| 1185 vector unsigned char dstAF##j = vec_sel(dstA##j, dstR##j, mask##j); \ | |
| 1186 vector unsigned char dstBF##j = vec_sel(dstR##j, dstB##j, mask##j); \ | |
| 1187 vec_st(dstAF##i, i * stride, dst); \ | |
| 1188 vec_st(dstBF##i, i * stride + 16, dst); \ | |
| 1189 vec_st(dstAF##j, j * stride, dst); \ | |
| 1190 vec_st(dstBF##j, j * stride + 16, dst) | |
| 1191 | |
| 1192 STORE_DOUBLE_LINE(0,1); | |
| 1193 STORE_DOUBLE_LINE(2,3); | |
| 1194 STORE_DOUBLE_LINE(4,5); | |
| 1195 STORE_DOUBLE_LINE(6,7); | |
| 1196 } |
