Mercurial > libavcodec.hg
annotate alpha/dsputil_alpha.c @ 538:7d45f7c1937e libavcodec
10L
| author | nickols_k |
|---|---|
| date | Thu, 11 Jul 2002 15:54:10 +0000 |
| parents | 70113647b50d |
| children | 8cefba09f2e8 |
| rev | line source |
|---|---|
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
1 /* |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
2 * Alpha optimized DSP utils |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
3 * Copyright (c) 2002 Falk Hueffner <falk@debian.org> |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
4 * |
| 429 | 5 * This library is free software; you can redistribute it and/or |
| 6 * modify it under the terms of the GNU Lesser General Public | |
| 7 * License as published by the Free Software Foundation; either | |
| 8 * version 2 of the License, or (at your option) any later version. | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
9 * |
| 429 | 10 * This library is distributed in the hope that it will be useful, |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 429 | 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 * Lesser General Public License for more details. | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
14 * |
| 429 | 15 * You should have received a copy of the GNU Lesser General Public |
| 16 * License along with this library; if not, write to the Free Software | |
| 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
18 */ |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
19 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
20 #include "asm.h" |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
21 #include "../dsputil.h" |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
22 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
23 void simple_idct_axp(DCTELEM *block); |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
24 |
|
511
fa4425cf6b31
Assembly version of put_pixels. This is currently the function that
mellum
parents:
509
diff
changeset
|
25 void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels, |
|
fa4425cf6b31
Assembly version of put_pixels. This is currently the function that
mellum
parents:
509
diff
changeset
|
26 int line_size, int h); |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
27 void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
28 int line_size); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
29 void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels, |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
30 int line_size); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
31 |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
32 #if 0 |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
33 /* These functions were the base for the optimized assembler routines, |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
34 and remain here for documentation purposes. */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
35 static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
36 int line_size) |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
37 { |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
38 int i = 8; |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
39 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ |
|
505
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
40 |
|
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
41 ASM_ACCEPT_MVI; |
|
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
42 |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
43 do { |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
44 uint64_t shorts0, shorts1; |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
45 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
46 shorts0 = ldq(block); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
47 shorts0 = maxsw4(shorts0, 0); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
48 shorts0 = minsw4(shorts0, clampmask); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
49 stl(pkwb(shorts0), pixels); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
50 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
51 shorts1 = ldq(block + 4); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
52 shorts1 = maxsw4(shorts1, 0); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
53 shorts1 = minsw4(shorts1, clampmask); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
54 stl(pkwb(shorts1), pixels + 4); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
55 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
56 pixels += line_size; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
57 block += 8; |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
58 } while (--i); |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
59 } |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
60 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
61 void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels, |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
62 int line_size) |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
63 { |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
64 int h = 8; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
65 /* Keep this function a leaf function by generating the constants |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
66 manually (mainly for the hack value ;-). */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
67 uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
68 uint64_t signmask = zap(-1, 0x33); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
69 signmask ^= signmask >> 1; /* 0x8000800080008000 */ |
|
505
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
70 |
|
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
71 ASM_ACCEPT_MVI; |
|
7a976bf93394
Ugly hack to make the assembler accept MVI instructions.
mellum
parents:
429
diff
changeset
|
72 |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
73 do { |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
74 uint64_t shorts0, pix0, signs0; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
75 uint64_t shorts1, pix1, signs1; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
76 |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
77 shorts0 = ldq(block); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
78 shorts1 = ldq(block + 4); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
79 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
80 pix0 = unpkbw(ldl(pixels)); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
81 /* Signed subword add (MMX paddw). */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
82 signs0 = shorts0 & signmask; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
83 shorts0 &= ~signmask; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
84 shorts0 += pix0; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
85 shorts0 ^= signs0; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
86 /* Clamp. */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
87 shorts0 = maxsw4(shorts0, 0); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
88 shorts0 = minsw4(shorts0, clampmask); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
89 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
90 /* Next 4. */ |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
91 pix1 = unpkbw(ldl(pixels + 4)); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
92 signs1 = shorts1 & signmask; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
93 shorts1 &= ~signmask; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
94 shorts1 += pix1; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
95 shorts1 ^= signs1; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
96 shorts1 = maxsw4(shorts1, 0); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
97 shorts1 = minsw4(shorts1, clampmask); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
98 |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
99 stl(pkwb(shorts0), pixels); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
100 stl(pkwb(shorts1), pixels + 4); |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
101 |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
102 pixels += line_size; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
103 block += 8; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
104 } while (--h); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
105 } |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
106 #endif |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
107 |
| 518 | 108 static void clear_blocks_axp(DCTELEM *blocks) { |
| 109 uint64_t *p = (uint64_t *) blocks; | |
| 110 int n = sizeof(DCTELEM) * 6 * 64; | |
| 111 | |
| 112 do { | |
| 113 p[0] = 0; | |
| 114 p[1] = 0; | |
| 115 p[2] = 0; | |
| 116 p[3] = 0; | |
| 117 p[4] = 0; | |
| 118 p[5] = 0; | |
| 119 p[6] = 0; | |
| 120 p[7] = 0; | |
| 121 p += 8; | |
| 122 n -= 8 * 8; | |
| 123 } while (n); | |
| 124 } | |
| 125 | |
/*
 * Byte-wise average of a and b without the rounding term.
 *
 * Computed as (a & b) + ((a ^ b) >> 1) with the low bit of every byte of
 * (a ^ b) masked off first, so the shift cannot leak a bit into the
 * neighbouring byte.
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes of
 * a 64-bit word -- defined in asm.h, confirm there.
 */
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    uint64_t shared    = a & b;
    uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return shared + half_diff;
}
| 130 | |
/*
 * Byte-wise average of a and b, rounding variant.
 *
 * Computed as (a | b) - ((a ^ b) >> 1) with the low bit of every byte of
 * (a ^ b) masked off first, so the shift cannot leak a bit into the
 * neighbouring byte.
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes of
 * a 64-bit word -- defined in asm.h, confirm there.
 */
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    uint64_t either    = a | b;
    uint64_t half_diff = ((a ^ b) & BYTE_VEC(0xfe)) >> 1;

    return either - half_diff;
}
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
135 |
/*
 * Byte-wise average of four packed words, rounding variant (adds
 * BYTE_VEC(0x02) before the divide by four).
 *
 * Each input is split into the part above the low two bits of every byte
 * and the low two bits themselves; the two parts are summed and scaled
 * separately and then recombined.
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes of
 * a 64-bit word -- defined in asm.h, confirm there.
 */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t upper;     /* sum of the pre-divided high parts         */
    uint64_t lower;     /* sum of the low two bits plus the rounder  */

    upper  = (l1 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l2 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l3 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l4 & ~BYTE_VEC(0x03)) >> 2;

    lower  = l1 & BYTE_VEC(0x03);
    lower += l2 & BYTE_VEC(0x03);
    lower += l3 & BYTE_VEC(0x03);
    lower += l4 & BYTE_VEC(0x03);
    lower += BYTE_VEC(0x02);
    /* Divide by four; the mask drops bits shifted in from the next byte. */
    lower  = (lower >> 2) & BYTE_VEC(0x03);

    return upper + lower;
}
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
149 |
/*
 * Byte-wise average of four packed words, "no rounding" variant: identical
 * to avg4 except that BYTE_VEC(0x01) instead of BYTE_VEC(0x02) is added
 * before the divide by four.
 *
 * NOTE(review): assumes BYTE_VEC(x) replicates x into all eight bytes of
 * a 64-bit word -- defined in asm.h, confirm there.
 */
static inline uint64_t avg4_no_rnd(uint64_t l1, uint64_t l2,
                                   uint64_t l3, uint64_t l4)
{
    uint64_t upper;     /* sum of the pre-divided high parts      */
    uint64_t lower;     /* sum of the low two bits plus the bias  */

    upper  = (l1 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l2 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l3 & ~BYTE_VEC(0x03)) >> 2;
    upper += (l4 & ~BYTE_VEC(0x03)) >> 2;

    lower  = l1 & BYTE_VEC(0x03);
    lower += l2 & BYTE_VEC(0x03);
    lower += l3 & BYTE_VEC(0x03);
    lower += l4 & BYTE_VEC(0x03);
    lower += BYTE_VEC(0x01);
    /* Divide by four; the mask drops bits shifted in from the next byte. */
    lower  = (lower >> 2) & BYTE_VEC(0x03);

    return upper + lower;
}
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
164 |
/* Row loop without interpolation: LOAD one quadword from pixels and hand
   it to STORE together with block.  Uses pixels, block, line_size and h
   from the expanding scope and modifies pixels, block and h.  A row is
   processed before h is decremented and tested. */
#define OP(LOAD, STORE, INCR)                   \
    do {                                        \
        uint64_t row = LOAD(pixels);            \
                                                \
        STORE(row, block);                      \
        pixels += line_size;                    \
        block  += INCR;                         \
    } while (--h)
| 171 | |
/* Row loop averaging each row with a copy of itself shifted by one byte.
   The shifted copy is the loaded quadword moved down 8 bits with
   pixels[8] inserted as its top byte; AVG2 combines the two.  Uses
   pixels, block, line_size and h from the expanding scope and modifies
   pixels, block and h. */
#define OP_X2(LOAD, STORE, INCR)                                    \
    do {                                                            \
        uint64_t cur     = LOAD(pixels);                            \
        uint64_t shifted = cur >> 8                                 \
                         | ((uint64_t) pixels[8] << 56);            \
                                                                    \
        STORE(AVG2(cur, shifted), block);                           \
        pixels += line_size;                                        \
        block  += INCR;                                             \
    } while (--h)
| 182 | |
/* Row loop averaging each row with the row line_size bytes further on.
   The second row of one iteration is carried over as the first row of
   the next, so every row is loaded only once (h + 1 loads in total).
   Uses pixels, block, line_size and h from the expanding scope and
   modifies pixels, block and h. */
#define OP_Y2(LOAD, STORE, INCR)                        \
    do {                                                \
        uint64_t prev_row = LOAD(pixels);               \
                                                        \
        do {                                            \
            uint64_t this_row;                          \
                                                        \
            pixels  += line_size;                       \
            this_row = LOAD(pixels);                    \
            STORE(AVG2(prev_row, this_row), block);     \
            block   += INCR;                            \
            prev_row = this_row;                        \
        } while (--h);                                  \
    } while (0)
| 196 | |
/* Row loop combining four inputs with AVG4: a row, the same row shifted
   by one byte (pixels[8] inserted as the top byte), and the same pair
   taken from the row line_size bytes further on.  The second pair of one
   iteration is carried over as the first pair of the next.  Uses pixels,
   block, line_size and h from the expanding scope and modifies pixels,
   block and h. */
#define OP_XY2(LOAD, STORE, INCR)                                       \
    do {                                                                \
        uint64_t top     = LOAD(pixels);                                \
        uint64_t top_sh  = top >> 8 | ((uint64_t) pixels[8] << 56);     \
                                                                        \
        do {                                                            \
            uint64_t bot, bot_sh;                                       \
                                                                        \
            pixels += line_size;                                        \
            bot     = LOAD(pixels);                                     \
            bot_sh  = bot >> 8 | ((uint64_t) pixels[8] << 56);          \
                                                                        \
            STORE(AVG4(top, top_sh, bot, bot_sh), block);               \
                                                                        \
            block  += INCR;                                             \
            top     = bot;                                              \
            top_sh  = bot_sh;                                           \
        } while (--h);                                                  \
    } while (0)
| 216 | |
/* Define the static function OPNAME_pixelsSUFF_axp(block, pixels,
   line_size, h) whose body is the OPKIND row loop.  The loop is expanded
   twice: with ldq when the incoming pixels pointer has its low three
   address bits clear, and with uldq otherwise.  The test is made once,
   on entry, not per row. */
#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)                   \
static void OPNAME ## _pixels ## SUFF ## _axp(BTYPE *block,                 \
                                              const uint8_t *pixels,        \
                                              int line_size, int h)         \
{                                                                           \
    if (((size_t) pixels & 0x7) == 0) {                                     \
        OPKIND(ldq, STORE, INCR);                                           \
    } else {                                                                \
        OPKIND(uldq, STORE, INCR);                                          \
    }                                                                       \
}
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
228 |
/* Instantiate the four variants of one pixel operation:
   OPNAME_pixels_axp, OPNAME_pixels_x2_axp, OPNAME_pixels_y2_axp and
   OPNAME_pixels_xy2_axp.  AVG2, AVG4 and the macro passed as STORE are
   looked up when PIXOP is expanded, so they must be defined first. */
#define PIXOP(BTYPE, OPNAME, STORE, INCR)               \
    MAKE_OP(BTYPE, OPNAME,     , OP,     STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _x2,  OP_X2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _y2,  OP_Y2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);
| 234 | |
| 235 /* Rounding primitives. */ | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
236 #define AVG2 avg2 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
237 #define AVG4 avg4 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
238 #define STORE(l, b) stq(l, b) |
| 513 | 239 PIXOP(uint8_t, put, STORE, line_size); |
| 240 | |
| 241 #undef STORE | |
| 242 #define STORE(l, b) stq(AVG2(l, ldq(b)), b); | |
| 243 PIXOP(uint8_t, avg, STORE, line_size); | |
| 244 | |
| 245 /* Not rounding primitives. */ | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
246 #undef AVG2 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
247 #undef AVG4 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
248 #undef STORE |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
249 #define AVG2 avg2_no_rnd |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
250 #define AVG4 avg4_no_rnd |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
251 #define STORE(l, b) stq(l, b) |
| 513 | 252 PIXOP(uint8_t, put_no_rnd, STORE, line_size); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
253 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
254 #undef STORE |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
255 #define STORE(l, b) stq(AVG2(l, ldq(b)), b); |
| 513 | 256 PIXOP(uint8_t, avg_no_rnd, STORE, line_size); |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
257 |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
258 void dsputil_init_alpha(void) |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
259 { |
|
511
fa4425cf6b31
Assembly version of put_pixels. This is currently the function that
mellum
parents:
509
diff
changeset
|
260 put_pixels_tab[0] = put_pixels_axp_asm; |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
261 put_pixels_tab[1] = put_pixels_x2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
262 put_pixels_tab[2] = put_pixels_y2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
263 put_pixels_tab[3] = put_pixels_xy2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
264 |
|
511
fa4425cf6b31
Assembly version of put_pixels. This is currently the function that
mellum
parents:
509
diff
changeset
|
265 put_no_rnd_pixels_tab[0] = put_pixels_axp_asm; |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
266 put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
267 put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
268 put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp; |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
269 |
| 513 | 270 avg_pixels_tab[0] = avg_pixels_axp; |
| 271 avg_pixels_tab[1] = avg_pixels_x2_axp; | |
| 272 avg_pixels_tab[2] = avg_pixels_y2_axp; | |
| 273 avg_pixels_tab[3] = avg_pixels_xy2_axp; | |
| 274 | |
| 275 avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp; | |
| 276 avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp; | |
| 277 avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp; | |
| 278 avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp; | |
| 279 | |
| 518 | 280 clear_blocks = clear_blocks_axp; |
| 281 | |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
282 /* amask clears all bits that correspond to present features. */ |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
283 if (amask(AMASK_MVI) == 0) { |
|
509
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
284 put_pixels_clamped = put_pixels_clamped_mvi_asm; |
|
cab79946302f
Implement put_pixels_clamped and add_pixels_clamped in Assembler. This
mellum
parents:
505
diff
changeset
|
285 add_pixels_clamped = add_pixels_clamped_mvi_asm; |
|
214
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
286 } |
|
73df666cacc7
Alpha optimizations by Falk Hueffner <falk.hueffner@student.uni-tuebingen.de>
nickols_k
parents:
diff
changeset
|
287 } |
