Mercurial > libavcodec.hg
annotate x86/vp8dsp.asm @ 12530:63edd10ad4bc libavcodec tip
Try to fix crashes introduced by r25218
r25218 made assumptions about the existence of past reference frames that
weren't necessarily true.
| author | darkshikari |
|---|---|
| date | Tue, 28 Sep 2010 09:06:22 +0000 |
| parents | 2982071047a2 |
| children |
| rev | line source |
|---|---|
| 11975 | 1 ;****************************************************************************** |
| 2 ;* VP8 MMXEXT optimizations | |
| 3 ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> | |
| 4 ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> | |
| 5 ;* | |
| 6 ;* This file is part of FFmpeg. | |
| 7 ;* | |
| 8 ;* FFmpeg is free software; you can redistribute it and/or | |
| 9 ;* modify it under the terms of the GNU Lesser General Public | |
| 10 ;* License as published by the Free Software Foundation; either | |
| 11 ;* version 2.1 of the License, or (at your option) any later version. | |
| 12 ;* | |
| 13 ;* FFmpeg is distributed in the hope that it will be useful, | |
| 14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 16 ;* Lesser General Public License for more details. | |
| 17 ;* | |
| 18 ;* You should have received a copy of the GNU Lesser General Public | |
| 19 ;* License along with FFmpeg; if not, write to the Free Software | |
| 20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |
| 21 ;****************************************************************************** | |
| 22 | |
| 23 %include "x86inc.asm" | |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
24 %include "x86util.asm" |
| 11975 | 25 |
| 26 SECTION_RODATA | |
| 27 | |
| 28 fourtap_filter_hw_m: times 4 dw -6, 123 | |
| 29 times 4 dw 12, -1 | |
| 30 times 4 dw -9, 93 | |
| 31 times 4 dw 50, -6 | |
| 32 times 4 dw -6, 50 | |
| 33 times 4 dw 93, -9 | |
| 34 times 4 dw -1, 12 | |
| 35 times 4 dw 123, -6 | |
| 36 | |
| 37 sixtap_filter_hw_m: times 4 dw 2, -11 | |
| 38 times 4 dw 108, 36 | |
| 39 times 4 dw -8, 1 | |
| 40 times 4 dw 3, -16 | |
| 41 times 4 dw 77, 77 | |
| 42 times 4 dw -16, 3 | |
| 43 times 4 dw 1, -8 | |
| 44 times 4 dw 36, 108 | |
| 45 times 4 dw -11, 2 | |
| 46 | |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
47 fourtap_filter_hb_m: times 8 db -6, 123 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
48 times 8 db 12, -1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
49 times 8 db -9, 93 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
50 times 8 db 50, -6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
51 times 8 db -6, 50 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
52 times 8 db 93, -9 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
53 times 8 db -1, 12 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
54 times 8 db 123, -6 |
| 11975 | 55 |
| 56 sixtap_filter_hb_m: times 8 db 2, 1 | |
| 57 times 8 db -11, 108 | |
| 58 times 8 db 36, -8 | |
| 59 times 8 db 3, 3 | |
| 60 times 8 db -16, 77 | |
| 61 times 8 db 77, -16 | |
| 62 times 8 db 1, 2 | |
| 63 times 8 db -8, 36 | |
| 64 times 8 db 108, -11 | |
| 65 | |
| 66 fourtap_filter_v_m: times 8 dw -6 | |
| 67 times 8 dw 123 | |
| 68 times 8 dw 12 | |
| 69 times 8 dw -1 | |
| 70 times 8 dw -9 | |
| 71 times 8 dw 93 | |
| 72 times 8 dw 50 | |
| 73 times 8 dw -6 | |
| 74 times 8 dw -6 | |
| 75 times 8 dw 50 | |
| 76 times 8 dw 93 | |
| 77 times 8 dw -9 | |
| 78 times 8 dw -1 | |
| 79 times 8 dw 12 | |
| 80 times 8 dw 123 | |
| 81 times 8 dw -6 | |
| 82 | |
| 83 sixtap_filter_v_m: times 8 dw 2 | |
| 84 times 8 dw -11 | |
| 85 times 8 dw 108 | |
| 86 times 8 dw 36 | |
| 87 times 8 dw -8 | |
| 88 times 8 dw 1 | |
| 89 times 8 dw 3 | |
| 90 times 8 dw -16 | |
| 91 times 8 dw 77 | |
| 92 times 8 dw 77 | |
| 93 times 8 dw -16 | |
| 94 times 8 dw 3 | |
| 95 times 8 dw 1 | |
| 96 times 8 dw -8 | |
| 97 times 8 dw 36 | |
| 98 times 8 dw 108 | |
| 99 times 8 dw -11 | |
| 100 times 8 dw 2 | |
| 101 | |
| 11991 | 102 bilinear_filter_vw_m: times 8 dw 1 |
| 103 times 8 dw 2 | |
| 104 times 8 dw 3 | |
| 105 times 8 dw 4 | |
| 106 times 8 dw 5 | |
| 107 times 8 dw 6 | |
| 108 times 8 dw 7 | |
| 109 | |
| 110 bilinear_filter_vb_m: times 8 db 7, 1 | |
| 111 times 8 db 6, 2 | |
| 112 times 8 db 5, 3 | |
| 113 times 8 db 4, 4 | |
| 114 times 8 db 3, 5 | |
| 115 times 8 db 2, 6 | |
| 116 times 8 db 1, 7 | |
| 117 | |
| 11975 | 118 %ifdef PIC |
| 11991 | 119 %define fourtap_filter_hw r11 |
| 120 %define sixtap_filter_hw r11 | |
| 121 %define fourtap_filter_hb r11 | |
| 122 %define sixtap_filter_hb r11 | |
| 123 %define fourtap_filter_v r11 | |
| 124 %define sixtap_filter_v r11 | |
| 125 %define bilinear_filter_vw r11 | |
| 126 %define bilinear_filter_vb r11 | |
| 11975 | 127 %else |
| 128 %define fourtap_filter_hw fourtap_filter_hw_m | |
| 129 %define sixtap_filter_hw sixtap_filter_hw_m | |
| 130 %define fourtap_filter_hb fourtap_filter_hb_m | |
| 131 %define sixtap_filter_hb sixtap_filter_hb_m | |
| 132 %define fourtap_filter_v fourtap_filter_v_m | |
| 133 %define sixtap_filter_v sixtap_filter_v_m | |
| 11991 | 134 %define bilinear_filter_vw bilinear_filter_vw_m |
| 135 %define bilinear_filter_vb bilinear_filter_vb_m | |
| 11975 | 136 %endif |
| 137 | |
| 11991 | 138 filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
139 filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
| 11975 | 140 |
| 11991 | 141 filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
| 142 filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 | |
| 143 filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 | |
| 11975 | 144 |
| 12013 | 145 pw_20091: times 4 dw 20091 |
| 146 pw_17734: times 4 dw 17734 | |
| 147 | |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
148 pb_27_63: times 8 db 27, 63 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
149 pb_18_63: times 8 db 18, 63 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
150 pb_9_63: times 8 db 9, 63 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
151 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
152 cextern pb_1 |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
153 cextern pw_3 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
154 cextern pb_3 |
| 11975 | 155 cextern pw_4 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
156 cextern pb_4 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
157 cextern pw_9 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
158 cextern pw_18 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
159 cextern pw_27 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
160 cextern pw_63 |
| 11975 | 161 cextern pw_64 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
162 cextern pb_80 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
163 cextern pb_F8 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
164 cextern pb_FE |
| 11975 | 165 |
| 166 SECTION .text | |
| 167 | |
| 168 ;----------------------------------------------------------------------------- | |
| 169 ; subpel MC functions: | |
| 170 ; | |
| 171 ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, | |
| 172 ; uint8_t *src, int srcstride, | |
| 173 ; int height, int mx, int my); | |
| 174 ;----------------------------------------------------------------------------- | |
| 175 | |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
176 %macro FILTER_SSSE3 3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
177 cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
178 lea r5d, [r5*3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
179 mova m3, [filter_h6_shuf2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
180 mova m4, [filter_h6_shuf3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
181 %ifdef PIC |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
182 lea r11, [sixtap_filter_hb_m] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
183 %endif |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
184 mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
185 mova m6, [sixtap_filter_hb+r5*8-32] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
186 mova m7, [sixtap_filter_hb+r5*8-16] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
187 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
188 .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
189 movu m0, [r2-2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
190 mova m1, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
191 mova m2, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
192 %ifidn %1, 4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
193 ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
194 ; shuffle with a memory operand |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
195 punpcklbw m0, [r2+3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
196 %else |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
197 pshufb m0, [filter_h6_shuf1] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
198 %endif |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
199 pshufb m1, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
200 pshufb m2, m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
201 pmaddubsw m0, m5 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
202 pmaddubsw m1, m6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
203 pmaddubsw m2, m7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
204 paddsw m0, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
205 paddsw m0, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
206 paddsw m0, [pw_64] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
207 psraw m0, 7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
208 packuswb m0, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
209 movh [r0], m0 ; store |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
210 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
211 ; go to next line |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
212 add r0, r1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
213 add r2, r3 |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
214 dec r4d ; next row |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
215 jg .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
216 REP_RET |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
217 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
218 cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
219 shl r5d, 4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
220 mova m2, [pw_64] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
221 mova m3, [filter_h2_shuf] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
222 mova m4, [filter_h4_shuf] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
223 %ifdef PIC |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
224 lea r11, [fourtap_filter_hb_m] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
225 %endif |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
226 mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
227 mova m6, [fourtap_filter_hb+r5] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
228 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
229 .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
230 movu m0, [r2-1] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
231 mova m1, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
232 pshufb m0, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
233 pshufb m1, m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
234 pmaddubsw m0, m5 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
235 pmaddubsw m1, m6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
236 paddsw m0, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
237 paddsw m0, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
238 psraw m0, 7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
239 packuswb m0, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
240 movh [r0], m0 ; store |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
241 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
242 ; go to next line |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
243 add r0, r1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
244 add r2, r3 |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
245 dec r4d ; next row |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
246 jg .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
247 REP_RET |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
248 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
249 cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
250 shl r6d, 4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
251 %ifdef PIC |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
252 lea r11, [fourtap_filter_hb_m] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
253 %endif |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
254 mova m5, [fourtap_filter_hb+r6-16] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
255 mova m6, [fourtap_filter_hb+r6] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
256 mova m7, [pw_64] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
257 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
258 ; read 3 lines |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
259 sub r2, r3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
260 movh m0, [r2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
261 movh m1, [r2+ r3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
262 movh m2, [r2+2*r3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
263 add r2, r3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
264 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
265 .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
266 movh m3, [r2+2*r3] ; read new row |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
267 mova m4, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
268 mova m0, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
269 punpcklbw m4, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
270 mova m1, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
271 punpcklbw m2, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
272 pmaddubsw m4, m5 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
273 pmaddubsw m2, m6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
274 paddsw m4, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
275 mova m2, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
276 paddsw m4, m7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
277 psraw m4, 7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
278 packuswb m4, m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
279 movh [r0], m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
280 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
281 ; go to next line |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
282 add r0, r1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
283 add r2, r3 |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
284 dec r4d ; next row |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
285 jg .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
286 REP_RET |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
287 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
288 cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
289 lea r6d, [r6*3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
290 %ifdef PIC |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
291 lea r11, [sixtap_filter_hb_m] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
292 %endif |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
293 lea r6, [sixtap_filter_hb+r6*8] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
294 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
295 ; read 5 lines |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
296 sub r2, r3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
297 sub r2, r3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
298 movh m0, [r2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
299 movh m1, [r2+r3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
300 movh m2, [r2+r3*2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
301 lea r2, [r2+r3*2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
302 add r2, r3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
303 movh m3, [r2] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
304 movh m4, [r2+r3] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
305 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
306 .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
307 movh m5, [r2+2*r3] ; read new row |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
308 mova m6, m0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
309 punpcklbw m6, m5 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
310 mova m0, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
311 punpcklbw m1, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
312 mova m7, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
313 punpcklbw m7, m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
314 pmaddubsw m6, [r6-48] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
315 pmaddubsw m1, [r6-32] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
316 pmaddubsw m7, [r6-16] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
317 paddsw m6, m1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
318 paddsw m6, m7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
319 mova m1, m2 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
320 paddsw m6, [pw_64] |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
321 mova m2, m3 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
322 psraw m6, 7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
323 mova m3, m4 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
324 packuswb m6, m6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
325 mova m4, m5 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
326 movh [r0], m6 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
327 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
328 ; go to next line |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
329 add r0, r1 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
330 add r2, r3 |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
331 dec r4d ; next row |
|
12054
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
332 jg .nextrow |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
333 REP_RET |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
334 %endmacro |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
335 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
336 INIT_MMX |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
337 FILTER_SSSE3 4, 0, 0 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
338 INIT_XMM |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
339 FILTER_SSSE3 8, 8, 7 |
|
b8f80fe02861
SSSE3 versions of width4 VP8 6-tap MC functions
darkshikari
parents:
12018
diff
changeset
|
340 |
| 11975 | 341 ; 4x4 block, H-only 4-tap filter |
| 342 cglobal put_vp8_epel4_h4_mmxext, 6, 6 | |
| 343 shl r5d, 4 | |
| 344 %ifdef PIC | |
| 345 lea r11, [fourtap_filter_hw_m] | |
| 346 %endif | |
| 347 movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words | |
| 348 movq mm5, [fourtap_filter_hw+r5] | |
| 349 movq mm7, [pw_64] | |
| 350 pxor mm6, mm6 | |
| 351 | |
| 352 .nextrow | |
| 353 movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels | |
| 354 | |
| 355 ; first set of 2 pixels | |
| 356 movq mm2, mm1 ; byte ABCD.. | |
| 357 punpcklbw mm1, mm6 ; byte->word ABCD | |
| 358 pshufw mm0, mm2, 9 ; byte CDEF.. | |
| 359 punpcklbw mm0, mm6 ; byte->word CDEF | |
| 360 pshufw mm3, mm1, 0x94 ; word ABBC | |
| 361 pshufw mm1, mm0, 0x94 ; word CDDE | |
| 362 pmaddwd mm3, mm4 ; multiply 2px with F0/F1 | |
| 363 movq mm0, mm1 ; backup for second set of pixels | |
| 364 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
| 365 paddd mm3, mm1 ; finish 1st 2px | |
| 366 | |
| 367 ; second set of 2 pixels, use backup of above | |
| 368 punpckhbw mm2, mm6 ; byte->word EFGH | |
| 369 pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 | |
| 370 pshufw mm1, mm2, 0x94 ; word EFFG | |
| 371 pmaddwd mm1, mm5 ; multiply 2px with F2/F3 | |
| 372 paddd mm0, mm1 ; finish 2nd 2px | |
| 373 | |
| 374 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
| 375 packssdw mm3, mm0 ; merge dword->word (4px) | |
| 376 paddsw mm3, mm7 ; rounding | |
| 377 psraw mm3, 7 | |
| 378 packuswb mm3, mm6 ; clip and word->bytes | |
| 379 movd [r0], mm3 ; store | |
| 380 | |
| 381 ; go to next line | |
| 382 add r0, r1 | |
| 383 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
384 dec r4d ; next row |
| 11975 | 385 jg .nextrow |
| 386 REP_RET | |
| 387 | |
| 388 ; 4x4 block, H-only 6-tap filter | |
| 389 cglobal put_vp8_epel4_h6_mmxext, 6, 6 | |
| 390 lea r5d, [r5*3] | |
| 391 %ifdef PIC | |
| 392 lea r11, [sixtap_filter_hw_m] | |
| 393 %endif | |
| 394 movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words | |
| 395 movq mm5, [sixtap_filter_hw+r5*8-32] | |
| 396 movq mm6, [sixtap_filter_hw+r5*8-16] | |
| 397 movq mm7, [pw_64] | |
| 398 pxor mm3, mm3 | |
| 399 | |
| 400 .nextrow | |
| 401 movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels | |
| 402 | |
| 403 ; first set of 2 pixels | |
| 404 movq mm2, mm1 ; byte ABCD.. | |
| 405 punpcklbw mm1, mm3 ; byte->word ABCD | |
| 406 pshufw mm0, mm2, 0x9 ; byte CDEF.. | |
| 407 punpckhbw mm2, mm3 ; byte->word EFGH | |
| 408 punpcklbw mm0, mm3 ; byte->word CDEF | |
| 409 pshufw mm1, mm1, 0x94 ; word ABBC | |
| 410 pshufw mm2, mm2, 0x94 ; word EFFG | |
| 411 pmaddwd mm1, mm4 ; multiply 2px with F0/F1 | |
| 412 pshufw mm3, mm0, 0x94 ; word CDDE | |
| 413 movq mm0, mm3 ; backup for second set of pixels | |
| 414 pmaddwd mm3, mm5 ; multiply 2px with F2/F3 | |
| 415 paddd mm1, mm3 ; add to 1st 2px cache | |
| 416 movq mm3, mm2 ; backup for second set of pixels | |
| 417 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
| 418 paddd mm1, mm2 ; finish 1st 2px | |
| 419 | |
| 420 ; second set of 2 pixels, use backup of above | |
| 421 movd mm2, [r2+3] ; byte FGHI (prevent overreads) | |
| 422 pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 | |
| 423 pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 | |
| 424 paddd mm0, mm3 ; add to 2nd 2px cache | |
| 425 pxor mm3, mm3 | |
| 426 punpcklbw mm2, mm3 ; byte->word FGHI | |
| 427 pshufw mm2, mm2, 0xE9 ; word GHHI | |
| 428 pmaddwd mm2, mm6 ; multiply 2px with F4/F5 | |
| 429 paddd mm0, mm2 ; finish 2nd 2px | |
| 430 | |
| 431 ; merge two sets of 2 pixels into one set of 4, round/clip/store | |
| 432 packssdw mm1, mm0 ; merge dword->word (4px) | |
| 433 paddsw mm1, mm7 ; rounding | |
| 434 psraw mm1, 7 | |
| 435 packuswb mm1, mm3 ; clip and word->bytes | |
| 436 movd [r0], mm1 ; store | |
| 437 | |
| 438 ; go to next line | |
| 439 add r0, r1 | |
| 440 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
441 dec r4d ; next row |
| 11975 | 442 jg .nextrow |
| 443 REP_RET | |
| 444 | |
| 445 INIT_XMM | |
| 12278 | 446 cglobal put_vp8_epel8_h4_sse2, 6, 6, 10 |
| 447 shl r5d, 5 | |
| 11975 | 448 %ifdef PIC |
| 12278 | 449 lea r11, [fourtap_filter_v_m] |
| 11975 | 450 %endif |
| 12278 | 451 lea r5, [fourtap_filter_v+r5-32] |
| 11975 | 452 pxor m7, m7 |
| 12278 | 453 mova m4, [pw_64] |
| 454 mova m5, [r5+ 0] | |
| 455 mova m6, [r5+16] | |
| 456 %ifdef m8 | |
| 457 mova m8, [r5+32] | |
| 458 mova m9, [r5+48] | |
| 459 %endif | |
| 11975 | 460 .nextrow |
| 12278 | 461 movq m0, [r2-1] |
| 462 movq m1, [r2-0] | |
| 463 movq m2, [r2+1] | |
| 464 movq m3, [r2+2] | |
| 465 punpcklbw m0, m7 | |
| 466 punpcklbw m1, m7 | |
| 467 punpcklbw m2, m7 | |
| 468 punpcklbw m3, m7 | |
| 469 pmullw m0, m5 | |
| 470 pmullw m1, m6 | |
| 471 %ifdef m8 | |
| 472 pmullw m2, m8 | |
| 473 pmullw m3, m9 | |
| 474 %else | |
| 475 pmullw m2, [r5+32] | |
| 476 pmullw m3, [r5+48] | |
| 477 %endif | |
| 478 paddsw m0, m1 | |
| 479 paddsw m2, m3 | |
| 480 paddsw m0, m2 | |
| 481 paddsw m0, m4 | |
| 11975 | 482 psraw m0, 7 |
| 483 packuswb m0, m7 | |
| 484 movh [r0], m0 ; store | |
| 485 | |
| 486 ; go to next line | |
| 487 add r0, r1 | |
| 488 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
489 dec r4d ; next row |
| 11975 | 490 jg .nextrow |
| 491 REP_RET | |
| 492 | |
| 12278 | 493 cglobal put_vp8_epel8_h6_sse2, 6, 6, 14 |
| 11975 | 494 lea r5d, [r5*3] |
| 12278 | 495 shl r5d, 4 |
| 11975 | 496 %ifdef PIC |
| 12278 | 497 lea r11, [sixtap_filter_v_m] |
| 11975 | 498 %endif |
| 12278 | 499 lea r5, [sixtap_filter_v+r5-96] |
| 11975 | 500 pxor m7, m7 |
| 12278 | 501 mova m6, [pw_64] |
| 502 %ifdef m8 | |
| 503 mova m8, [r5+ 0] | |
| 504 mova m9, [r5+16] | |
| 505 mova m10, [r5+32] | |
| 506 mova m11, [r5+48] | |
| 507 mova m12, [r5+64] | |
| 508 mova m13, [r5+80] | |
| 509 %endif | |
| 11975 | 510 .nextrow |
| 12278 | 511 movq m0, [r2-2] |
| 512 movq m1, [r2-1] | |
| 513 movq m2, [r2-0] | |
| 514 movq m3, [r2+1] | |
| 515 movq m4, [r2+2] | |
| 516 movq m5, [r2+3] | |
| 517 punpcklbw m0, m7 | |
| 518 punpcklbw m1, m7 | |
| 519 punpcklbw m2, m7 | |
| 520 punpcklbw m3, m7 | |
| 521 punpcklbw m4, m7 | |
| 522 punpcklbw m5, m7 | |
| 523 %ifdef m8 | |
| 524 pmullw m0, m8 | |
| 525 pmullw m1, m9 | |
| 526 pmullw m2, m10 | |
| 527 pmullw m3, m11 | |
| 528 pmullw m4, m12 | |
| 529 pmullw m5, m13 | |
| 530 %else | |
| 531 pmullw m0, [r5+ 0] | |
| 532 pmullw m1, [r5+16] | |
| 533 pmullw m2, [r5+32] | |
| 534 pmullw m3, [r5+48] | |
| 535 pmullw m4, [r5+64] | |
| 536 pmullw m5, [r5+80] | |
| 537 %endif | |
| 538 paddsw m1, m4 | |
| 539 paddsw m0, m5 | |
| 540 paddsw m1, m2 | |
| 541 paddsw m0, m3 | |
| 542 paddsw m0, m1 | |
| 543 paddsw m0, m6 | |
| 11975 | 544 psraw m0, 7 |
| 545 packuswb m0, m7 | |
| 546 movh [r0], m0 ; store | |
| 547 | |
| 548 ; go to next line | |
| 549 add r0, r1 | |
| 550 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
551 dec r4d ; next row |
| 11975 | 552 jg .nextrow |
| 553 REP_RET | |
| 554 | |
| 555 %macro FILTER_V 3 | |
| 556 ; block of width 4 or 8 (macro argument 2), r4d rows, V-only 4-tap filter | |
| 557 cglobal put_vp8_epel%2_v4_%1, 7, 7, %3 | |
| 558 shl r6d, 5 | |
| 559 %ifdef PIC | |
| 560 lea r11, [fourtap_filter_v_m] | |
| 561 %endif | |
| 562 lea r6, [fourtap_filter_v+r6-32] | |
| 563 mova m6, [pw_64] | |
| 564 pxor m7, m7 | |
| 565 mova m5, [r6+48] | |
| 566 | |
| 567 ; read 3 lines | |
| 568 sub r2, r3 | |
| 569 movh m0, [r2] | |
| 570 movh m1, [r2+ r3] | |
| 571 movh m2, [r2+2*r3] | |
| 572 add r2, r3 | |
| 573 punpcklbw m0, m7 | |
| 574 punpcklbw m1, m7 | |
| 575 punpcklbw m2, m7 | |
| 576 | |
| 577 .nextrow | |
| 578 ; first calculate negative taps (to prevent losing positive overflows) | |
| 579 movh m4, [r2+2*r3] ; read new row | |
| 580 punpcklbw m4, m7 | |
| 581 mova m3, m4 | |
| 582 pmullw m0, [r6+0] | |
| 583 pmullw m4, m5 | |
| 584 paddsw m4, m0 | |
| 585 | |
| 586 ; then calculate positive taps | |
| 587 mova m0, m1 | |
| 588 pmullw m1, [r6+16] | |
| 589 paddsw m4, m1 | |
| 590 mova m1, m2 | |
| 591 pmullw m2, [r6+32] | |
| 592 paddsw m4, m2 | |
| 593 mova m2, m3 | |
| 594 | |
| 595 ; round/clip/store | |
| 596 paddsw m4, m6 | |
| 597 psraw m4, 7 | |
| 598 packuswb m4, m7 | |
| 599 movh [r0], m4 | |
| 600 | |
| 601 ; go to next line | |
| 602 add r0, r1 | |
| 603 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
604 dec r4d ; next row |
| 11975 | 605 jg .nextrow |
| 606 REP_RET | |
| 607 | |
| 608 | |
| 609 ; block of width 4 or 8 (macro argument 2), r4d rows, V-only 6-tap filter | |
| 610 cglobal put_vp8_epel%2_v6_%1, 7, 7, %3 | |
| 611 shl r6d, 4 | |
| 612 lea r6, [r6*3] | |
| 613 %ifdef PIC | |
| 614 lea r11, [sixtap_filter_v_m] | |
| 615 %endif | |
| 616 lea r6, [sixtap_filter_v+r6-96] | |
| 617 pxor m7, m7 | |
| 618 | |
| 619 ; read 5 lines | |
| 620 sub r2, r3 | |
| 621 sub r2, r3 | |
| 622 movh m0, [r2] | |
| 623 movh m1, [r2+r3] | |
| 624 movh m2, [r2+r3*2] | |
| 625 lea r2, [r2+r3*2] | |
| 626 add r2, r3 | |
| 627 movh m3, [r2] | |
| 628 movh m4, [r2+r3] | |
| 629 punpcklbw m0, m7 | |
| 630 punpcklbw m1, m7 | |
| 631 punpcklbw m2, m7 | |
| 632 punpcklbw m3, m7 | |
| 633 punpcklbw m4, m7 | |
| 634 | |
| 635 .nextrow | |
| 636 ; first calculate negative taps (to prevent losing positive overflows) | |
| 637 mova m5, m1 | |
| 638 pmullw m5, [r6+16] | |
| 639 mova m6, m4 | |
| 640 pmullw m6, [r6+64] | |
| 641 paddsw m6, m5 | |
| 642 | |
| 643 ; then calculate positive taps | |
| 644 movh m5, [r2+2*r3] ; read new row | |
| 645 punpcklbw m5, m7 | |
| 646 pmullw m0, [r6+0] | |
| 647 paddsw m6, m0 | |
| 648 mova m0, m1 | |
| 649 mova m1, m2 | |
| 650 pmullw m2, [r6+32] | |
| 651 paddsw m6, m2 | |
| 652 mova m2, m3 | |
| 653 pmullw m3, [r6+48] | |
| 654 paddsw m6, m3 | |
| 655 mova m3, m4 | |
| 656 mova m4, m5 | |
| 657 pmullw m5, [r6+80] | |
| 658 paddsw m6, m5 | |
| 659 | |
| 660 ; round/clip/store | |
| 661 paddsw m6, [pw_64] | |
| 662 psraw m6, 7 | |
| 663 packuswb m6, m7 | |
| 664 movh [r0], m6 | |
| 665 | |
| 666 ; go to next line | |
| 667 add r0, r1 | |
| 668 add r2, r3 | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
669 dec r4d ; next row |
| 11975 | 670 jg .nextrow |
| 671 REP_RET | |
| 672 %endmacro | |
| 673 | |
| 674 INIT_MMX | |
| 675 FILTER_V mmxext, 4, 0 | |
| 676 INIT_XMM | |
| 677 FILTER_V sse2, 8, 8 | |
| 678 | |
| 11991 | 679 %macro FILTER_BILINEAR 3 |
| 680 cglobal put_vp8_bilinear%2_v_%1, 7,7,%3 | |
| 681 mov r5d, 8*16 | |
| 682 shl r6d, 4 | |
| 683 sub r5d, r6d | |
| 684 %ifdef PIC | |
| 685 lea r11, [bilinear_filter_vw_m] | |
| 686 %endif | |
| 687 pxor m6, m6 | |
| 12000 | 688 mova m4, [bilinear_filter_vw+r5-16] |
| 689 mova m5, [bilinear_filter_vw+r6-16] | |
| 11991 | 690 .nextrow |
| 691 movh m0, [r2+r3*0] | |
| 692 movh m1, [r2+r3*1] | |
| 693 movh m3, [r2+r3*2] | |
| 694 punpcklbw m0, m6 | |
| 695 punpcklbw m1, m6 | |
| 696 punpcklbw m3, m6 | |
| 697 mova m2, m1 | |
| 698 pmullw m0, m4 | |
| 699 pmullw m1, m5 | |
| 700 pmullw m2, m4 | |
| 701 pmullw m3, m5 | |
| 702 paddsw m0, m1 | |
| 703 paddsw m2, m3 | |
| 704 psraw m0, 2 | |
| 705 psraw m2, 2 | |
| 706 pavgw m0, m6 | |
| 707 pavgw m2, m6 | |
| 708 %ifidn %1, mmxext | |
| 709 packuswb m0, m0 | |
| 710 packuswb m2, m2 | |
| 711 movh [r0+r1*0], m0 | |
| 712 movh [r0+r1*1], m2 | |
| 713 %else | |
| 714 packuswb m0, m2 | |
| 715 movh [r0+r1*0], m0 | |
| 716 movhps [r0+r1*1], m0 | |
| 717 %endif | |
| 718 | |
| 719 lea r0, [r0+r1*2] | |
| 720 lea r2, [r2+r3*2] | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
721 sub r4d, 2 |
| 11991 | 722 jg .nextrow |
| 723 REP_RET | |
| 724 | |
| 725 cglobal put_vp8_bilinear%2_h_%1, 7,7,%3 | |
| 726 mov r6d, 8*16 | |
| 727 shl r5d, 4 | |
| 728 sub r6d, r5d | |
| 729 %ifdef PIC | |
| 730 lea r11, [bilinear_filter_vw_m] | |
| 731 %endif | |
| 732 pxor m6, m6 | |
| 12000 | 733 mova m4, [bilinear_filter_vw+r6-16] |
| 734 mova m5, [bilinear_filter_vw+r5-16] | |
| 11991 | 735 .nextrow |
| 736 movh m0, [r2+r3*0+0] | |
| 737 movh m1, [r2+r3*0+1] | |
| 738 movh m2, [r2+r3*1+0] | |
| 739 movh m3, [r2+r3*1+1] | |
| 740 punpcklbw m0, m6 | |
| 741 punpcklbw m1, m6 | |
| 742 punpcklbw m2, m6 | |
| 743 punpcklbw m3, m6 | |
| 744 pmullw m0, m4 | |
| 745 pmullw m1, m5 | |
| 746 pmullw m2, m4 | |
| 747 pmullw m3, m5 | |
| 748 paddsw m0, m1 | |
| 749 paddsw m2, m3 | |
| 750 psraw m0, 2 | |
| 751 psraw m2, 2 | |
| 752 pavgw m0, m6 | |
| 753 pavgw m2, m6 | |
| 754 %ifidn %1, mmxext | |
| 755 packuswb m0, m0 | |
| 756 packuswb m2, m2 | |
| 757 movh [r0+r1*0], m0 | |
| 758 movh [r0+r1*1], m2 | |
| 759 %else | |
| 760 packuswb m0, m2 | |
| 761 movh [r0+r1*0], m0 | |
| 762 movhps [r0+r1*1], m0 | |
| 763 %endif | |
| 764 | |
| 765 lea r0, [r0+r1*2] | |
| 766 lea r2, [r2+r3*2] | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
767 sub r4d, 2 |
| 11991 | 768 jg .nextrow |
| 769 REP_RET | |
| 770 %endmacro | |
| 771 | |
| 772 INIT_MMX | |
| 773 FILTER_BILINEAR mmxext, 4, 0 | |
| 774 INIT_XMM | |
| 775 FILTER_BILINEAR sse2, 8, 7 | |
| 776 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
777 %macro FILTER_BILINEAR_SSSE3 1 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
778 cglobal put_vp8_bilinear%1_v_ssse3, 7,7 |
| 11991 | 779 shl r6d, 4 |
| 780 %ifdef PIC | |
| 781 lea r11, [bilinear_filter_vb_m] | |
| 782 %endif | |
| 783 pxor m4, m4 | |
| 12000 | 784 mova m3, [bilinear_filter_vb+r6-16] |
| 11991 | 785 .nextrow |
| 786 movh m0, [r2+r3*0] | |
| 787 movh m1, [r2+r3*1] | |
| 788 movh m2, [r2+r3*2] | |
| 789 punpcklbw m0, m1 | |
| 790 punpcklbw m1, m2 | |
| 791 pmaddubsw m0, m3 | |
| 792 pmaddubsw m1, m3 | |
| 793 psraw m0, 2 | |
| 794 psraw m1, 2 | |
| 795 pavgw m0, m4 | |
| 796 pavgw m1, m4 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
797 %if mmsize==8 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
798 packuswb m0, m0 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
799 packuswb m1, m1 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
800 movh [r0+r1*0], m0 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
801 movh [r0+r1*1], m1 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
802 %else |
| 11991 | 803 packuswb m0, m1 |
| 804 movh [r0+r1*0], m0 | |
| 805 movhps [r0+r1*1], m0 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
806 %endif |
| 11991 | 807 |
| 808 lea r0, [r0+r1*2] | |
| 809 lea r2, [r2+r3*2] | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
810 sub r4d, 2 |
| 11991 | 811 jg .nextrow |
| 812 REP_RET | |
| 813 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
814 cglobal put_vp8_bilinear%1_h_ssse3, 7,7 |
| 11991 | 815 shl r5d, 4 |
| 816 %ifdef PIC | |
| 817 lea r11, [bilinear_filter_vb_m] | |
| 818 %endif | |
| 819 pxor m4, m4 | |
| 820 mova m2, [filter_h2_shuf] | |
| 12000 | 821 mova m3, [bilinear_filter_vb+r5-16] |
| 11991 | 822 .nextrow |
| 823 movu m0, [r2+r3*0] | |
| 824 movu m1, [r2+r3*1] | |
| 825 pshufb m0, m2 | |
| 826 pshufb m1, m2 | |
| 827 pmaddubsw m0, m3 | |
| 828 pmaddubsw m1, m3 | |
| 829 psraw m0, 2 | |
| 830 psraw m1, 2 | |
| 831 pavgw m0, m4 | |
| 832 pavgw m1, m4 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
833 %if mmsize==8 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
834 packuswb m0, m0 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
835 packuswb m1, m1 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
836 movh [r0+r1*0], m0 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
837 movh [r0+r1*1], m1 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
838 %else |
| 11991 | 839 packuswb m0, m1 |
| 840 movh [r0+r1*0], m0 | |
| 841 movhps [r0+r1*1], m0 | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
842 %endif |
| 11991 | 843 |
| 844 lea r0, [r0+r1*2] | |
| 845 lea r2, [r2+r3*2] | |
|
12400
4f13b2ded34d
Fix segfaults in VP8 SIMD code on Win64 (and FATE/win64 failures).
rbultje
parents:
12340
diff
changeset
|
846 sub r4d, 2 |
| 11991 | 847 jg .nextrow |
| 848 REP_RET | |
|
12082
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
849 %endmacro |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
850 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
851 INIT_MMX |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
852 FILTER_BILINEAR_SSSE3 4 |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
853 INIT_XMM |
|
8527154f6e81
SSSE3 versions of vp8 width4 bilinear MC functions
darkshikari
parents:
12054
diff
changeset
|
854 FILTER_BILINEAR_SSSE3 8 |
| 11991 | 855 |
| 11992 | 856 cglobal put_vp8_pixels8_mmx, 5,5 |
| 857 .nextrow: | |
| 858 movq mm0, [r2+r3*0] | |
| 859 movq mm1, [r2+r3*1] | |
| 860 lea r2, [r2+r3*2] | |
| 861 movq [r0+r1*0], mm0 | |
| 862 movq [r0+r1*1], mm1 | |
| 863 lea r0, [r0+r1*2] | |
| 864 sub r4d, 2 | |
| 865 jg .nextrow | |
| 866 REP_RET | |
| 867 | |
| 868 cglobal put_vp8_pixels16_mmx, 5,5 | |
| 869 .nextrow: | |
| 870 movq mm0, [r2+r3*0+0] | |
| 871 movq mm1, [r2+r3*0+8] | |
| 872 movq mm2, [r2+r3*1+0] | |
| 873 movq mm3, [r2+r3*1+8] | |
| 874 lea r2, [r2+r3*2] | |
| 875 movq [r0+r1*0+0], mm0 | |
| 876 movq [r0+r1*0+8], mm1 | |
| 877 movq [r0+r1*1+0], mm2 | |
| 878 movq [r0+r1*1+8], mm3 | |
| 879 lea r0, [r0+r1*2] | |
| 880 sub r4d, 2 | |
| 881 jg .nextrow | |
| 882 REP_RET | |
| 883 | |
| 884 cglobal put_vp8_pixels16_sse, 5,5,2 | |
| 885 .nextrow: | |
| 886 movups xmm0, [r2+r3*0] | |
| 887 movups xmm1, [r2+r3*1] | |
| 888 lea r2, [r2+r3*2] | |
| 889 movaps [r0+r1*0], xmm0 | |
| 890 movaps [r0+r1*1], xmm1 | |
| 891 lea r0, [r0+r1*2] | |
| 892 sub r4d, 2 | |
| 893 jg .nextrow | |
| 894 REP_RET | |
| 895 | |
| 11975 | 896 ;----------------------------------------------------------------------------- |
| 897 ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); | |
| 898 ;----------------------------------------------------------------------------- | |
| 899 | |
| 12238 | 900 %macro ADD_DC 4 |
| 901 %4 m2, [r0+%3] | |
| 902 %4 m3, [r0+r2+%3] | |
| 903 %4 m4, [r1+%3] | |
| 904 %4 m5, [r1+r2+%3] | |
| 905 paddusb m2, %1 | |
| 906 paddusb m3, %1 | |
| 907 paddusb m4, %1 | |
| 908 paddusb m5, %1 | |
| 909 psubusb m2, %2 | |
| 910 psubusb m3, %2 | |
| 911 psubusb m4, %2 | |
| 912 psubusb m5, %2 | |
| 913 %4 [r0+%3], m2 | |
| 914 %4 [r0+r2+%3], m3 | |
| 915 %4 [r1+%3], m4 | |
| 916 %4 [r1+r2+%3], m5 | |
| 917 %endmacro | |
| 918 | |
| 919 INIT_MMX | |
| 11975 | 920 cglobal vp8_idct_dc_add_mmx, 3, 3 |
| 921 ; load data | |
| 12238 | 922 movd m0, [r1] |
| 11975 | 923 |
| 924 ; calculate DC | |
| 12238 | 925 paddw m0, [pw_4] |
| 926 pxor m1, m1 | |
| 927 psraw m0, 3 | |
| 928 movd [r1], m1 | |
| 929 psubw m1, m0 | |
| 930 packuswb m0, m0 | |
| 931 packuswb m1, m1 | |
| 932 punpcklbw m0, m0 | |
| 933 punpcklbw m1, m1 | |
| 934 punpcklwd m0, m0 | |
| 935 punpcklwd m1, m1 | |
| 11975 | 936 |
| 937 ; add DC | |
| 12238 | 938 lea r1, [r0+r2*2] |
| 939 ADD_DC m0, m1, 0, movh | |
| 11975 | 940 RET |
| 941 | |
| 12238 | 942 INIT_XMM |
| 11975 | 943 cglobal vp8_idct_dc_add_sse4, 3, 3, 6 |
| 944 ; load data | |
| 12238 | 945 movd m0, [r1] |
| 946 pxor m1, m1 | |
| 947 | |
| 948 ; calculate DC | |
| 949 paddw m0, [pw_4] | |
| 950 movd [r1], m1 | |
| 951 lea r1, [r0+r2*2] | |
| 952 movd m2, [r0] | |
| 953 movd m3, [r0+r2] | |
| 954 movd m4, [r1] | |
| 955 movd m5, [r1+r2] | |
| 956 psraw m0, 3 | |
| 957 pshuflw m0, m0, 0 | |
| 958 punpcklqdq m0, m0 | |
| 959 punpckldq m2, m3 | |
| 960 punpckldq m4, m5 | |
| 961 punpcklbw m2, m1 | |
| 962 punpcklbw m4, m1 | |
| 963 paddw m2, m0 | |
| 964 paddw m4, m0 | |
| 965 packuswb m2, m4 | |
| 966 movd [r0], m2 | |
| 967 pextrd [r0+r2], m2, 1 | |
| 968 pextrd [r1], m2, 2 | |
| 969 pextrd [r1+r2], m2, 3 | |
| 970 RET | |
| 971 | |
| 972 ;----------------------------------------------------------------------------- | |
|
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
973 ; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
| 12238 | 974 ;----------------------------------------------------------------------------- |
| 975 | |
| 976 INIT_MMX | |
|
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
977 cglobal vp8_idct_dc_add4y_mmx, 3, 3 |
| 12238 | 978 ; load data |
| 979 movd m0, [r1+32*0] ; A | |
| 980 movd m1, [r1+32*2] ; C | |
| 981 punpcklwd m0, [r1+32*1] ; A B | |
| 982 punpcklwd m1, [r1+32*3] ; C D | |
| 12239 | 983 punpckldq m0, m1 ; A B C D |
| 12238 | 984 pxor m6, m6 |
| 11975 | 985 |
| 986 ; calculate DC | |
| 12238 | 987 paddw m0, [pw_4] |
| 988 movd [r1+32*0], m6 | |
| 989 movd [r1+32*1], m6 | |
| 990 movd [r1+32*2], m6 | |
| 991 movd [r1+32*3], m6 | |
| 992 psraw m0, 3 | |
| 993 psubw m6, m0 | |
| 994 packuswb m0, m0 | |
| 995 packuswb m6, m6 | |
| 996 punpcklbw m0, m0 ; AABBCCDD | |
| 997 punpcklbw m6, m6 ; AABBCCDD | |
| 998 movq m1, m0 | |
| 999 movq m7, m6 | |
| 1000 punpcklbw m0, m0 ; AAAABBBB | |
| 1001 punpckhbw m1, m1 ; CCCCDDDD | |
| 1002 punpcklbw m6, m6 ; AAAABBBB | |
| 1003 punpckhbw m7, m7 ; CCCCDDDD | |
| 1004 | |
| 1005 ; add DC | |
| 1006 lea r1, [r0+r2*2] | |
| 1007 ADD_DC m0, m6, 0, mova | |
| 1008 ADD_DC m1, m7, 8, mova | |
| 1009 RET | |
| 1010 | |
| 1011 INIT_XMM | |
|
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1012 cglobal vp8_idct_dc_add4y_sse2, 3, 3, 6 |
| 12238 | 1013 ; load data |
| 1014 movd m0, [r1+32*0] ; A | |
| 1015 movd m1, [r1+32*2] ; C | |
| 1016 punpcklwd m0, [r1+32*1] ; A B | |
| 1017 punpcklwd m1, [r1+32*3] ; C D | |
| 12239 | 1018 punpckldq m0, m1 ; A B C D |
| 12238 | 1019 pxor m1, m1 |
| 1020 | |
| 1021 ; calculate DC | |
| 1022 paddw m0, [pw_4] | |
| 1023 movd [r1+32*0], m1 | |
| 1024 movd [r1+32*1], m1 | |
| 1025 movd [r1+32*2], m1 | |
| 1026 movd [r1+32*3], m1 | |
| 1027 psraw m0, 3 | |
| 1028 psubw m1, m0 | |
| 1029 packuswb m0, m0 | |
| 1030 packuswb m1, m1 | |
| 1031 punpcklbw m0, m0 | |
| 1032 punpcklbw m1, m1 | |
| 1033 punpcklbw m0, m0 | |
| 1034 punpcklbw m1, m1 | |
| 1035 | |
| 1036 ; add DC | |
| 1037 lea r1, [r0+r2*2] | |
| 1038 ADD_DC m0, m1, 0, mova | |
| 11975 | 1039 RET |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1040 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1041 ;----------------------------------------------------------------------------- |
|
12241
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1042 ; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride); |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1043 ;----------------------------------------------------------------------------- |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1044 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1045 INIT_MMX |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1046 cglobal vp8_idct_dc_add4uv_mmx, 3, 3 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1047 ; load data |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1048 movd m0, [r1+32*0] ; A |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1049 movd m1, [r1+32*2] ; C |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1050 punpcklwd m0, [r1+32*1] ; A B |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1051 punpcklwd m1, [r1+32*3] ; C D |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1052 punpckldq m0, m1 ; A B C D |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1053 pxor m6, m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1054 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1055 ; calculate DC |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1056 paddw m0, [pw_4] |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1057 movd [r1+32*0], m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1058 movd [r1+32*1], m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1059 movd [r1+32*2], m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1060 movd [r1+32*3], m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1061 psraw m0, 3 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1062 psubw m6, m0 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1063 packuswb m0, m0 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1064 packuswb m6, m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1065 punpcklbw m0, m0 ; AABBCCDD |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1066 punpcklbw m6, m6 ; AABBCCDD |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1067 movq m1, m0 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1068 movq m7, m6 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1069 punpcklbw m0, m0 ; AAAABBBB |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1070 punpckhbw m1, m1 ; CCCCDDDD |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1071 punpcklbw m6, m6 ; AAAABBBB |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1072 punpckhbw m7, m7 ; CCCCDDDD |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1073 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1074 ; add DC |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1075 lea r1, [r0+r2*2] |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1076 ADD_DC m0, m6, 0, mova |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1077 lea r0, [r0+r2*4] |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1078 lea r1, [r1+r2*4] |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1079 ADD_DC m1, m7, 0, mova |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1080 RET |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1081 |
|
c7f6ddcc5c01
VP8: optimize DC-only chroma case in the same way as luma.
darkshikari
parents:
12239
diff
changeset
|
1082 ;----------------------------------------------------------------------------- |
| 12013 | 1083 ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride); |
| 1084 ;----------------------------------------------------------------------------- | |
| 1085 | |
| 1086 ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) | |
| 1087 ; this macro assumes that m6/m7 have words for 20091/17734 loaded | |
| 1088 %macro VP8_MULTIPLY_SUMSUB 4 | |
| 1089 mova %3, %1 | |
| 1090 mova %4, %2 | |
| 1091 pmulhw %3, m6 ;20091(1) | |
| 1092 pmulhw %4, m6 ;20091(2) | |
| 1093 paddw %3, %1 | |
| 1094 paddw %4, %2 | |
| 12018 | 1095 paddw %1, %1 |
| 1096 paddw %2, %2 | |
| 12013 | 1097 pmulhw %1, m7 ;35468(1) |
| 1098 pmulhw %2, m7 ;35468(2) | |
| 1099 psubw %1, %4 | |
| 1100 paddw %2, %3 | |
| 1101 %endmacro | |
| 1102 | |
| 1103 ; calculate x0=%1+%3; x1=%1-%3 | |
| 1104 ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) | |
| 1105 ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) | |
| 1106 ; %5/%6 are temporary registers | |
| 1107 ; we assume m6/m7 have constant words 20091/17734 loaded in them | |
| 1108 %macro VP8_IDCT_TRANSFORM4x4_1D 6 | |
| 1109 SUMSUB_BA m%3, m%1, m%5 ;t0, t1 | |
| 1110 VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 | |
| 1111 SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3 | |
| 1112 SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2 | |
| 1113 SWAP %4, %1 | |
| 1114 SWAP %4, %3 | |
| 1115 %endmacro | |
| 1116 | |
| 1117 INIT_MMX | |
|
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1118 %macro VP8_IDCT_ADD 1 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1119 cglobal vp8_idct_add_%1, 3, 3 |
| 12013 | 1120 ; load block data |
|
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1121 movq m0, [r1+ 0] |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1122 movq m1, [r1+ 8] |
| 12013 | 1123 movq m2, [r1+16] |
| 1124 movq m3, [r1+24] | |
| 1125 movq m6, [pw_20091] | |
| 1126 movq m7, [pw_17734] | |
|
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1127 %ifidn %1, sse |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1128 xorps xmm0, xmm0 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1129 movaps [r1+ 0], xmm0 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1130 movaps [r1+16], xmm0 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1131 %else |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1132 pxor m4, m4 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1133 movq [r1+ 0], m4 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1134 movq [r1+ 8], m4 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1135 movq [r1+16], m4 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1136 movq [r1+24], m4 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1137 %endif |
| 12013 | 1138 |
| 1139 ; actual IDCT | |
| 1140 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
| 1141 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
| 1142 paddw m0, [pw_4] | |
| 1143 VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 | |
| 1144 TRANSPOSE4x4W 0, 1, 2, 3, 4 | |
| 1145 | |
| 1146 ; store | |
| 1147 pxor m4, m4 | |
| 1148 lea r1, [r0+2*r2] | |
| 1149 STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2 | |
| 1150 STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2 | |
| 1151 | |
| 1152 RET | |
|
12235
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1153 %endmacro |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1154 |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1155 VP8_IDCT_ADD mmx |
|
e08d65897115
VP8: clear DCT blocks in iDCT instead of using clear_blocks.
darkshikari
parents:
12227
diff
changeset
|
1156 VP8_IDCT_ADD sse |
| 12013 | 1157 |
| 1158 ;----------------------------------------------------------------------------- | |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1159 ; void vp8_luma_dc_wht_<opt>(DCTELEM block[4][4][16], DCTELEM dc[16]) |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1160 ;----------------------------------------------------------------------------- |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1161 |
| 12209 | 1162 %macro SCATTER_WHT 3 |
| 1163 movd r1d, m%1 | |
| 1164 movd r2d, m%2 | |
| 1165 mov [r0+2*16*(0+%3)], r1w | |
| 1166 mov [r0+2*16*(1+%3)], r2w | |
| 1167 shr r1d, 16 | |
| 1168 shr r2d, 16 | |
| 1169 psrlq m%1, 32 | |
| 1170 psrlq m%2, 32 | |
| 1171 mov [r0+2*16*(4+%3)], r1w | |
| 1172 mov [r0+2*16*(5+%3)], r2w | |
| 1173 movd r1d, m%1 | |
| 1174 movd r2d, m%2 | |
| 1175 mov [r0+2*16*(8+%3)], r1w | |
| 1176 mov [r0+2*16*(9+%3)], r2w | |
| 1177 shr r1d, 16 | |
| 1178 shr r2d, 16 | |
| 1179 mov [r0+2*16*(12+%3)], r1w | |
| 1180 mov [r0+2*16*(13+%3)], r2w | |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1181 %endmacro |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1182 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1183 %macro HADAMARD4_1D 4 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1184 SUMSUB_BADC m%2, m%1, m%4, m%3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1185 SUMSUB_BADC m%4, m%2, m%3, m%1 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1186 SWAP %1, %4, %3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1187 %endmacro |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1188 |
|
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1189 %macro VP8_DC_WHT 1 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1190 cglobal vp8_luma_dc_wht_%1, 2,3 |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1191 movq m0, [r1] |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1192 movq m1, [r1+8] |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1193 movq m2, [r1+16] |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1194 movq m3, [r1+24] |
|
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1195 %ifidn %1, sse |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1196 xorps xmm0, xmm0 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1197 movaps [r1+ 0], xmm0 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1198 movaps [r1+16], xmm0 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1199 %else |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1200 pxor m4, m4 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1201 movq [r1+ 0], m4 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1202 movq [r1+ 8], m4 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1203 movq [r1+16], m4 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1204 movq [r1+24], m4 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1205 %endif |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1206 HADAMARD4_1D 0, 1, 2, 3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1207 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1208 paddw m0, [pw_3] |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1209 HADAMARD4_1D 0, 1, 2, 3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1210 psraw m0, 3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1211 psraw m1, 3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1212 psraw m2, 3 |
|
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1213 psraw m3, 3 |
| 12209 | 1214 SCATTER_WHT 0, 1, 0 |
| 1215 SCATTER_WHT 2, 3, 2 | |
|
12006
d584c7373a64
Add mmxext version of VP8 DC Hadamard transform
darkshikari
parents:
12000
diff
changeset
|
1216 RET |
|
12340
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1217 %endmacro |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1218 |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1219 INIT_MMX |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1220 VP8_DC_WHT mmx |
|
2d15f62f4f8a
VP8: move zeroing of luma DC block into the WHT
darkshikari
parents:
12334
diff
changeset
|
1221 VP8_DC_WHT sse |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1222 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1223 ;----------------------------------------------------------------------------- |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1224 ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1225 ;----------------------------------------------------------------------------- |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1226 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1227 ; macro called with 7 mm register indexes as argument, and 4 regular registers |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1228 ; |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1229 ; first 4 mm registers will carry the transposed pixel data |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1230 ; the other three are scratchspace (one would be sufficient, but this allows |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1231 ; for more spreading/pipelining and thus faster execution on OOE CPUs) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1232 ; |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1233 ; first two regular registers are buf+4*stride and buf+5*stride |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1234 ; third is -stride, fourth is +stride |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1235 %macro READ_8x4_INTERLEAVED 11 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1236 ; interleave 8 (A-H) rows of 4 pixels each |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1237 movd m%1, [%8+%10*4] ; A0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1238 movd m%5, [%9+%10*4] ; B0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1239 movd m%2, [%8+%10*2] ; C0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1240 movd m%6, [%8+%10] ; D0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1241 movd m%3, [%8] ; E0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1242 movd m%7, [%9] ; F0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1243 movd m%4, [%9+%11] ; G0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1244 punpcklbw m%1, m%5 ; A/B interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1245 movd m%5, [%9+%11*2] ; H0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1246 punpcklbw m%2, m%6 ; C/D interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1247 punpcklbw m%3, m%7 ; E/F interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1248 punpcklbw m%4, m%5 ; G/H interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1249 %endmacro |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1250 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1251 ; macro called with 7 mm register indexes as argument, and 5 regular registers |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1252 ; first 11 mean the same as READ_8x4_INTERLEAVED above |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1253 ; fifth regular register is scratchspace to reach the bottom 8 rows, it |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1254 ; will be set to second regular register + 8*stride at the end |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1255 %macro READ_16x4_INTERLEAVED 12 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1256 ; interleave 16 (A-P) rows of 4 pixels each |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1257 lea %12, [r0+8*r2] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1258 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1259 ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1260 movd m%1, [%8+%10*4] ; A0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1261 movd m%3, [%12+%10*4] ; I0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1262 movd m%2, [%8+%10*2] ; C0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1263 movd m%4, [%12+%10*2] ; K0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1264 movd m%6, [%8+%10] ; D0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1265 movd m%5, [%12+%10] ; L0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1266 movd m%7, [%12] ; M0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1267 add %12, %11 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1268 punpcklbw m%1, m%3 ; A/I |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1269 movd m%3, [%8] ; E0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1270 punpcklbw m%2, m%4 ; C/K |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1271 punpcklbw m%6, m%5 ; D/L |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1272 punpcklbw m%3, m%7 ; E/M |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1273 punpcklbw m%2, m%6 ; C/D/K/L interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1274 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1275 ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1276 movd m%5, [%9+%10*4] ; B0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1277 movd m%4, [%12+%10*4] ; J0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1278 movd m%7, [%9] ; F0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1279 movd m%6, [%12] ; N0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1280 punpcklbw m%5, m%4 ; B/J |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1281 punpcklbw m%7, m%6 ; F/N |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1282 punpcklbw m%1, m%5 ; A/B/I/J interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1283 punpcklbw m%3, m%7 ; E/F/M/N interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1284 movd m%4, [%9+%11] ; G0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1285 movd m%6, [%12+%11] ; O0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1286 movd m%5, [%9+%11*2] ; H0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1287 movd m%7, [%12+%11*2] ; P0-3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1288 punpcklbw m%4, m%6 ; G/O |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1289 punpcklbw m%5, m%7 ; H/P |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1290 punpcklbw m%4, m%5 ; G/H/O/P interleaved |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1291 %endmacro |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1292 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1293 ; write 4 mm registers of 2 dwords each |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1294 ; first four arguments are mm register indexes containing source data |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1295 ; last four are registers containing buf+4*stride, buf+5*stride, |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1296 ; -stride and +stride |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1297 %macro WRITE_4x2D 8 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1298 ; write out (2 dwords per register) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1299 movd [%5+%7*4], m%1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1300 movd [%5+%7*2], m%2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1301 movd [%5], m%3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1302 movd [%6+%8], m%4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1303 punpckhdq m%1, m%1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1304 punpckhdq m%2, m%2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1305 punpckhdq m%3, m%3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1306 punpckhdq m%4, m%4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1307 movd [%6+%7*4], m%1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1308 movd [%5+%7], m%2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1309 movd [%6], m%3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1310 movd [%6+%8*2], m%4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1311 %endmacro |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1312 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1313 ; write 4 xmm registers of 4 dwords each |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1314 ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1315 ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1316 ; we add 1*stride to the third regular register in the process |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1317 ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1318 ; same memory region), or 8 if they cover two separate buffers (third one points to |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1319 ; a different memory region than the first two), allowing for more optimal code for |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1320 ; the 16-width case |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1321 %macro WRITE_4x4D 10 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1322 ; write out (4 dwords per register), start with dwords zero |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1323 movd [%5+%8*4], m%1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1324 movd [%5], m%2 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1325 movd [%7+%8*4], m%3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1326 movd [%7], m%4 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1327 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1328 ; store dwords 1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1329 psrldq m%1, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1330 psrldq m%2, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1331 psrldq m%3, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1332 psrldq m%4, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1333 movd [%6+%8*4], m%1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1334 movd [%6], m%2 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1335 %if %10 == 16 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1336 movd [%6+%9*4], m%3 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1337 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1338 movd [%7+%9], m%4 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1339 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1340 ; write dwords 2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1341 psrldq m%1, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1342 psrldq m%2, 4 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1343 %if %10 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1344 movd [%5+%8*2], m%1 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1345 movd %5d, m%3 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1346 %endif |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1347 psrldq m%3, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1348 psrldq m%4, 4 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1349 %if %10 == 16 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1350 movd [%5+%8*2], m%1 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1351 %endif |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1352 movd [%6+%9], m%2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1353 movd [%7+%8*2], m%3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1354 movd [%7+%9*2], m%4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1355 add %7, %9 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1356 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1357 ; store dwords 3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1358 psrldq m%1, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1359 psrldq m%2, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1360 psrldq m%3, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1361 psrldq m%4, 4 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1362 %if %10 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1363 mov [%7+%8*4], %5d |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1364 movd [%6+%8*2], m%1 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1365 %else |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1366 movd [%5+%8], m%1 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1367 %endif |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1368 movd [%6+%9*2], m%2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1369 movd [%7+%8*2], m%3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1370 movd [%7+%9*2], m%4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1371 %endmacro |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1372 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1373 ; write 4 or 8 words in the mmx/xmm registers as 8 lines |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1374 ; 1 and 2 are the registers to write; these can be the same (for SSE2) |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1375 ; for pre-SSE4: |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1376 ; 3 is a general-purpose register that we will clobber |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1377 ; for SSE4: |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1378 ; 3 is a pointer to the destination's 5th line |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1379 ; 4 is a pointer to the destination's 4th line |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1380 ; 5/6 are -stride and +stride |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_2x4W: store four words taken from %1 and then four words taken from %2,
; one 16-bit word per destination row (eight rows in total).
;   %1, %2 - SIMD registers holding the words; both are modified (punpckhdq)
;   %3     - general-purpose scratch register, clobbered
;   %4     - destination pointer; stepped back and forth while storing
;   %5, %6 - the two row steps; the notes below assume %5 = -%6 (-stride/+stride)
; Under that assumption the rows written are %4+4*%5 .. %4+3*%6 in order, and
; %4 holds its entry value again when the macro ends.
1381 %macro WRITE_2x4W 6 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; low dword of %1 (two words) into the scratch register
1382 movd %3d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; bring a dword from the upper half of %1 down into its low dword for the
; second movd (for a 64-bit MMX register that is dword 1, i.e. words 2-3)
1383 punpckhdq %1, %1 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1384 mov [%4+%5*4], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; expose the second word of the pair in %3w
1385 shr %3, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; step %4 by %6 so that the same [%4+%5*4] form addresses the following row
1386 add %4, %6 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1387 mov [%4+%5*4], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1388 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; second pair of words from %1 (the dword moved down by punpckhdq)
1389 movd %3d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; undo the earlier step: with %5 = -%6, %4 is back at its entry value
1390 add %4, %5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1391 mov [%4+%5*2], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1392 shr %3, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1393 mov [%4+%5 ], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1394 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; same sequence for %2, covering the rows at %4 and beyond
1395 movd %3d, %2 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1396 punpckhdq %2, %2 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1397 mov [%4 ], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1398 shr %3, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1399 mov [%4+%6 ], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1400 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1401 movd %3d, %2 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; step %4 by %6 again so [%4+%6] and [%4+%6*2] reach the last two rows
1402 add %4, %6 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1403 mov [%4+%6 ], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1404 shr %3, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1405 mov [%4+%6*2], %3w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; step back by %5 before leaving the macro
1406 add %4, %5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1407 %endmacro |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1408 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_8W_SSE2: store the eight words of %1, one 16-bit word per destination
; row, going through a general-purpose register two words at a time.
;   %1     - SIMD register holding the words; shifted down by psrldq as we go
;   %2     - general-purpose scratch register, clobbered
;   %3     - destination pointer; modified
;   %4, %5 - the two row steps; the notes below assume %4 = -%5 (-stride/+stride)
; Under that assumption the rows written are %3+4*%4 .. %3+3*%5 in order.
; NOTE: there is no final step back, so %3 leaves the macro advanced by %5.
1409 %macro WRITE_8W_SSE2 5 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; words 0-1 into the scratch register
1410 movd %2d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; shift %1 right by 4 bytes so the next two words sit in its low dword
1411 psrldq %1, 4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1412 mov [%3+%4*4], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; expose the second word of the pair in %2w
1413 shr %2, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; step %3 by %5 so that the same [%3+%4*4] form addresses the following row
1414 add %3, %5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1415 mov [%3+%4*4], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1416 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; words 2-3
1417 movd %2d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1418 psrldq %1, 4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; undo the earlier step: with %4 = -%5, %3 is back at its entry value
1419 add %3, %4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1420 mov [%3+%4*2], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1421 shr %2, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1422 mov [%3+%4 ], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1423 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; words 4-5
1424 movd %2d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1425 psrldq %1, 4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1426 mov [%3 ], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1427 shr %2, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1428 mov [%3+%5 ], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1429 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
; words 6-7 (no further psrldq needed, this is the last pair)
1430 movd %2d, %1 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; step %3 by %5 so [%3+%5] and [%3+%5*2] reach the last two rows;
; this step is not undone before %endmacro
1431 add %3, %5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1432 mov [%3+%5 ], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1433 shr %2, 16 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1434 mov [%3+%5*2], %2w |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1435 %endmacro |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1436 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; WRITE_8W_SSE4: store the eight words of %1 directly to memory with the
; memory-destination form of pextrw, one word per destination address.
;   %1     - SIMD register holding the words (only read)
;   %2, %3 - two base pointers (only read); words 0, 2, 3, 4 are addressed
;            from %3 and words 1, 5, 6, 7 from %2
;   %4, %5 - the two row steps used as index registers
; No general-purpose scratch register is needed and neither pointer is modified.
1437 %macro WRITE_8W_SSE4 5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1438 pextrw [%3+%4*4], %1, 0 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; word 1 goes through %2 with the same *4 scale as word 0
1439 pextrw [%2+%4*4], %1, 1 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1440 pextrw [%3+%4*2], %1, 2 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1441 pextrw [%3+%4 ], %1, 3 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1442 pextrw [%3 ], %1, 4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
; remaining words are addressed from %2, stepping with %5
1443 pextrw [%2 ], %1, 5 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1444 pextrw [%2+%5 ], %1, 6 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1445 pextrw [%2+%5*2], %1, 7 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1446 %endmacro |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1447 |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMX: replicate the low byte of general-purpose register %2 into
; the low eight bytes of SIMD register %1, using unpack instructions only.
;   %1 - destination SIMD register
;   %2 - general-purpose register whose low byte is replicated
;   %3 - optional, not used by this variant
1448 %macro SPLATB_REG_MMX 2-3 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1449 movd %1, %2d |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; each unpack-with-self doubles the run of copies: 1 -> 2 bytes
1450 punpcklbw %1, %1 |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; 2 -> 4 bytes
1451 punpcklwd %1, %1 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; 4 -> 8 bytes
1452 punpckldq %1, %1 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1453 %endmacro |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1454 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_MMXEXT: replicate the low byte of general-purpose register %2
; into every byte of the 64-bit SIMD register %1, using pshufw for the
; word broadcast.
;   %1 - destination SIMD register
;   %2 - general-purpose register whose low byte is replicated
;   %3 - optional, not used by this variant
1455 %macro SPLATB_REG_MMXEXT 2-3 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1456 movd %1, %2d |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; duplicate the byte so that word 0 holds two copies of it
1457 punpcklbw %1, %1 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; selector 0x0 copies word 0 into all four words
1458 pshufw %1, %1, 0x0 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1459 %endmacro |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1460 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSE2: replicate the low byte of general-purpose register %2 into
; every byte of the 128-bit SIMD register %1.
;   %1 - destination SIMD register
;   %2 - general-purpose register whose low byte is replicated
;   %3 - optional, not used by this variant
1461 %macro SPLATB_REG_SSE2 2-3 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1462 movd %1, %2d |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; duplicate the byte so that word 0 holds two copies of it
1463 punpcklbw %1, %1 |
; pshuflw with selector 0x0 copies word 0 into the four low words; punpcklqdq
; then copies the low quadword into the high quadword
| 12210 | 1464 pshuflw %1, %1, 0x0 |
| 1465 punpcklqdq %1, %1 | |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1466 %endmacro |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1467 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SPLATB_REG_SSSE3: move general-purpose register %2 into SIMD register %1
; and shuffle its bytes with pshufb, using %3 as the shuffle control.
;   %1 - destination SIMD register
;   %2 - general-purpose register holding the byte in its low 8 bits
;   %3 - pshufb control register (required here, unlike the other variants)
; With an all-zero %3 every result byte is a copy of byte 0, i.e. a byte splat;
; SIMPLE_LOOPFILTER clears m0 with pxor before passing it as %3 on this path.
1468 %macro SPLATB_REG_SSSE3 3 |
|
12457
2982071047a2
Use "d" suffix for general-purpose registers used with movd.
reimar
parents:
12413
diff
changeset
|
1469 movd %1, %2d |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; each control byte in %3 picks the source byte for that position
1470 pshufb %1, %3 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1471 %endmacro |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1472 |
|
12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1473 %macro SIMPLE_LOOPFILTER 4 |
|
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1474 cglobal vp8_%2_loop_filter_simple_%1, 3, %3, %4 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1475 %if mmsize == 8 ; mmx/mmxext |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1476 mov r3, 2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1477 %endif |
|
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1478 %ifnidn %1, sse2 |
|
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1479 %if mmsize == 16 |
| 12210 | 1480 pxor m0, m0 |
| 1481 %endif | |
|
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1482 %endif |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1483 SPLATB_REG m7, r2, m0 ; splat "flim" into register |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1484 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1485 ; set up indexes to address 4 rows |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1486 mov r2, r1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1487 neg r1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1488 %ifidn %2, h |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1489 lea r0, [r0+4*r2-2] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1490 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1491 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1492 %if mmsize == 8 ; mmx / mmxext |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1493 .next8px |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1494 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1495 %ifidn %2, v |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1496 ; read 4 half/full rows of pixels |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1497 mova m0, [r0+r1*2] ; p1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1498 mova m1, [r0+r1] ; p0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1499 mova m2, [r0] ; q0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1500 mova m3, [r0+r2] ; q1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1501 %else ; h |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1502 lea r4, [r0+r2] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1503 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1504 %if mmsize == 8 ; mmx/mmxext |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1505 READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1506 %else ; sse2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1507 READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1508 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1509 TRANSPOSE4x4W 0, 1, 2, 3, 4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1510 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1511 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1512 ; simple_limit |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1513 mova m5, m2 ; m5=backup of q0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1514 mova m6, m1 ; m6=backup of p0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1515 psubusb m1, m2 ; p0-q0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1516 psubusb m2, m6 ; q0-p0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1517 por m1, m2 ; FFABS(p0-q0) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1518 paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1519 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1520 mova m4, m3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1521 mova m2, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1522 psubusb m3, m0 ; q1-p1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1523 psubusb m0, m4 ; p1-q1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1524 por m3, m0 ; FFABS(p1-q1) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1525 mova m0, [pb_80] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1526 pxor m2, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1527 pxor m4, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1528 psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1529 pand m3, [pb_FE] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1530 psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1531 paddusb m3, m1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1532 psubusb m3, m7 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1533 pxor m1, m1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1534 pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1535 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1536 ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1537 mova m4, m5 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1538 pxor m5, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1539 pxor m0, m6 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1540 psubsb m5, m0 ; q0-p0 (signed) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1541 paddsb m2, m5 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1542 paddsb m2, m5 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1543 paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1544 pand m2, m3 ; apply filter mask (m3) |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1545 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1546 mova m3, [pb_F8] |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1547 mova m1, m2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1548 paddsb m2, [pb_4] ; f1<<3=a+4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1549 paddsb m1, [pb_3] ; f2<<3=a+3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1550 pand m2, m3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1551 pand m1, m3 ; cache f2<<3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1552 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1553 pxor m0, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1554 pxor m3, m3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1555 pcmpgtb m0, m2 ; which values are <0? |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1556 psubb m3, m2 ; -f1<<3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1557 psrlq m2, 3 ; +f1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1558 psrlq m3, 3 ; -f1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1559 pand m3, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1560 pandn m0, m2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1561 psubusb m4, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1562 paddusb m4, m3 ; q0-f1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1563 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1564 pxor m0, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1565 pxor m3, m3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1566 pcmpgtb m0, m1 ; which values are <0? |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1567 psubb m3, m1 ; -f2<<3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1568 psrlq m1, 3 ; +f2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1569 psrlq m3, 3 ; -f2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1570 pand m3, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1571 pandn m0, m1 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1572 paddusb m6, m0 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1573 psubusb m6, m3 ; p0+f2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1574 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1575 ; store |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1576 %ifidn %2, v |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1577 mova [r0], m4 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1578 mova [r0+r1], m6 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1579 %else ; h |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1580 inc r0 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1581 SBUTTERFLY bw, 6, 4, 0 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1582 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1583 %if mmsize == 16 ; sse2 |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1584 %ifidn %1, sse4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1585 inc r4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1586 %endif |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1587 WRITE_8W m6, r4, r0, r1, r2 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1588 lea r4, [r3+r1+1] |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1589 %ifidn %1, sse4 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1590 inc r3 |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1591 %endif |
|
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1592 WRITE_8W m4, r3, r4, r1, r2 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1593 %else ; mmx/mmxext |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1594 WRITE_2x4W m6, m4, r4, r0, r1, r2 |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1595 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1596 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1597 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1598 %if mmsize == 8 ; mmx/mmxext |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1599 ; next 8 pixels |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1600 %ifidn %2, v |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1601 add r0, 8 ; advance 8 cols = pixels |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1602 %else ; h |
|
12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1603 lea r0, [r0+r2*8-1] ; advance 8 rows = lines |
|
12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1604 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1605 dec r3 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1606 jg .next8px |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1607 REP_RET |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1608 %else ; sse2 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1609 RET |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1610 %endif |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1611 %endmacro |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1612 |
|
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
; -----------------------------------------------------------------------------
; Instantiate SIMPLE_LOOPFILTER once per instruction set and filter direction.
;   %1 = opt name; the macro body compares it against "sse4"
;   %2 = direction, v or h; selects the store path (plain mova rows for v,
;        SBUTTERFLY + WRITE_8W / WRITE_2x4W for h)
;   %3, %4 = numeric arguments consumed by the macro header, which is outside
;        this view -- presumably the register counts handed to cglobal
;        (TODO confirm against the macro's first line)
; INIT_MMX / INIT_XMM come from x86inc.asm and presumably switch mmsize between
; 8 and 16; the macro body branches on exactly those two values.
; SPLATB_REG and WRITE_8W are re-%defined ahead of each group so that the same
; macro body expands to the matching per-ISA helper.
; Only an h variant is emitted for sse4: in the macro's store section WRITE_8W
; is used solely on the h path.
; NOTE(review): that a v/sse4 build would equal v/ssse3 assumes nothing earlier
; in the macro (not visible here) depends on %1 being sse4 -- confirm.
; -----------------------------------------------------------------------------
1613 INIT_MMX |

12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1614 %define SPLATB_REG SPLATB_REG_MMX |

12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1615 SIMPLE_LOOPFILTER mmx, v, 4, 0 |

e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1616 SIMPLE_LOOPFILTER mmx, h, 5, 0 |

12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1617 %define SPLATB_REG SPLATB_REG_MMXEXT |

12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1618 SIMPLE_LOOPFILTER mmxext, v, 4, 0 |

e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1619 SIMPLE_LOOPFILTER mmxext, h, 5, 0 |

12086
d780ae746855
Simple H/V loopfilter for VP8 in MMX, MMX2 and SSE2 (yay for yasm macros).
rbultje
parents:
12082
diff
changeset
|
1620 INIT_XMM |

12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1621 %define SPLATB_REG SPLATB_REG_SSE2 |

12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1622 %define WRITE_8W WRITE_8W_SSE2 |

12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1623 SIMPLE_LOOPFILTER sse2, v, 3, 8 |

e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1624 SIMPLE_LOOPFILTER sse2, h, 5, 8 |

12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1625 %define SPLATB_REG SPLATB_REG_SSSE3 |

12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1626 SIMPLE_LOOPFILTER ssse3, v, 3, 8 |

e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1627 SIMPLE_LOOPFILTER ssse3, h, 5, 8 |

12334
435319d67bd8
Use word-writing instead of dword-writing (with two cached but otherwise
rbultje
parents:
12279
diff
changeset
|
1628 %define WRITE_8W WRITE_8W_SSE4 |

12413
e6e4059ea421
Mark xmm registers as clobbered in simple loopfilter. Should fix the last
rbultje
parents:
12400
diff
changeset
|
1629 SIMPLE_LOOPFILTER sse4, h, 5, 8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1630 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1631 ;----------------------------------------------------------------------------- |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1632 ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1633 ; int flimE, int flimI, int hev_thr); |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1634 ;----------------------------------------------------------------------------- |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1635 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1636 %macro INNER_LOOPFILTER 5 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1637 %if %4 == 8 ; chroma |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1638 cglobal vp8_%2_loop_filter8uv_inner_%1, 6, %3, %5 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1639 %define dst8_reg r1 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1640 %define mstride_reg r2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1641 %define E_reg r3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1642 %define I_reg r4 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1643 %define hev_thr_reg r5 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1644 %else ; luma |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1645 cglobal vp8_%2_loop_filter16y_inner_%1, 5, %3, %5 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1646 %define mstride_reg r1 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1647 %define E_reg r2 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1648 %define I_reg r3 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1649 %define hev_thr_reg r4 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1650 %ifdef m8 ; x86-64, sse2 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1651 %define dst8_reg r4 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1652 %elif mmsize == 16 ; x86-32, sse2 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1653 %define dst8_reg r5 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1654 %else ; x86-32, mmx/mmxext |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1655 %define cnt_reg r5 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1656 %endif |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1657 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1658 %define dst_reg r0 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1659 %define stride_reg E_reg |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1660 %define dst2_reg I_reg |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1661 %ifndef m8 |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1662 %define stack_reg hev_thr_reg |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1663 %endif |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1664 |
|
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1665 %ifnidn %1, sse2 |
|
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1666 %if mmsize == 16 |
| 12210 | 1667 pxor m7, m7 |
| 1668 %endif | |
|
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
1669 %endif |
| 12210 | 1670 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1671 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1672 ; splat function arguments |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1673 SPLATB_REG m0, E_reg, m7 ; E |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1674 SPLATB_REG m1, I_reg, m7 ; I |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1675 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1676 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1677 ; align stack |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1678 mov stack_reg, rsp ; backup stack pointer |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1679 and rsp, ~(mmsize-1) ; align stack |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1680 %ifidn %2, v |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1681 sub rsp, mmsize * 4 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1682 ; [3]=hev() result |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1683 %else ; h |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1684 sub rsp, mmsize * 5 ; extra storage space for transposes |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1685 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1686 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1687 %define flim_E [rsp] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1688 %define flim_I [rsp+mmsize] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1689 %define hev_thr [rsp+mmsize*2] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1690 %define mask_res [rsp+mmsize*3] |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1691 %define p0backup [rsp+mmsize*3] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1692 %define q0backup [rsp+mmsize*4] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1693 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1694 mova flim_E, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1695 mova flim_I, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1696 mova hev_thr, m2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1697 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1698 %else ; sse2 on x86-64 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1699 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1700 %define flim_E m9 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1701 %define flim_I m10 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1702 %define hev_thr m11 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1703 %define mask_res m12 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1704 %define p0backup m12 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1705 %define q0backup m8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1706 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1707 ; splat function arguments |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1708 SPLATB_REG flim_E, E_reg, m7 ; E |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1709 SPLATB_REG flim_I, I_reg, m7 ; I |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
1710 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1711 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1712 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1713 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1714 mov cnt_reg, 2 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1715 %endif |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1716 mov stride_reg, mstride_reg |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1717 neg mstride_reg |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1718 %ifidn %2, h |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1719 lea dst_reg, [dst_reg + stride_reg*4-4] |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1720 %if %4 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1721 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1722 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1723 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1724 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1725 %if mmsize == 8 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1726 .next8px |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1727 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1728 ; read |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1729 lea dst2_reg, [dst_reg + stride_reg] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1730 %ifidn %2, v |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1731 %if %4 == 8 && mmsize == 16 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1732 %define movrow movh |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1733 %else |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1734 %define movrow mova |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1735 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1736 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1737 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1738 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1739 movrow m5, [dst2_reg] ; q1 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1740 movrow m6, [dst2_reg+ stride_reg] ; q2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1741 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1742 %if mmsize == 16 && %4 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1743 movhps m0, [dst8_reg+mstride_reg*4] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1744 movhps m2, [dst8_reg+mstride_reg*2] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1745 add dst8_reg, stride_reg |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1746 movhps m1, [dst8_reg+mstride_reg*4] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1747 movhps m5, [dst8_reg] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1748 movhps m6, [dst8_reg+ stride_reg] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1749 movhps m7, [dst8_reg+ stride_reg*2] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1750 add dst8_reg, mstride_reg |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1751 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1752 %elif mmsize == 8 ; mmx/mmxext (h) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1753 ; read 8 rows of 8px each |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1754 movu m0, [dst_reg +mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1755 movu m1, [dst2_reg+mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1756 movu m2, [dst_reg +mstride_reg*2] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1757 movu m3, [dst_reg +mstride_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1758 movu m4, [dst_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1759 movu m5, [dst2_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1760 movu m6, [dst2_reg+ stride_reg] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1761 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1762 ; 8x8 transpose |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1763 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1764 mova q0backup, m1 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1765 movu m7, [dst2_reg+ stride_reg*2] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1766 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1767 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1768 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1769 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1770 mova m1, q0backup |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1771 mova q0backup, m2 ; store q0 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1772 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1773 mova p0backup, m5 ; store p0 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1774 SWAP 1, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1775 SWAP 2, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1776 SWAP 6, 3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1777 SWAP 5, 3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1778 %else ; sse2 (h) |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1779 %if %4 == 16 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1780 lea dst8_reg, [dst_reg + stride_reg*8] |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1781 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1782 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1783 ; read 16 rows of 8px each, interleave |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1784 movh m0, [dst_reg +mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1785 movh m1, [dst8_reg+mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1786 movh m2, [dst_reg +mstride_reg*2] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1787 movh m5, [dst8_reg+mstride_reg*2] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1788 movh m3, [dst_reg +mstride_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1789 movh m6, [dst8_reg+mstride_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1790 movh m4, [dst_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1791 movh m7, [dst8_reg] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1792 punpcklbw m0, m1 ; A/I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1793 punpcklbw m2, m5 ; C/K |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1794 punpcklbw m3, m6 ; D/L |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1795 punpcklbw m4, m7 ; E/M |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1796 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1797 add dst8_reg, stride_reg |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1798 movh m1, [dst2_reg+mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1799 movh m6, [dst8_reg+mstride_reg*4] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1800 movh m5, [dst2_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1801 movh m7, [dst8_reg] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1802 punpcklbw m1, m6 ; B/J |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1803 punpcklbw m5, m7 ; F/N |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1804 movh m6, [dst2_reg+ stride_reg] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1805 movh m7, [dst8_reg+ stride_reg] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1806 punpcklbw m6, m7 ; G/O |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1807 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1808 ; 8x16 transpose |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1809 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1810 %ifdef m8 |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1811 SWAP 1, 8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1812 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1813 mova q0backup, m1 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1814 %endif |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1815 movh m7, [dst2_reg+ stride_reg*2] |
|
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
1816 movh m1, [dst8_reg+ stride_reg*2] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1817 punpcklbw m7, m1 ; H/P |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1818 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1819 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1820 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1821 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1822 %ifdef m8 |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1823 SWAP 1, 8 |
|
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1824 SWAP 2, 8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1825 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1826 mova m1, q0backup |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1827 mova q0backup, m2 ; store q0 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1828 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1829 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1830 %ifdef m12 |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1831 SWAP 5, 12 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1832 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1833 mova p0backup, m5 ; store p0 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1834 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1835 SWAP 1, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1836 SWAP 2, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1837 SWAP 6, 3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1838 SWAP 5, 3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1839 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1840 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1841 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1842 mova m4, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1843 SWAP 4, 1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1844 psubusb m4, m0 ; p2-p3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1845 psubusb m0, m1 ; p3-p2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1846 por m0, m4 ; abs(p3-p2) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1847 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1848 mova m4, m2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1849 SWAP 4, 2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1850 psubusb m4, m1 ; p1-p2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1851 psubusb m1, m2 ; p2-p1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1852 por m1, m4 ; abs(p2-p1) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1853 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1854 mova m4, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1855 SWAP 4, 6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1856 psubusb m4, m7 ; q2-q3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1857 psubusb m7, m6 ; q3-q2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1858 por m7, m4 ; abs(q3-q2) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1859 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1860 mova m4, m5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1861 SWAP 4, 5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1862 psubusb m4, m6 ; q1-q2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1863 psubusb m6, m5 ; q2-q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1864 por m6, m4 ; abs(q2-q1) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1865 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1866 %ifidn %1, mmx |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1867 mova m4, flim_I |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1868 pxor m3, m3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1869 psubusb m0, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1870 psubusb m1, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1871 psubusb m7, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1872 psubusb m6, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1873 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1874 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1875 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1876 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1877 pand m0, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1878 pand m7, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1879 pand m0, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1880 %else ; mmxext/sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1881 pmaxub m0, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1882 pmaxub m6, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1883 pmaxub m0, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1884 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1885 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1886 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1887 SWAP 7, 3 ; now m7 is zero |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1888 %ifidn %2, v |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1889 movrow m3, [dst_reg +mstride_reg] ; p0 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1890 %if mmsize == 16 && %4 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1891 movhps m3, [dst8_reg+mstride_reg] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1892 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1893 %elifdef m12 |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1894 SWAP 3, 12 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1895 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1896 mova m3, p0backup |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1897 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1898 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1899 mova m1, m2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1900 SWAP 1, 2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1901 mova m6, m3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1902 SWAP 3, 6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1903 psubusb m1, m3 ; p1-p0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1904 psubusb m6, m2 ; p0-p1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1905 por m1, m6 ; abs(p1-p0) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1906 %ifidn %1, mmx |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1907 mova m6, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1908 psubusb m1, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1909 psubusb m6, hev_thr |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1910 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1911 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1912 pand m0, m1 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1913 mova mask_res, m6 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1914 %else ; mmxext/sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1915 pmaxub m0, m1 ; max_I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1916 SWAP 1, 4 ; max_hev_thresh |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1917 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1918 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1919 SWAP 6, 4 ; now m6 is I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1920 %ifidn %2, v |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1921 movrow m4, [dst_reg] ; q0 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1922 %if mmsize == 16 && %4 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1923 movhps m4, [dst8_reg] |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1924 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1925 %elifdef m8 |
|
12195
e7847fcff0f4
Be more efficient with registers or stack memory. Saves 8/16 bytes stack
rbultje
parents:
12194
diff
changeset
|
1926 SWAP 4, 8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1927 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1928 mova m4, q0backup |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1929 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1930 mova m1, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1931 SWAP 1, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1932 mova m7, m5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1933 SWAP 7, 5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1934 psubusb m1, m5 ; q0-q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1935 psubusb m7, m4 ; q1-q0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1936 por m1, m7 ; abs(q1-q0) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1937 %ifidn %1, mmx |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1938 mova m7, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1939 psubusb m1, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1940 psubusb m7, hev_thr |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1941 pxor m6, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1942 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1943 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1944 mova m6, mask_res |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1945 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1946 pand m6, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1947 %else ; mmxext/sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1948 pxor m7, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1949 pmaxub m0, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1950 pmaxub m6, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1951 psubusb m0, flim_I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1952 psubusb m6, hev_thr |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1953 pcmpeqb m0, m7 ; max(abs(..)) <= I |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1954 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1955 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1956 %ifdef m12 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1957 SWAP 6, 12 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1958 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
1959 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1960 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1961 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1962 ; simple_limit |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1963 mova m1, m3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1964 SWAP 1, 3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1965 mova m6, m4 ; keep copies of p0/q0 around for later use |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1966 SWAP 6, 4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1967 psubusb m1, m4 ; p0-q0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1968 psubusb m6, m3 ; q0-p0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1969 por m1, m6 ; abs(q0-p0) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1970 paddusb m1, m1 ; m1=2*abs(q0-p0) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1971 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1972 mova m7, m2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1973 SWAP 7, 2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1974 mova m6, m5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1975 SWAP 6, 5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1976 psubusb m7, m5 ; p1-q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1977 psubusb m6, m2 ; q1-p1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1978 por m7, m6 ; abs(q1-p1) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1979 pxor m6, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1980 pand m7, [pb_FE] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1981 psrlq m7, 1 ; abs(q1-p1)/2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1982 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1983 psubusb m7, flim_E |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1984 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1985 pand m0, m7 ; normal_limit result |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1986 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1987 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1988 %ifdef m8 ; x86-64 && sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1989 mova m8, [pb_80] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1990 %define pb_80_var m8 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1991 %else ; x86-32 or mmx/mmxext |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1992 %define pb_80_var [pb_80] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1993 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1994 mova m1, m4 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1995 mova m7, m3 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1996 pxor m1, pb_80_var |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1997 pxor m7, pb_80_var |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1998 psubsb m1, m7 ; (signed) q0-p0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
1999 mova m6, m2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2000 mova m7, m5 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2001 pxor m6, pb_80_var |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2002 pxor m7, pb_80_var |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2003 psubsb m6, m7 ; (signed) p1-q1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2004 mova m7, mask_res |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2005 pandn m7, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2006 paddsb m7, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2007 paddsb m7, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2008 paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2009 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2010 pand m7, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2011 mova m1, [pb_F8] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2012 mova m6, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2013 paddsb m7, [pb_3] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2014 paddsb m6, [pb_4] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2015 pand m7, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2016 pand m6, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2017 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2018 pxor m1, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2019 pxor m0, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2020 pcmpgtb m1, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2021 psubb m0, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2022 psrlq m7, 3 ; +f2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2023 psrlq m0, 3 ; -f2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2024 pand m0, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2025 pandn m1, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2026 psubusb m3, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2027 paddusb m3, m1 ; p0+f2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2028 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2029 pxor m1, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2030 pxor m0, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2031 pcmpgtb m0, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2032 psubb m1, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2033 psrlq m6, 3 ; +f1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2034 psrlq m1, 3 ; -f1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2035 pand m1, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2036 pandn m0, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2037 psubusb m4, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2038 paddusb m4, m1 ; q0-f1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2039 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2040 %ifdef m12 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2041 SWAP 6, 12 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2042 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2043 mova m6, mask_res |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2044 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2045 %ifidn %1, mmx |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2046 mova m7, [pb_1] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2047 %else ; mmxext/sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2048 pxor m7, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2049 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2050 pand m0, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2051 pand m1, m6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2052 %ifidn %1, mmx |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2053 paddusb m0, m7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2054 pand m1, [pb_FE] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2055 pandn m7, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2056 psrlq m1, 1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2057 psrlq m7, 1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2058 SWAP 0, 7 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2059 %else ; mmxext/sse2 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2060 psubusb m1, [pb_1] |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2061 pavgb m0, m7 ; a |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2062 pavgb m1, m7 ; -a |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2063 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2064 psubusb m5, m0 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2065 psubusb m2, m1 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2066 paddusb m5, m1 ; q1-a |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2067 paddusb m2, m0 ; p1+a |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2068 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2069 ; store |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2070 %ifidn %2, v |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2071 movrow [dst_reg +mstride_reg*2], m2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2072 movrow [dst_reg +mstride_reg ], m3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2073 movrow [dst_reg], m4 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2074 movrow [dst_reg + stride_reg ], m5 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2075 %if mmsize == 16 && %4 == 8 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2076 movhps [dst8_reg+mstride_reg*2], m2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2077 movhps [dst8_reg+mstride_reg ], m3 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2078 movhps [dst8_reg], m4 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2079 movhps [dst8_reg+ stride_reg ], m5 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2080 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2081 %else ; h |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2082 add dst_reg, 2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2083 add dst2_reg, 2 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2084 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2085 ; 4x8/16 transpose |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2086 TRANSPOSE4x4B 2, 3, 4, 5, 6 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2087 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2088 %if mmsize == 8 ; mmx/mmxext (h) |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2089 WRITE_4x2D 2, 3, 4, 5, dst_reg, dst2_reg, mstride_reg, stride_reg |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2090 %else ; sse2 (h) |
| 12180 | 2091 lea dst8_reg, [dst8_reg+mstride_reg+2] |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2092 WRITE_4x4D 2, 3, 4, 5, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2093 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2094 %endif |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2095 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2096 %if mmsize == 8 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2097 %if %4 == 8 ; chroma |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2098 %ifidn %2, h |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2099 sub dst_reg, 2 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2100 %endif |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2101 cmp dst_reg, dst8_reg |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2102 mov dst_reg, dst8_reg |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2103 jnz .next8px |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2104 %else |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2105 %ifidn %2, h |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2106 lea dst_reg, [dst_reg + stride_reg*8-2] |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2107 %else ; v |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2108 add dst_reg, 8 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2109 %endif |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2110 dec cnt_reg |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2111 jg .next8px |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2112 %endif |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2113 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2114 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2115 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2116 mov rsp, stack_reg ; restore stack pointer |
|
12173
c47ddb7df424
Change return statement, the REP_RET is a mistake since the else case (x86-64,
rbultje
parents:
12168
diff
changeset
|
2117 %endif |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2118 RET |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2119 %endmacro |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
2120 |
|
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; Plain-MMX instantiations of INNER_LOOPFILTER (mmsize == 8 code paths).
; Macro arguments as used by the macro body:
;   %1 = instruction-set tag (compared with %ifidn %1, mmx)
;   %2 = filter direction, v or h
;   %4 = block width: 16, or 8 which the macro treats as chroma
; NOTE(review): %3 and %5 are presumably the register counts handed to
; cglobal (MBEDGE_LOOPFILTER forwards them that way) - confirm at the
; head of INNER_LOOPFILTER.
; SPLATB_REG is pointed at the MMX splat variant for the calls below.
2121 INIT_MMX |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2122 %define SPLATB_REG SPLATB_REG_MMX |
| 12210 | 2123 INNER_LOOPFILTER mmx, v, 6, 16, 0 |
| 2124 INNER_LOOPFILTER mmx, h, 6, 16, 0 | |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2125 INNER_LOOPFILTER mmx, v, 6, 8, 0 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2126 INNER_LOOPFILTER mmx, h, 6, 8, 0 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2127 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; MMXEXT instantiations of INNER_LOOPFILTER: same four direction/width
; combinations as the mmx set (v/h x width 16/8), but with SPLATB_REG
; redirected to the MMXEXT splat variant. No new INIT_* is issued here,
; so the register mode chosen by the preceding INIT_MMX still applies.
2128 %define SPLATB_REG SPLATB_REG_MMXEXT |
| 12210 | 2129 INNER_LOOPFILTER mmxext, v, 6, 16, 0 |
| 2130 INNER_LOOPFILTER mmxext, h, 6, 16, 0 | |
| 2131 INNER_LOOPFILTER mmxext, v, 6, 8, 0 | |
| 2132 INNER_LOOPFILTER mmxext, h, 6, 8, 0 | |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2133 |
|
12168
b246b214c2e9
VP8 H/V inner loopfilter MMX/MMXEXT/SSE2 optimizations.
rbultje
parents:
12086
diff
changeset
|
; SSE2 instantiations of INNER_LOOPFILTER (mmsize == 16 code paths), with
; SPLATB_REG redirected to the SSE2 splat variant.
; Arguments: %1 = instruction-set tag, %2 = direction (v or h),
; %4 = width (16, or 8 = chroma).
; NOTE(review): %3 / %5 are presumably the cglobal register counts -
; confirm at the head of INNER_LOOPFILTER.
; The horizontal width-16 variant is instantiated with a different third
; argument depending on whether m8 is defined (the macros' own comments
; associate m8 with sse2 on x86-64): 5 when it is, 6 when it is not.
; NOTE(review): likely because the non-m8 build needs one more
; general-purpose register (MBEDGE_LOOPFILTER maps dst8_reg to r5 in that
; case) - confirm against INNER_LOOPFILTER's register defines.
2134 INIT_XMM |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2135 %define SPLATB_REG SPLATB_REG_SSE2 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2136 INNER_LOOPFILTER sse2, v, 5, 16, 13 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2137 %ifdef m8 |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2138 INNER_LOOPFILTER sse2, h, 5, 16, 13 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2139 %else |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2140 INNER_LOOPFILTER sse2, h, 6, 16, 13 |
|
12174
57038190cc5f
Give x86 r%d registers names, this will simplify implementation of the chroma
rbultje
parents:
12173
diff
changeset
|
2141 %endif |
|
12204
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2142 INNER_LOOPFILTER sse2, v, 6, 8, 13 |
|
563339ea87aa
Chroma (width=8) inner loopfilter MMX/MMX2/SSE2 for VP8 decoder.
rbultje
parents:
12198
diff
changeset
|
2143 INNER_LOOPFILTER sse2, h, 6, 8, 13 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2144 |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
; SSSE3 instantiations of INNER_LOOPFILTER: the argument lists mirror the
; sse2 set one-for-one (including the m8-dependent third argument of the
; horizontal width-16 variant); only the instruction-set tag and the
; SPLATB_REG variant differ. No new INIT_* is issued here, so the register
; mode chosen by the preceding INIT_XMM still applies.
2145 %define SPLATB_REG SPLATB_REG_SSSE3 |
| 12210 | 2146 INNER_LOOPFILTER ssse3, v, 5, 16, 13 |
| 2147 %ifdef m8 | |
| 2148 INNER_LOOPFILTER ssse3, h, 5, 16, 13 | |
| 2149 %else | |
| 2150 INNER_LOOPFILTER ssse3, h, 6, 16, 13 | |
| 2151 %endif | |
| 2152 INNER_LOOPFILTER ssse3, v, 6, 8, 13 | |
| 2153 INNER_LOOPFILTER ssse3, h, 6, 8, 13 | |
| 2154 | |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2155 ;----------------------------------------------------------------------------- |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2156 ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2157 ; int flimE, int flimI, int hev_thr); |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2158 ;----------------------------------------------------------------------------- |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2159 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2160 %macro MBEDGE_LOOPFILTER 5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2161 %if %4 == 8 ; chroma |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2162 cglobal vp8_%2_loop_filter8uv_mbedge_%1, 6, %3, %5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2163 %define dst8_reg r1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2164 %define mstride_reg r2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2165 %define E_reg r3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2166 %define I_reg r4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2167 %define hev_thr_reg r5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2168 %else ; luma |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2169 cglobal vp8_%2_loop_filter16y_mbedge_%1, 5, %3, %5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2170 %define mstride_reg r1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2171 %define E_reg r2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2172 %define I_reg r3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2173 %define hev_thr_reg r4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2174 %ifdef m8 ; x86-64, sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2175 %define dst8_reg r4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2176 %elif mmsize == 16 ; x86-32, sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2177 %define dst8_reg r5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2178 %else ; x86-32, mmx/mmxext |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2179 %define cnt_reg r5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2180 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2181 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2182 %define dst_reg r0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2183 %define stride_reg E_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2184 %define dst2_reg I_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2185 %ifndef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2186 %define stack_reg hev_thr_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2187 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2188 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2189 %define ssse3_or_higher 0 |
|
12274
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2190 %ifnidn %1, sse2 |
|
1d207bb5cd29
Use nested ifs instead of &&, which appears to not work with %ifidn (i.e. this
rbultje
parents:
12272
diff
changeset
|
2191 %if mmsize == 16 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2192 %define ssse3_or_higher 1 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2193 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2194 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2195 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2196 %if ssse3_or_higher |
| 12210 | 2197 pxor m7, m7 |
| 2198 %endif | |
| 2199 | |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2200 %ifndef m8 ; mmx/mmxext or sse2 on x86-32 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2201 ; splat function arguments |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2202 SPLATB_REG m0, E_reg, m7 ; E |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2203 SPLATB_REG m1, I_reg, m7 ; I |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2204 SPLATB_REG m2, hev_thr_reg, m7 ; hev_thresh |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2205 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2206 ; align stack |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2207 mov stack_reg, rsp ; backup stack pointer |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2208 and rsp, ~(mmsize-1) ; align stack |
|
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2209 %if mmsize == 16 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2210 sub rsp, mmsize * 7 |
|
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2211 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2212 sub rsp, mmsize * 8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2213 ; [3]=hev() result |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2214 ; [4]=filter tmp result |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2215 ; [5]/[6] = p2/q2 backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2216 ; [7]=lim_res sign result |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2217 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2218 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2219 %define flim_E [rsp] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2220 %define flim_I [rsp+mmsize] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2221 %define hev_thr [rsp+mmsize*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2222 %define mask_res [rsp+mmsize*3] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2223 %define lim_res [rsp+mmsize*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2224 %define p0backup [rsp+mmsize*3] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2225 %define q0backup [rsp+mmsize*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2226 %define p2backup [rsp+mmsize*5] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2227 %define q2backup [rsp+mmsize*6] |
|
12276
1c299b8f2930
Enable no-loop memory/register saving for ssse3/sse4 also.
rbultje
parents:
12275
diff
changeset
|
2228 %if mmsize == 16 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2229 %define lim_sign [rsp] |
|
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2230 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2231 %define lim_sign [rsp+mmsize*7] |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2232 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2233 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2234 mova flim_E, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2235 mova flim_I, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2236 mova hev_thr, m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2237 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2238 %else ; sse2 on x86-64 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2239 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2240 %define flim_E m9 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2241 %define flim_I m10 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2242 %define hev_thr m11 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2243 %define mask_res m12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2244 %define lim_res m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2245 %define p0backup m12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2246 %define q0backup m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2247 %define p2backup m13 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2248 %define q2backup m14 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2249 %define lim_sign m9 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2250 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2251 ; splat function arguments |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2252 SPLATB_REG flim_E, E_reg, m7 ; E |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2253 SPLATB_REG flim_I, I_reg, m7 ; I |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2254 SPLATB_REG hev_thr, hev_thr_reg, m7 ; hev_thresh |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2255 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2256 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2257 %if mmsize == 8 && %4 == 16 ; mmx/mmxext |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2258 mov cnt_reg, 2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2259 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2260 mov stride_reg, mstride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2261 neg mstride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2262 %ifidn %2, h |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2263 lea dst_reg, [dst_reg + stride_reg*4-4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2264 %if %4 == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2265 lea dst8_reg, [dst8_reg+ stride_reg*4-4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2266 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2267 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2268 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2269 %if mmsize == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2270 .next8px |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2271 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2272 ; read |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2273 lea dst2_reg, [dst_reg + stride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2274 %ifidn %2, v |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2275 %if %4 == 8 && mmsize == 16 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2276 %define movrow movh |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2277 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2278 %define movrow mova |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2279 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2280 movrow m0, [dst_reg +mstride_reg*4] ; p3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2281 movrow m1, [dst2_reg+mstride_reg*4] ; p2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2282 movrow m2, [dst_reg +mstride_reg*2] ; p1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2283 movrow m5, [dst2_reg] ; q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2284 movrow m6, [dst2_reg+ stride_reg] ; q2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2285 movrow m7, [dst2_reg+ stride_reg*2] ; q3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2286 %if mmsize == 16 && %4 == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2287 movhps m0, [dst8_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2288 movhps m2, [dst8_reg+mstride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2289 add dst8_reg, stride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2290 movhps m1, [dst8_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2291 movhps m5, [dst8_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2292 movhps m6, [dst8_reg+ stride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2293 movhps m7, [dst8_reg+ stride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2294 add dst8_reg, mstride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2295 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2296 %elif mmsize == 8 ; mmx/mmxext (h) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2297 ; read 8 rows of 8px each |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2298 movu m0, [dst_reg +mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2299 movu m1, [dst2_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2300 movu m2, [dst_reg +mstride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2301 movu m3, [dst_reg +mstride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2302 movu m4, [dst_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2303 movu m5, [dst2_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2304 movu m6, [dst2_reg+ stride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2305 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2306 ; 8x8 transpose |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2307 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2308 mova q0backup, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2309 movu m7, [dst2_reg+ stride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2310 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2311 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2312 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2313 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2314 mova m1, q0backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2315 mova q0backup, m2 ; store q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2316 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2317 mova p0backup, m5 ; store p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2318 SWAP 1, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2319 SWAP 2, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2320 SWAP 6, 3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2321 SWAP 5, 3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2322 %else ; sse2 (h) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2323 %if %4 == 16 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2324 lea dst8_reg, [dst_reg + stride_reg*8] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2325 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2326 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2327 ; read 16 rows of 8px each, interleave |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2328 movh m0, [dst_reg +mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2329 movh m1, [dst8_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2330 movh m2, [dst_reg +mstride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2331 movh m5, [dst8_reg+mstride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2332 movh m3, [dst_reg +mstride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2333 movh m6, [dst8_reg+mstride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2334 movh m4, [dst_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2335 movh m7, [dst8_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2336 punpcklbw m0, m1 ; A/I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2337 punpcklbw m2, m5 ; C/K |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2338 punpcklbw m3, m6 ; D/L |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2339 punpcklbw m4, m7 ; E/M |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2340 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2341 add dst8_reg, stride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2342 movh m1, [dst2_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2343 movh m6, [dst8_reg+mstride_reg*4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2344 movh m5, [dst2_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2345 movh m7, [dst8_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2346 punpcklbw m1, m6 ; B/J |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2347 punpcklbw m5, m7 ; F/N |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2348 movh m6, [dst2_reg+ stride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2349 movh m7, [dst8_reg+ stride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2350 punpcklbw m6, m7 ; G/O |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2351 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2352 ; 8x16 transpose |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2353 TRANSPOSE4x4B 0, 1, 2, 3, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2354 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2355 SWAP 1, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2356 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2357 mova q0backup, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2358 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2359 movh m7, [dst2_reg+ stride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2360 movh m1, [dst8_reg+ stride_reg*2] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2361 punpcklbw m7, m1 ; H/P |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2362 TRANSPOSE4x4B 4, 5, 6, 7, 1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2363 SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2364 SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2365 SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2366 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2367 SWAP 1, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2368 SWAP 2, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2369 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2370 mova m1, q0backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2371 mova q0backup, m2 ; store q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2372 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2373 SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2374 %ifdef m12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2375 SWAP 5, 12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2376 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2377 mova p0backup, m5 ; store p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2378 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2379 SWAP 1, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2380 SWAP 2, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2381 SWAP 6, 3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2382 SWAP 5, 3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2383 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2384 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2385 ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2386 mova m4, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2387 SWAP 4, 1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2388 psubusb m4, m0 ; p2-p3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2389 psubusb m0, m1 ; p3-p2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2390 por m0, m4 ; abs(p3-p2) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2391 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2392 mova m4, m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2393 SWAP 4, 2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2394 psubusb m4, m1 ; p1-p2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2395 mova p2backup, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2396 psubusb m1, m2 ; p2-p1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2397 por m1, m4 ; abs(p2-p1) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2398 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2399 mova m4, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2400 SWAP 4, 6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2401 psubusb m4, m7 ; q2-q3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2402 psubusb m7, m6 ; q3-q2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2403 por m7, m4 ; abs(q3-q2) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2404 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2405 mova m4, m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2406 SWAP 4, 5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2407 psubusb m4, m6 ; q1-q2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2408 mova q2backup, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2409 psubusb m6, m5 ; q2-q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2410 por m6, m4 ; abs(q2-q1) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2411 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2412 %ifidn %1, mmx |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2413 mova m4, flim_I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2414 pxor m3, m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2415 psubusb m0, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2416 psubusb m1, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2417 psubusb m7, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2418 psubusb m6, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2419 pcmpeqb m0, m3 ; abs(p3-p2) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2420 pcmpeqb m1, m3 ; abs(p2-p1) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2421 pcmpeqb m7, m3 ; abs(q3-q2) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2422 pcmpeqb m6, m3 ; abs(q2-q1) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2423 pand m0, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2424 pand m7, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2425 pand m0, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2426 %else ; mmxext/sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2427 pmaxub m0, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2428 pmaxub m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2429 pmaxub m0, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2430 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2431 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2432 ; normal_limit and high_edge_variance for p1-p0, q1-q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2433 SWAP 7, 3 ; now m7 is zero |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2434 %ifidn %2, v |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2435 movrow m3, [dst_reg +mstride_reg] ; p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2436 %if mmsize == 16 && %4 == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2437 movhps m3, [dst8_reg+mstride_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2438 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2439 %elifdef m12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2440 SWAP 3, 12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2441 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2442 mova m3, p0backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2443 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2444 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2445 mova m1, m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2446 SWAP 1, 2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2447 mova m6, m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2448 SWAP 3, 6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2449 psubusb m1, m3 ; p1-p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2450 psubusb m6, m2 ; p0-p1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2451 por m1, m6 ; abs(p1-p0) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2452 %ifidn %1, mmx |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2453 mova m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2454 psubusb m1, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2455 psubusb m6, hev_thr |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2456 pcmpeqb m1, m7 ; abs(p1-p0) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2457 pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2458 pand m0, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2459 mova mask_res, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2460 %else ; mmxext/sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2461 pmaxub m0, m1 ; max_I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2462 SWAP 1, 4 ; max_hev_thresh |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2463 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2464 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2465 SWAP 6, 4 ; now m6 is I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2466 %ifidn %2, v |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2467 movrow m4, [dst_reg] ; q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2468 %if mmsize == 16 && %4 == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2469 movhps m4, [dst8_reg] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2470 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2471 %elifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2472 SWAP 4, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2473 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2474 mova m4, q0backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2475 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2476 mova m1, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2477 SWAP 1, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2478 mova m7, m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2479 SWAP 7, 5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2480 psubusb m1, m5 ; q0-q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2481 psubusb m7, m4 ; q1-q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2482 por m1, m7 ; abs(q1-q0) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2483 %ifidn %1, mmx |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2484 mova m7, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2485 psubusb m1, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2486 psubusb m7, hev_thr |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2487 pxor m6, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2488 pcmpeqb m1, m6 ; abs(q1-q0) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2489 pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2490 mova m6, mask_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2491 pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2492 pand m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2493 %else ; mmxext/sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2494 pxor m7, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2495 pmaxub m0, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2496 pmaxub m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2497 psubusb m0, flim_I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2498 psubusb m6, hev_thr |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2499 pcmpeqb m0, m7 ; max(abs(..)) <= I |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2500 pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2501 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2502 %ifdef m12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2503 SWAP 6, 12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2504 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2505 mova mask_res, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2506 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2507 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2508 ; simple_limit |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2509 mova m1, m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2510 SWAP 1, 3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2511 mova m6, m4 ; keep copies of p0/q0 around for later use |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2512 SWAP 6, 4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2513 psubusb m1, m4 ; p0-q0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2514 psubusb m6, m3 ; q0-p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2515 por m1, m6 ; abs(q0-p0) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2516 paddusb m1, m1 ; m1=2*abs(q0-p0) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2517 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2518 mova m7, m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2519 SWAP 7, 2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2520 mova m6, m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2521 SWAP 6, 5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2522 psubusb m7, m5 ; p1-q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2523 psubusb m6, m2 ; q1-p1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2524 por m7, m6 ; abs(q1-p1) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2525 pxor m6, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2526 pand m7, [pb_FE] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2527 psrlq m7, 1 ; abs(q1-p1)/2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2528 paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2529 psubusb m7, flim_E |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2530 pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2531 pand m0, m7 ; normal_limit result |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2532 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2533 ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2534 %ifdef m8 ; x86-64 && sse2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2535 mova m8, [pb_80] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2536 %define pb_80_var m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2537 %else ; x86-32 or mmx/mmxext |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2538 %define pb_80_var [pb_80] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2539 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2540 mova m1, m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2541 mova m7, m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2542 pxor m1, pb_80_var |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2543 pxor m7, pb_80_var |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2544 psubsb m1, m7 ; (signed) q0-p0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2545 mova m6, m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2546 mova m7, m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2547 pxor m6, pb_80_var |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2548 pxor m7, pb_80_var |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2549 psubsb m6, m7 ; (signed) p1-q1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2550 mova m7, mask_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2551 paddsb m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2552 paddsb m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2553 paddsb m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2554 pand m6, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2555 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2556 mova lim_res, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2557 pand lim_res, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2558 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2559 mova m0, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2560 pand m0, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2561 mova lim_res, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2562 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2563 pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2564 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2565 mova m1, [pb_F8] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2566 mova m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2567 paddsb m7, [pb_3] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2568 paddsb m6, [pb_4] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2569 pand m7, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2570 pand m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2571 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2572 pxor m1, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2573 pxor m0, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2574 pcmpgtb m1, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2575 psubb m0, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2576 psrlq m7, 3 ; +f2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2577 psrlq m0, 3 ; -f2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2578 pand m0, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2579 pandn m1, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2580 psubusb m3, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2581 paddusb m3, m1 ; p0+f2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2582 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2583 pxor m1, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2584 pxor m0, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2585 pcmpgtb m0, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2586 psubb m1, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2587 psrlq m6, 3 ; +f1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2588 psrlq m1, 3 ; -f1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2589 pand m1, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2590 pandn m0, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2591 psubusb m4, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2592 paddusb m4, m1 ; q0-f1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2593 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2594 ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2595 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2596 mova m7, [pb_1] |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2597 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2598 mova m7, [pw_63] |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2599 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2600 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2601 SWAP 1, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2602 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2603 mova m1, lim_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2604 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2605 pxor m0, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2606 mova m6, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2607 pcmpgtb m0, m1 ; which are negative |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2608 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2609 punpcklbw m6, m7 ; interleave with "1" for rounding |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2610 punpckhbw m1, m7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2611 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2612 punpcklbw m6, m0 ; signed byte->word |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2613 punpckhbw m1, m0 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2614 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2615 mova lim_sign, m0 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2616 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2617 mova m7, [pb_27_63] |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2618 %ifndef m8 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2619 mova lim_res, m1 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2620 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2621 %ifdef m10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2622 SWAP 0, 10 ; don't lose lim_sign copy |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2623 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2624 mova m0, m7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2625 pmaddubsw m7, m6 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2626 SWAP 6, 7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2627 pmaddubsw m0, m1 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2628 SWAP 1, 0 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2629 %ifdef m10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2630 SWAP 0, 10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2631 %else |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2632 mova m0, lim_sign |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2633 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2634 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2635 mova mask_res, m6 ; backup for later in filter |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2636 mova lim_res, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2637 pmullw m6, [pw_27] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2638 pmullw m1, [pw_27] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2639 paddw m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2640 paddw m1, m7 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2641 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2642 psraw m6, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2643 psraw m1, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2644 packsswb m6, m1 ; a0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2645 pxor m1, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2646 psubb m1, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2647 pand m1, m0 ; -a0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2648 pandn m0, m6 ; +a0 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2649 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2650 mova m6, [pb_18_63] ; pipelining |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2651 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2652 psubusb m3, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2653 paddusb m4, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2654 paddusb m3, m0 ; p0+a0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2655 psubusb m4, m0 ; q0-a0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2656 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2657 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2658 SWAP 6, 7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2659 %ifdef m10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2660 SWAP 1, 10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2661 %else |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2662 mova m1, lim_res |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2663 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2664 mova m0, m7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2665 pmaddubsw m7, m6 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2666 SWAP 6, 7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2667 pmaddubsw m0, m1 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2668 SWAP 1, 0 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2669 %ifdef m10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2670 SWAP 0, 10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2671 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2672 mova m0, lim_sign |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2673 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2674 mova m6, mask_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2675 mova m1, lim_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2676 pmullw m6, [pw_18] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2677 pmullw m1, [pw_18] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2678 paddw m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2679 paddw m1, m7 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2680 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2681 mova m0, lim_sign |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2682 psraw m6, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2683 psraw m1, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2684 packsswb m6, m1 ; a1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2685 pxor m1, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2686 psubb m1, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2687 pand m1, m0 ; -a1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2688 pandn m0, m6 ; +a1 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2689 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2690 mova m6, [pb_9_63] |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2691 %endif |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2692 psubusb m2, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2693 paddusb m5, m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2694 paddusb m2, m0 ; p1+a1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2695 psubusb m5, m0 ; q1-a1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2696 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2697 %if ssse3_or_higher |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2698 SWAP 6, 7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2699 %ifdef m10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2700 SWAP 1, 10 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2701 %else |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2702 mova m1, lim_res |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2703 %endif |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2704 mova m0, m7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2705 pmaddubsw m7, m6 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2706 SWAP 6, 7 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2707 pmaddubsw m0, m1 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2708 SWAP 1, 0 |
|
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2709 %else |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2710 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2711 SWAP 6, 12 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2712 SWAP 1, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2713 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2714 mova m6, mask_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2715 mova m1, lim_res |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2716 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2717 pmullw m6, [pw_9] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2718 pmullw m1, [pw_9] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2719 paddw m6, m7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2720 paddw m1, m7 |
|
12279
7fb91885433c
Use pmaddubsw for the mbedge_filter (>=ssse3), 6-10 cycles faster.
rbultje
parents:
12278
diff
changeset
|
2721 %endif |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2722 %ifdef m9 |
|
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2723 SWAP 7, 9 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2724 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2725 mova m7, lim_sign |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2726 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2727 psraw m6, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2728 psraw m1, 7 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2729 packsswb m6, m1 ; a2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2730 pxor m0, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2731 psubb m0, m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2732 pand m0, m7 ; -a2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2733 pandn m7, m6 ; +a2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2734 %ifdef m8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2735 SWAP 1, 13 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2736 SWAP 6, 14 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2737 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2738 mova m1, p2backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2739 mova m6, q2backup |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2740 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2741 psubusb m1, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2742 paddusb m6, m0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2743 paddusb m1, m7 ; p2+a2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2744 psubusb m6, m7 ; q2-a2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2745 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2746 ; store |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2747 %ifidn %2, v |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2748 movrow [dst2_reg+mstride_reg*4], m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2749 movrow [dst_reg +mstride_reg*2], m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2750 movrow [dst_reg +mstride_reg ], m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2751 movrow [dst_reg], m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2752 movrow [dst2_reg], m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2753 movrow [dst2_reg+ stride_reg ], m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2754 %if mmsize == 16 && %4 == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2755 add dst8_reg, mstride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2756 movhps [dst8_reg+mstride_reg*2], m1 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2757 movhps [dst8_reg+mstride_reg ], m2 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2758 movhps [dst8_reg], m3 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2759 add dst8_reg, stride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2760 movhps [dst8_reg], m4 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2761 movhps [dst8_reg+ stride_reg ], m5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2762 movhps [dst8_reg+ stride_reg*2], m6 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2763 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2764 %else ; h |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2765 inc dst_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2766 inc dst2_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2767 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2768 ; 4x8/16 transpose |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2769 TRANSPOSE4x4B 1, 2, 3, 4, 0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2770 SBUTTERFLY bw, 5, 6, 0 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2771 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2772 %if mmsize == 8 ; mmx/mmxext (h) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2773 WRITE_4x2D 1, 2, 3, 4, dst_reg, dst2_reg, mstride_reg, stride_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2774 add dst_reg, 4 |
|
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2775 WRITE_2x4W m5, m6, dst2_reg, dst_reg, mstride_reg, stride_reg |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2776 %else ; sse2 (h) |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2777 lea dst8_reg, [dst8_reg+mstride_reg+1] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2778 WRITE_4x4D 1, 2, 3, 4, dst_reg, dst2_reg, dst8_reg, mstride_reg, stride_reg, %4 |
|
12214
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2779 lea dst_reg, [dst2_reg+mstride_reg+4] |
|
657d353cd515
Fix and enable horizontal >=SSE2 mbedge loopfilter.
rbultje
parents:
12211
diff
changeset
|
2780 lea dst8_reg, [dst8_reg+mstride_reg+4] |
|
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2781 %ifidn %1, sse4 |
|
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2782 add dst2_reg, 4 |
|
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2783 %endif |
|
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2784 WRITE_8W m5, dst2_reg, dst_reg, mstride_reg, stride_reg |
|
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2785 %ifidn %1, sse4 |
|
12268
259988e7ad0f
Fix obvious bug in assignment. Somehow, the test vectors don't test this...
rbultje
parents:
12266
diff
changeset
|
2786 lea dst2_reg, [dst8_reg+ stride_reg] |
|
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2787 %endif |
|
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2788 WRITE_8W m6, dst2_reg, dst8_reg, mstride_reg, stride_reg |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2789 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2790 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2791 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2792 %if mmsize == 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2793 %if %4 == 8 ; chroma |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2794 %ifidn %2, h |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2795 sub dst_reg, 5 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2796 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2797 cmp dst_reg, dst8_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2798 mov dst_reg, dst8_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2799 jnz .next8px |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2800 %else |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2801 %ifidn %2, h |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2802 lea dst_reg, [dst_reg + stride_reg*8-5] |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2803 %else ; v |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2804 add dst_reg, 8 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2805 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2806 dec cnt_reg |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2807 jg .next8px |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2808 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2809 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2810 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2811 %ifndef m8 ; sse2 on x86-32 or mmx/mmxext |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2812 mov rsp, stack_reg ; restore stack pointer |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2813 %endif |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2814 RET |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2815 %endmacro |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2816 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2817 INIT_MMX |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2818 %define SPLATB_REG SPLATB_REG_MMX |
| 12210 | 2819 MBEDGE_LOOPFILTER mmx, v, 6, 16, 0 |
| 2820 MBEDGE_LOOPFILTER mmx, h, 6, 16, 0 | |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2821 MBEDGE_LOOPFILTER mmx, v, 6, 8, 0 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2822 MBEDGE_LOOPFILTER mmx, h, 6, 8, 0 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2823 |
|
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2824 %define SPLATB_REG SPLATB_REG_MMXEXT |
| 12210 | 2825 MBEDGE_LOOPFILTER mmxext, v, 6, 16, 0 |
| 2826 MBEDGE_LOOPFILTER mmxext, h, 6, 16, 0 | |
| 2827 MBEDGE_LOOPFILTER mmxext, v, 6, 8, 0 | |
| 2828 MBEDGE_LOOPFILTER mmxext, h, 6, 8, 0 | |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2829 |
|
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2830 INIT_XMM |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2831 %define SPLATB_REG SPLATB_REG_SSE2 |
|
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2832 %define WRITE_8W WRITE_8W_SSE2 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2833 MBEDGE_LOOPFILTER sse2, v, 5, 16, 15 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2834 %ifdef m8 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2835 MBEDGE_LOOPFILTER sse2, h, 5, 16, 15 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2836 %else |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2837 MBEDGE_LOOPFILTER sse2, h, 6, 16, 15 |
|
12205
d38e8565ba05
VP8 MBedge loopfilter MMX/MMX2/SSE2 functions for both luma (width=16)
rbultje
parents:
12204
diff
changeset
|
2838 %endif |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2839 MBEDGE_LOOPFILTER sse2, v, 6, 8, 15 |
|
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2840 MBEDGE_LOOPFILTER sse2, h, 6, 8, 15 |
| 12210 | 2841 |
|
12266
48d6738904a9
Fix SPLATB_REG mess. Used to be a if/elseif/elseif/elseif spaghetti, so this
rbultje
parents:
12241
diff
changeset
|
2842 %define SPLATB_REG SPLATB_REG_SSSE3 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2843 MBEDGE_LOOPFILTER ssse3, v, 5, 16, 15 |
| 12210 | 2844 %ifdef m8 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2845 MBEDGE_LOOPFILTER ssse3, h, 5, 16, 15 |
| 12210 | 2846 %else |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2847 MBEDGE_LOOPFILTER ssse3, h, 6, 16, 15 |
| 12210 | 2848 %endif |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2849 MBEDGE_LOOPFILTER ssse3, v, 6, 8, 15 |
|
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2850 MBEDGE_LOOPFILTER ssse3, h, 6, 8, 15 |
|
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2851 |
|
12272
dd90555c98fd
Split pextrw macro-spaghetti into several opt-specific macros, this will make
rbultje
parents:
12268
diff
changeset
|
2852 %define WRITE_8W WRITE_8W_SSE4 |
|
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2853 %ifdef m8 |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2854 MBEDGE_LOOPFILTER sse4, h, 5, 16, 15 |
|
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2855 %else |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2856 MBEDGE_LOOPFILTER sse4, h, 6, 16, 15 |
|
12227
d07e6037846d
Use pextrw for SSE4 mbedge filter result writing, speedup 5-10cycles on
rbultje
parents:
12214
diff
changeset
|
2857 %endif |
|
12275
709d5848abf8
Save a register (or regsize of stackspace for x86-32) for the no-loop
rbultje
parents:
12274
diff
changeset
|
2858 MBEDGE_LOOPFILTER sse4, h, 6, 8, 15 |
