annotate x86/dsputil_yasm.asm @ 12530:63edd10ad4bc libavcodec tip
Try to fix crashes introduced by r25218
r25218 made assumptions about the existence of past reference frames that
weren't necessarily true.
| field | value |
|---|---|
| author | darkshikari |
| date | Tue, 28 Sep 2010 09:06:22 +0000 |
| parents | 980030a3e315 |
| children | |
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
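; Naming convention for the pshufb masks above: each character is a source
; byte index in hex, and 'z' marks a -1 byte, which pshufb turns into zero
; (the high bit of the mask byte being set).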

section .text align=16

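; PSWAPD dst, src copies src to dst with its two 32-bit halves swapped. The
; SSE variant does it in a single pshufw; the 3DNow! variant emulates it with
; a shift+unpack. Extended 3DNow! has a native pswapd instruction, used by
; the 3dn2 instantiation below.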
%macro PSWAPD_SSE 2
    pshufw    %1, %2, 0x4e
%endmacro
%macro PSWAPD_3DN1 2
    movq      %1, %2
    psrlq     %1, 32
    punpckldq %1, %2
%endmacro

%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
%ifdef ARCH_X86_64
    %define lend r10d
    mov     lend, r2d
%else
    %define lend dword r2m
%endif
    mov src1q, [srcq+1*gprsize]
    mov src2q, [srcq+2*gprsize]
    mov src3q, [srcq+3*gprsize]
    mov src4q, [srcq+4*gprsize]
    mov src5q, [srcq+5*gprsize]
    mov srcq,  [srcq]
    sub src1q, srcq
    sub src2q, srcq
    sub src3q, srcq
    sub src4q, srcq
    sub src5q, srcq
.loop:
    cvtps2pi   mm0, [srcq]
    cvtps2pi   mm1, [srcq+src1q]
    cvtps2pi   mm2, [srcq+src2q]
    cvtps2pi   mm3, [srcq+src3q]
    cvtps2pi   mm4, [srcq+src4q]
    cvtps2pi   mm5, [srcq+src5q]
    packssdw   mm0, mm3
    packssdw   mm1, mm4
    packssdw   mm2, mm5
    pswapd     mm3, mm0
    punpcklwd  mm0, mm1
    punpckhwd  mm1, mm2
    punpcklwd  mm2, mm3
    pswapd     mm3, mm0
    punpckldq  mm0, mm2
    punpckhdq  mm2, mm1
    punpckldq  mm1, mm3
    movq [dstq   ], mm0
    movq [dstq+16], mm2
    movq [dstq+ 8], mm1
    add srcq, 8
    add dstq, 24
    sub lend, 2
    jg .loop
    emms
    RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6

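; Three instantiations share the macro body: the SSE build uses cvtps2pi and
; the pshufw-based swap, the 3DNow! builds substitute pf2id for cvtps2pi, and
; the 3dn2 build drops the emulation macro so that pswapd resolves to the
; native extended-3DNow! instruction.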
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id
%define pswapd PSWAPD_3DN1
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi

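In C terms, each instantiation performs the planar-to-interleaved conversion
below. This is a minimal sketch for illustration (the `_ref` name is the
editor's, not FFmpeg's; the asm converts with cvtps2pi/pf2id, whose rounding
the truncation here only approximates):

```c
#include <stdint.h>

/* Sketch: read one float from each of 6 planar channels, convert to int16
 * with saturation, and write the samples interleaved. */
static void float_to_int16_interleave6_ref(int16_t *dst, const float **src, int len)
{
    for (int i = 0; i < len; i++)
        for (int c = 0; c < 6; c++) {
            int32_t v = (int32_t)src[c][i];
            if (v >  32767) v =  32767;   /* packssdw saturates to int16 */
            if (v < -32768) v = -32768;
            dst[6 * i + c] = (int16_t)v;
        }
}
```
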

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    movd    m3, shiftm
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    psrad   m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad   m2, m3
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

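INIT_MMX and INIT_XMM instantiate the macro with 8-byte and 16-byte registers
respectively. For reference, the two routines compute the following; a plain-C
sketch reconstructed from the prototype comments above (the `_ref` names are
the editor's, and the asm reduces and shifts partial sums in a different
order, so rounding of negative sums can differ):

```c
#include <stdint.h>

/* Sketch of scalarproduct_int16: dot product of two int16 vectors,
 * accumulated in 32 bits, shifted right at the end. */
static int scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
                                   int order, int shift)
{
    int32_t sum = 0;
    for (int i = 0; i < order; i++)
        sum += v1[i] * v2[i];
    return sum >> shift;
}

/* Sketch of scalarproduct_and_madd_int16: the same dot product (using the
 * old v1 values), fused with v1[i] += mul * v3[i] in wrapping int16
 * arithmetic, as pmullw/paddw do. */
static int scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
                                            const int16_t *v3, int order, int mul)
{
    int32_t sum = 0;
    for (int i = 0; i < order; i++) {
        sum  += v1[i] * v2[i];
        v1[i] = (int16_t)(v1[i] + mul * v3[i]);
    }
    return sum;
}
```
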
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
    %define t0 [v1q + orderq]
    %define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
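; Each SCALARPRODUCT_LOOP variant reads v2/v3 with aligned loads and rebuilds
; the misaligned stream via palignr, which is cheaper than unaligned loads on
; SSSE3-era CPUs. Only even offsets occur because the int16 pointers are
; 2-byte aligned; dispatching on v2's offset alone assumes v2 and v3 are
; misaligned by the same amount, which callers must guarantee.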
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET


; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET

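The routine above is HuffYUV's median predictor: each byte is predicted as
the median of the left neighbour, the top neighbour, and left + top - topleft,
and the residual from `diff` is added. A plain-C sketch for illustration (the
`mid` helper and `_ref` name are the editor's; all arithmetic is modulo 256,
matching the byte-wise asm):

```c
#include <stdint.h>

/* Median of three: clamp c into [min(a,b), max(a,b)]. */
static uint8_t mid(uint8_t a, uint8_t b, uint8_t c)
{
    uint8_t mx = a > b ? a : b, mn = a > b ? b : a;
    return c > mx ? mx : (c < mn ? mn : c);
}

/* Sketch: median prediction plus residual; left/left_top carry the running
 * state across calls, as the asm's final stores do. */
static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
                                           const uint8_t *diff, int w,
                                           int *left, int *left_top)
{
    uint8_t l = (uint8_t)*left, tl = (uint8_t)*left_top;
    for (int i = 0; i < w; i++) {
        uint8_t pred = mid(l, top[i], (uint8_t)(l + top[i] - tl));
        l  = (uint8_t)(pred + diff[i]);
        tl = top[i];
        dst[i] = l;
    }
    *left = l;
    *left_top = tl;
}
```
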
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
    mova    m1, [srcq+wq]
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

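The loop computes a byte-wise prefix sum in doubling steps: shift-and-add by
one byte (psllw 8 + paddb), then pshufb/paddb steps at strides 2 and 4 (and 8
for XMM), while m0 broadcasts the previous vector's last byte as the running
total. In plain C the whole function reduces to the sketch below (an
illustration inferred from the prototype comment that follows, not FFmpeg's
reference implementation):

```c
#include <stdint.h>

/* Sketch: left (running-sum) prediction. Each output byte is the prefix sum
 * of the input plus the initial accumulator, modulo 256; the final
 * accumulator is returned so the caller can chain calls. */
static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
                                        int w, int left)
{
    uint8_t acc = (uint8_t)left;
    for (int i = 0; i < w; i++) {
        acc = (uint8_t)(acc + src[i]);
        dst[i] = acc;
    }
    return acc;
}
```
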
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
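; If src is not 16-byte aligned, reuse the MMX body above, whose 8-byte
; accesses have no alignment requirement; entering at .skip_prologue works
; because both versions declare the same 3,3,7 prologue.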
    test    dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0


; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%ifndef ARCH_X86_64
    movd    r0m,  xmm0
    fld     dword r0m
%endif
    RET
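In C, the routine is a single-precision dot product over len floats, sketched
below for illustration (the `_ref` name is the editor's; the asm additionally
assumes 16-byte-aligned inputs and a multiple-of-4 length, since it steps 16
bytes at a time with movaps). The %ifndef block exists because the 32-bit ABI
returns floats in st0, so the SSE result is bounced through memory into the
x87 stack; on x86_64 it is already in xmm0.

```c
/* Sketch of scalarproduct_float: sum of elementwise products. The asm keeps
 * four partial sums in parallel and reduces them at the end, so rounding may
 * differ slightly from this serial version. */
static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
{
    float sum = 0.0f;
    for (int i = 0; i < len; i++)
        sum += v1[i] * v2[i];
    return sum;
}
```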
