Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 2039:f25e485a7850 libavcodec
mmx optimized version of the per line/accurate deblock filter
vertical default mmx deblock filter fix
| author | michael |
|---|---|
| date | Thu, 27 May 2004 21:42:00 +0000 |
| parents | 02b59a3c62cd |
| children | 5de466b3360e |
comparison
equal
deleted
inserted
replaced
| 2038:02b59a3c62cd | 2039:f25e485a7850 |
|---|---|
| 1029 "movq %%mm1, %%mm6 \n\t" | 1029 "movq %%mm1, %%mm6 \n\t" |
| 1030 "psubusw %%mm3, %%mm6 \n\t" | 1030 "psubusw %%mm3, %%mm6 \n\t" |
| 1031 "psubw %%mm6, %%mm1 \n\t" | 1031 "psubw %%mm6, %%mm1 \n\t" |
| 1032 #endif | 1032 #endif |
| 1033 | 1033 |
| 1034 "movd %2, %%mm2 \n\t" // QP | |
| 1035 "punpcklbw %%mm7, %%mm2 \n\t" | |
| 1036 | |
| 1034 "movq %%mm7, %%mm6 \n\t" // 0 | 1037 "movq %%mm7, %%mm6 \n\t" // 0 |
| 1035 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | 1038 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
| 1036 "pxor %%mm6, %%mm4 \n\t" | 1039 "pxor %%mm6, %%mm4 \n\t" |
| 1037 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | 1040 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
| 1038 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | 1041 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
| 1039 "pxor %%mm7, %%mm5 \n\t" | 1042 "pxor %%mm7, %%mm5 \n\t" |
| 1040 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | 1043 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
| 1041 // 100 opcodes | 1044 // 100 opcodes |
| 1042 "movd %2, %%mm2 \n\t" // QP | |
| 1043 "psllw $3, %%mm2 \n\t" // 8QP | 1045 "psllw $3, %%mm2 \n\t" // 8QP |
| 1044 "movq %%mm2, %%mm3 \n\t" // 8QP | 1046 "movq %%mm2, %%mm3 \n\t" // 8QP |
| 1045 "pcmpgtw %%mm4, %%mm2 \n\t" | 1047 "pcmpgtw %%mm4, %%mm2 \n\t" |
| 1046 "pcmpgtw %%mm5, %%mm3 \n\t" | 1048 "pcmpgtw %%mm5, %%mm3 \n\t" |
| 1047 "pand %%mm2, %%mm4 \n\t" | 1049 "pand %%mm2, %%mm4 \n\t" |
| 2608 } | 2610 } |
| 2609 } | 2611 } |
| 2610 #endif | 2612 #endif |
| 2611 } | 2613 } |
| 2612 | 2614 |
| 2615 #ifdef HAVE_MMX | |
| 2616 /** | |
| 2617 * accurate deblock filter (MMX implementation, 32-bit x86 inline asm: uses leal, %eax, %ecx, %esp) | |
| * | |
| * Processing order, as written below: | |
| * 1. src is advanced by 3*step. The first large asm block reads ten lines (0..9, step bytes apart, | |
| *    8 bytes each) and stores two 8-byte masks, one byte per column: | |
| *    - eq_mask: 0xFF where the count of neighbouring-line pairs passing the | |
| *      (difference + mmxDcOffset) > mmxDcThreshold test exceeds c->ppMode.flatnessThreshold | |
| *    - dc_mask: 0xFF where (max - min) over lines 1..8 is below 2*QP, QP taken from c->pQPb | |
| *      (presumably PMAXUB/PMINUB keep the running max/min in mm4/mm3 - macros not visible here) | |
| * 2. src is advanced by one more step. If eq_mask is not all ones, the second asm block computes a | |
| *    correction d and applies line3 += d, line4 -= d, only in columns whose eq_mask byte is clear. | |
| * 3. The C loop visits 8 columns (src += stride each time); where bit 0 of both the eq_mask and | |
| *    dc_mask byte is set it rewrites src[0*step]..src[7*step] with a 16-weight smoothing that uses | |
| *    src[-1*step] and src[8*step] as end samples when they are within QP of their neighbour. | |
| * | |
| * @param src    pointer into the picture; lines are step bytes apart, columns stride bytes apart | |
| * @param step   byte distance between consecutive lines across the edge | |
| * @param stride byte distance between the 8 columns walked by the C loop | |
| * @param c      context; this function reads QP, nonBQP, mmxDcOffset, mmxDcThreshold, pQPb and | |
| *               ppMode.flatnessThreshold | |
| * | |
| * NOTE(review): the asm loads 8 adjacent bytes per line, so the masks only line up with the C loop | |
| *               when stride == 1 - confirm every caller of this version passes 1. | |
| * NOTE(review): mm6/mm7 are loaded in one asm statement and consumed in the next; this relies on the | |
| *               compiler not touching MMX registers in between - confirm. | |
| * NOTE(review): the second asm block stores through src but declares no "memory" clobber - confirm | |
| *               the following C reads of src[] cannot be affected. | |
| 2618 */ | |
| 2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ | |
| 2620 int y; | |
| 2621 const int QP= c->QP; | |
| 2622 int64_t dc_mask, eq_mask; | |
| 2623 src+= step*3; // src points to begin of the 8x8 Block | |
| 2624 //START_TIMER | |
| 2625 asm volatile( | |
| 2626 "movq %0, %%mm7 \n\t" /* mm7 = c->mmxDcOffset[c->nonBQP] */ | |
| 2627 "movq %1, %%mm6 \n\t" /* mm6 = c->mmxDcThreshold[c->nonBQP] */ | |
| 2628 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) | |
| 2629 ); | |
| 2630 | |
| 2631 asm volatile( | |
| 2632 "leal (%2, %3), %%eax \n\t" | |
| 2633 // line: 0 1 2 3 4 5 6 7 8 9 | |
| 2634 // addr: %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (%2 = src, %3 = step, eax' = eax after the leal at 2660) | |
| 2635 | |
| 2636 "movq (%2), %%mm0 \n\t" | |
| 2637 "movq (%%eax), %%mm1 \n\t" | |
| 2638 "movq %%mm1, %%mm3 \n\t" | |
| 2639 "movq %%mm1, %%mm4 \n\t" | |
| 2640 "psubb %%mm1, %%mm0 \n\t" // mm0 = difference | |
| 2641 "paddb %%mm7, %%mm0 \n\t" | |
| 2642 "pcmpgtb %%mm6, %%mm0 \n\t" | |
| 2643 | |
| 2644 "movq (%%eax,%3), %%mm2 \n\t" | |
| 2645 PMAXUB(%%mm2, %%mm4) | |
| 2646 PMINUB(%%mm2, %%mm3, %%mm5) | |
| 2647 "psubb %%mm2, %%mm1 \n\t" | |
| 2648 "paddb %%mm7, %%mm1 \n\t" | |
| 2649 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 2650 "paddb %%mm1, %%mm0 \n\t" | |
| 2651 | |
| 2652 "movq (%%eax, %3, 2), %%mm1 \n\t" | |
| 2653 PMAXUB(%%mm1, %%mm4) | |
| 2654 PMINUB(%%mm1, %%mm3, %%mm5) | |
| 2655 "psubb %%mm1, %%mm2 \n\t" | |
| 2656 "paddb %%mm7, %%mm2 \n\t" | |
| 2657 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 2658 "paddb %%mm2, %%mm0 \n\t" | |
| 2659 | |
| 2660 "leal (%%eax, %3, 4), %%eax \n\t" | |
| 2661 | |
| 2662 "movq (%2, %3, 4), %%mm2 \n\t" | |
| 2663 PMAXUB(%%mm2, %%mm4) | |
| 2664 PMINUB(%%mm2, %%mm3, %%mm5) | |
| 2665 "psubb %%mm2, %%mm1 \n\t" | |
| 2666 "paddb %%mm7, %%mm1 \n\t" | |
| 2667 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 2668 "paddb %%mm1, %%mm0 \n\t" | |
| 2669 | |
| 2670 "movq (%%eax), %%mm1 \n\t" | |
| 2671 PMAXUB(%%mm1, %%mm4) | |
| 2672 PMINUB(%%mm1, %%mm3, %%mm5) | |
| 2673 "psubb %%mm1, %%mm2 \n\t" | |
| 2674 "paddb %%mm7, %%mm2 \n\t" | |
| 2675 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 2676 "paddb %%mm2, %%mm0 \n\t" | |
| 2677 | |
| 2678 "movq (%%eax, %3), %%mm2 \n\t" | |
| 2679 PMAXUB(%%mm2, %%mm4) | |
| 2680 PMINUB(%%mm2, %%mm3, %%mm5) | |
| 2681 "psubb %%mm2, %%mm1 \n\t" | |
| 2682 "paddb %%mm7, %%mm1 \n\t" | |
| 2683 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 2684 "paddb %%mm1, %%mm0 \n\t" | |
| 2685 | |
| 2686 "movq (%%eax, %3, 2), %%mm1 \n\t" | |
| 2687 PMAXUB(%%mm1, %%mm4) | |
| 2688 PMINUB(%%mm1, %%mm3, %%mm5) | |
| 2689 "psubb %%mm1, %%mm2 \n\t" | |
| 2690 "paddb %%mm7, %%mm2 \n\t" | |
| 2691 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 2692 "paddb %%mm2, %%mm0 \n\t" | |
| 2693 | |
| 2694 "movq (%2, %3, 8), %%mm2 \n\t" | |
| 2695 PMAXUB(%%mm2, %%mm4) | |
| 2696 PMINUB(%%mm2, %%mm3, %%mm5) | |
| 2697 "psubb %%mm2, %%mm1 \n\t" | |
| 2698 "paddb %%mm7, %%mm1 \n\t" | |
| 2699 "pcmpgtb %%mm6, %%mm1 \n\t" | |
| 2700 "paddb %%mm1, %%mm0 \n\t" | |
| 2701 | |
| 2702 "movq (%%eax, %3, 4), %%mm1 \n\t" | |
| 2703 "psubb %%mm1, %%mm2 \n\t" | |
| 2704 "paddb %%mm7, %%mm2 \n\t" | |
| 2705 "pcmpgtb %%mm6, %%mm2 \n\t" | |
| 2706 "paddb %%mm2, %%mm0 \n\t" | |
| 2707 "psubusb %%mm3, %%mm4 \n\t" | |
| 2708 | |
| 2709 "movq %4, %%mm7 \n\t" // QP,..., QP | |
| 2710 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP | |
| 2711 "pcmpgtb %%mm4, %%mm7 \n\t" // Diff < 2QP -> FF | |
| 2712 "movq %%mm7, %1 \n\t" /* store dc_mask */ | |
| 2713 | |
| 2714 "pxor %%mm6, %%mm6 \n\t" | |
| 2715 "movq %5, %%mm7 \n\t" | |
| 2716 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 2717 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 2718 "punpcklbw %%mm7, %%mm7 \n\t" | |
| 2719 "psubb %%mm0, %%mm6 \n\t" | |
| 2720 "pcmpgtb %%mm7, %%mm6 \n\t" | |
| 2721 "movq %%mm6, %0 \n\t" /* store eq_mask */ | |
| 2722 | |
| 2723 : "=m" (eq_mask), "=m" (dc_mask) | |
| 2724 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) | |
| 2725 : "%eax" | |
| 2726 ); | |
| 2727 | |
| 2728 src+= step; // one more line: src is now 4*step past the pointer passed in | |
| 2729 | |
| 2730 if(eq_mask != -1LL){ | |
| 2731 asm volatile( | |
| 2732 "pxor %%mm7, %%mm7 \n\t" | |
| 2733 "leal -40(%%esp), %%ecx \n\t" // scratch for 4 8-byte vars, taken below %esp (NOTE(review): not reserved on the stack - confirm this is safe) | |
| 2734 "andl $0xFFFFFFF8, %%ecx \n\t" // align | |
| 2735 // line: 0 1 2 3 4 5 6 7 | |
| 2736 // addr: %0 %0+%1 eax eax+%1 eax+2%1 %0'+2%1 eax+4%1 %0'+4%1 (%0 = src, %1 = step, %0' = eax+%1 after the leal at 2792; ecx is the scratch pointer) | |
| 2737 | |
| 2738 "movq (%0), %%mm0 \n\t" | |
| 2739 "movq %%mm0, %%mm1 \n\t" | |
| 2740 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 | |
| 2741 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 | |
| 2742 | |
| 2743 "movq (%0, %1), %%mm2 \n\t" | |
| 2744 "leal (%0, %1, 2), %%eax \n\t" | |
| 2745 "movq %%mm2, %%mm3 \n\t" | |
| 2746 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 | |
| 2747 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 | |
| 2748 | |
| 2749 "movq (%%eax), %%mm4 \n\t" | |
| 2750 "movq %%mm4, %%mm5 \n\t" | |
| 2751 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 | |
| 2752 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 | |
| 2753 | |
| 2754 "paddw %%mm0, %%mm0 \n\t" // 2L0 | |
| 2755 "paddw %%mm1, %%mm1 \n\t" // 2H0 | |
| 2756 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 | |
| 2757 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 | |
| 2758 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 | |
| 2759 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 | |
| 2760 | |
| 2761 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 | |
| 2762 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 | |
| 2763 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 | |
| 2764 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 | |
| 2765 | |
| 2766 "movq (%%eax, %1), %%mm2 \n\t" | |
| 2767 "movq %%mm2, %%mm3 \n\t" | |
| 2768 "punpcklbw %%mm7, %%mm2 \n\t" // L3 | |
| 2769 "punpckhbw %%mm7, %%mm3 \n\t" // H3 | |
| 2770 | |
| 2771 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 | |
| 2772 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 | |
| 2773 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 2774 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 2775 "movq %%mm0, (%%ecx) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 2776 "movq %%mm1, 8(%%ecx) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 2777 | |
| 2778 "movq (%%eax, %1, 2), %%mm0 \n\t" | |
| 2779 "movq %%mm0, %%mm1 \n\t" | |
| 2780 "punpcklbw %%mm7, %%mm0 \n\t" // L4 | |
| 2781 "punpckhbw %%mm7, %%mm1 \n\t" // H4 | |
| 2782 | |
| 2783 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 | |
| 2784 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 | |
| 2785 "movq %%mm2, 16(%%ecx) \n\t" // L3 - L4 | |
| 2786 "movq %%mm3, 24(%%ecx) \n\t" // H3 - H4 | |
| 2787 "paddw %%mm4, %%mm4 \n\t" // 2L2 | |
| 2788 "paddw %%mm5, %%mm5 \n\t" // 2H2 | |
| 2789 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 | |
| 2790 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 | |
| 2791 | |
| 2792 "leal (%%eax, %1), %0 \n\t" /* %0 (src) now points at line 3 */ | |
| 2793 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 | |
| 2794 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 | |
| 2795 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 | |
| 2796 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 | |
| 2797 //50 opcodes so far | |
| 2798 "movq (%0, %1, 2), %%mm2 \n\t" | |
| 2799 "movq %%mm2, %%mm3 \n\t" | |
| 2800 "punpcklbw %%mm7, %%mm2 \n\t" // L5 | |
| 2801 "punpckhbw %%mm7, %%mm3 \n\t" // H5 | |
| 2802 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 | |
| 2803 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 | |
| 2804 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 | |
| 2805 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 | |
| 2806 | |
| 2807 "movq (%%eax, %1, 4), %%mm6 \n\t" | |
| 2808 "punpcklbw %%mm7, %%mm6 \n\t" // L6 | |
| 2809 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 | |
| 2810 "movq (%%eax, %1, 4), %%mm6 \n\t" | |
| 2811 "punpckhbw %%mm7, %%mm6 \n\t" // H6 | |
| 2812 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 | |
| 2813 | |
| 2814 "paddw %%mm0, %%mm0 \n\t" // 2L4 | |
| 2815 "paddw %%mm1, %%mm1 \n\t" // 2H4 | |
| 2816 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 | |
| 2817 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 | |
| 2818 | |
| 2819 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 | |
| 2820 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 | |
| 2821 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 | |
| 2822 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 | |
| 2823 | |
| 2824 "movq (%0, %1, 4), %%mm2 \n\t" | |
| 2825 "movq %%mm2, %%mm3 \n\t" | |
| 2826 "punpcklbw %%mm7, %%mm2 \n\t" // L7 | |
| 2827 "punpckhbw %%mm7, %%mm3 \n\t" // H7 | |
| 2828 | |
| 2829 "paddw %%mm2, %%mm2 \n\t" // 2L7 | |
| 2830 "paddw %%mm3, %%mm3 \n\t" // 2H7 | |
| 2831 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 | |
| 2832 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 | |
| 2833 | |
| 2834 "movq (%%ecx), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 | |
| 2835 "movq 8(%%ecx), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 | |
| 2836 | |
| 2837 #ifdef HAVE_MMX2 | |
| 2838 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2839 "psubw %%mm0, %%mm6 \n\t" | |
| 2840 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 2841 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2842 "psubw %%mm1, %%mm6 \n\t" | |
| 2843 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 2844 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2845 "psubw %%mm2, %%mm6 \n\t" | |
| 2846 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 2847 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2848 "psubw %%mm3, %%mm6 \n\t" | |
| 2849 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 2850 #else | |
| 2851 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2852 "pcmpgtw %%mm0, %%mm6 \n\t" | |
| 2853 "pxor %%mm6, %%mm0 \n\t" | |
| 2854 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| | |
| 2855 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2856 "pcmpgtw %%mm1, %%mm6 \n\t" | |
| 2857 "pxor %%mm6, %%mm1 \n\t" | |
| 2858 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| | |
| 2859 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2860 "pcmpgtw %%mm2, %%mm6 \n\t" | |
| 2861 "pxor %%mm6, %%mm2 \n\t" | |
| 2862 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| | |
| 2863 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2864 "pcmpgtw %%mm3, %%mm6 \n\t" | |
| 2865 "pxor %%mm6, %%mm3 \n\t" | |
| 2866 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| | |
| 2867 #endif | |
| 2868 | |
| 2869 #ifdef HAVE_MMX2 | |
| 2870 "pminsw %%mm2, %%mm0 \n\t" | |
| 2871 "pminsw %%mm3, %%mm1 \n\t" | |
| 2872 #else | |
| 2873 "movq %%mm0, %%mm6 \n\t" | |
| 2874 "psubusw %%mm2, %%mm6 \n\t" | |
| 2875 "psubw %%mm6, %%mm0 \n\t" | |
| 2876 "movq %%mm1, %%mm6 \n\t" | |
| 2877 "psubusw %%mm3, %%mm6 \n\t" | |
| 2878 "psubw %%mm6, %%mm1 \n\t" | |
| 2879 #endif | |
| 2880 | |
| 2881 "movd %2, %%mm2 \n\t" // QP | |
| 2882 "punpcklbw %%mm7, %%mm2 \n\t" | |
| 2883 | |
| 2884 "movq %%mm7, %%mm6 \n\t" // 0 | |
| 2885 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) | |
| 2886 "pxor %%mm6, %%mm4 \n\t" | |
| 2887 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| | |
| 2888 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) | |
| 2889 "pxor %%mm7, %%mm5 \n\t" | |
| 2890 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| | |
| 2891 // 100 opcodes | |
| 2892 "psllw $3, %%mm2 \n\t" // 8QP | |
| 2893 "movq %%mm2, %%mm3 \n\t" // 8QP | |
| 2894 "pcmpgtw %%mm4, %%mm2 \n\t" | |
| 2895 "pcmpgtw %%mm5, %%mm3 \n\t" | |
| 2896 "pand %%mm2, %%mm4 \n\t" | |
| 2897 "pand %%mm3, %%mm5 \n\t" | |
| 2898 | |
| 2899 | |
| 2900 "psubusw %%mm0, %%mm4 \n\t" // low half: max(|middle| - min(|left|, |right|), 0) | |
| 2901 "psubusw %%mm1, %%mm5 \n\t" // high half: max(|middle| - min(|left|, |right|), 0) | |
| 2902 | |
| 2903 | |
| 2904 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 | |
| 2905 "pmullw %%mm2, %%mm4 \n\t" | |
| 2906 "pmullw %%mm2, %%mm5 \n\t" | |
| 2907 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 | |
| 2908 "paddw %%mm2, %%mm4 \n\t" | |
| 2909 "paddw %%mm2, %%mm5 \n\t" | |
| 2910 "psrlw $6, %%mm4 \n\t" | |
| 2911 "psrlw $6, %%mm5 \n\t" | |
| 2912 | |
| 2913 "movq 16(%%ecx), %%mm0 \n\t" // L3 - L4 | |
| 2914 "movq 24(%%ecx), %%mm1 \n\t" // H3 - H4 | |
| 2915 | |
| 2916 "pxor %%mm2, %%mm2 \n\t" | |
| 2917 "pxor %%mm3, %%mm3 \n\t" | |
| 2918 | |
| 2919 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) | |
| 2920 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) | |
| 2921 "pxor %%mm2, %%mm0 \n\t" | |
| 2922 "pxor %%mm3, %%mm1 \n\t" | |
| 2923 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| | |
| 2924 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| | |
| 2925 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 | |
| 2926 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 | |
| 2927 | |
| 2928 "pxor %%mm6, %%mm2 \n\t" | |
| 2929 "pxor %%mm7, %%mm3 \n\t" | |
| 2930 "pand %%mm2, %%mm4 \n\t" | |
| 2931 "pand %%mm3, %%mm5 \n\t" | |
| 2932 | |
| 2933 #ifdef HAVE_MMX2 | |
| 2934 "pminsw %%mm0, %%mm4 \n\t" | |
| 2935 "pminsw %%mm1, %%mm5 \n\t" | |
| 2936 #else | |
| 2937 "movq %%mm4, %%mm2 \n\t" | |
| 2938 "psubusw %%mm0, %%mm2 \n\t" | |
| 2939 "psubw %%mm2, %%mm4 \n\t" | |
| 2940 "movq %%mm5, %%mm2 \n\t" | |
| 2941 "psubusw %%mm1, %%mm2 \n\t" | |
| 2942 "psubw %%mm2, %%mm5 \n\t" | |
| 2943 #endif | |
| 2944 "pxor %%mm6, %%mm4 \n\t" | |
| 2945 "pxor %%mm7, %%mm5 \n\t" | |
| 2946 "psubw %%mm6, %%mm4 \n\t" | |
| 2947 "psubw %%mm7, %%mm5 \n\t" | |
| 2948 "packsswb %%mm5, %%mm4 \n\t" | |
| 2949 "movq %3, %%mm1 \n\t" /* eq_mask */ | |
| 2950 "pandn %%mm4, %%mm1 \n\t" /* keep the correction only where the eq_mask byte is clear */ | |
| 2951 "movq (%0), %%mm0 \n\t" | |
| 2952 "paddb %%mm1, %%mm0 \n\t" | |
| 2953 "movq %%mm0, (%0) \n\t" /* line 3 += correction */ | |
| 2954 "movq (%0, %1), %%mm0 \n\t" | |
| 2955 "psubb %%mm1, %%mm0 \n\t" | |
| 2956 "movq %%mm0, (%0, %1) \n\t" /* line 4 -= correction */ | |
| 2957 | |
| 2958 : "+r" (src) | |
| 2959 : "r" (step), "m" (c->pQPb), "m"(eq_mask) | |
| 2960 : "%eax", "%ecx" | |
| 2961 ); | |
| 2962 src-= 3*step; //reverse src change from asm | |
| 2963 } | |
| 2964 | |
| 2965 for(y=0; y<8; y++){ /* one pass per column; byte y of each mask is tested for column y */ | |
| 2966 if((eq_mask>>(y*8))&1){ | |
| 2967 if((dc_mask>>(y*8))&1){ | |
| 2968 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; /* outer neighbour only if within QP of the first sample */ | |
| 2969 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; /* same rule at the far end */ | |
| 2970 | |
| 2971 int sums[10]; /* sliding 7-sample sums padded with first/last; each carries a +4 rounding term */ | |
| 2972 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; | |
| 2973 sums[1] = sums[0] - first + src[3*step]; | |
| 2974 sums[2] = sums[1] - first + src[4*step]; | |
| 2975 sums[3] = sums[2] - first + src[5*step]; | |
| 2976 sums[4] = sums[3] - first + src[6*step]; | |
| 2977 sums[5] = sums[4] - src[0*step] + src[7*step]; | |
| 2978 sums[6] = sums[5] - src[1*step] + last; | |
| 2979 sums[7] = sums[6] - src[2*step] + last; | |
| 2980 sums[8] = sums[7] - src[3*step] + last; | |
| 2981 sums[9] = sums[8] - src[4*step] + last; | |
| 2982 | |
| 2983 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; /* weights total 7+7+2 = 16, rounding 4+4 = 8 */ | |
| 2984 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; | |
| 2985 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; | |
| 2986 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; | |
| 2987 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; | |
| 2988 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; | |
| 2989 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; | |
| 2990 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; | |
| 2991 } | |
| 2992 } | |
| 2993 | |
| 2994 src += stride; | |
| 2995 } | |
| 2996 /*if(step==16){ | |
| 2997 STOP_TIMER("step16") | |
| 2998 }else{ | |
| 2999 STOP_TIMER("stepX") | |
| 3000 }*/ | |
| 3001 } | |
| 3002 #endif //HAVE_MMX | |
| 3003 | |
| 2613 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, | 3004 static void RENAME(postProcess)(uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
| 2614 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); | 3005 QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); |
| 2615 | 3006 |
| 2616 /** | 3007 /** |
| 2617 * Copies a block from src to dst and fixes the blacklevel | 3008 * Copies a block from src to dst and fixes the blacklevel |
| 3111 if(t==1) | 3502 if(t==1) |
| 3112 RENAME(doVertLowPass)(dstBlock, stride, &c); | 3503 RENAME(doVertLowPass)(dstBlock, stride, &c); |
| 3113 else if(t==2) | 3504 else if(t==2) |
| 3114 RENAME(doVertDefFilter)(dstBlock, stride, &c); | 3505 RENAME(doVertDefFilter)(dstBlock, stride, &c); |
| 3115 }else if(mode & V_A_DEBLOCK){ | 3506 }else if(mode & V_A_DEBLOCK){ |
| 3116 do_a_deblock(dstBlock, stride, 1, &c); | 3507 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); |
| 3117 } | 3508 } |
| 3118 } | 3509 } |
| 3119 | 3510 |
| 3120 #ifdef HAVE_MMX | 3511 #ifdef HAVE_MMX |
| 3121 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); | 3512 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
| 3134 if(t==1) | 3525 if(t==1) |
| 3135 RENAME(doVertLowPass)(tempBlock1, 16, &c); | 3526 RENAME(doVertLowPass)(tempBlock1, 16, &c); |
| 3136 else if(t==2) | 3527 else if(t==2) |
| 3137 RENAME(doVertDefFilter)(tempBlock1, 16, &c); | 3528 RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
| 3138 }else if(mode & H_A_DEBLOCK){ | 3529 }else if(mode & H_A_DEBLOCK){ |
| 3139 do_a_deblock(tempBlock1, 16, 1, &c); | 3530 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); |
| 3140 } | 3531 } |
| 3141 | 3532 |
| 3142 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); | 3533 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
| 3143 | 3534 |
| 3144 #else | 3535 #else |
| 3151 if(t==1) | 3542 if(t==1) |
| 3152 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); | 3543 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); |
| 3153 else if(t==2) | 3544 else if(t==2) |
| 3154 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); | 3545 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); |
| 3155 }else if(mode & H_A_DEBLOCK){ | 3546 }else if(mode & H_A_DEBLOCK){ |
| 3156 do_a_deblock(dstBlock-8, 1, stride, &c); | 3547 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); |
| 3157 } | 3548 } |
| 3158 #endif | 3549 #endif |
| 3159 if(mode & DERING) | 3550 if(mode & DERING) |
| 3160 { | 3551 { |
| 3161 //FIXME filter first line | 3552 //FIXME filter first line |
