Mercurial > libavcodec.hg
comparison libpostproc/postprocess_template.c @ 2040:5de466b3360e libavcodec
per line lowpass filter in mmx
| author | michael |
|---|---|
| date | Fri, 28 May 2004 13:23:53 +0000 |
| parents | f25e485a7850 |
| children | b996fbe0a7e7 |
Legend for the comparison below:
- equal
- deleted
- inserted
- replaced
| 2039:f25e485a7850 | 2040:5de466b3360e |
|---|---|
| 2615 #ifdef HAVE_MMX | 2615 #ifdef HAVE_MMX |
| 2616 /** | 2616 /** |
| 2617 * accurate deblock filter | 2617 * accurate deblock filter |
| 2618 */ | 2618 */ |
| 2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ | 2619 static always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
| 2620 int y; | |
| 2621 const int QP= c->QP; | |
| 2622 int64_t dc_mask, eq_mask; | 2620 int64_t dc_mask, eq_mask; |
| 2621 int64_t sums[10*8*2]; | |
| 2623 src+= step*3; // src points to begin of the 8x8 Block | 2622 src+= step*3; // src points to begin of the 8x8 Block |
| 2624 //START_TIMER | 2623 //START_TIMER |
| 2625 asm volatile( | 2624 asm volatile( |
| 2626 "movq %0, %%mm7 \n\t" | 2625 "movq %0, %%mm7 \n\t" |
| 2627 "movq %1, %%mm6 \n\t" | 2626 "movq %1, %%mm6 \n\t" |
| 2723 : "=m" (eq_mask), "=m" (dc_mask) | 2722 : "=m" (eq_mask), "=m" (dc_mask) |
| 2724 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) | 2723 : "r" (src), "r" (step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) |
| 2725 : "%eax" | 2724 : "%eax" |
| 2726 ); | 2725 ); |
| 2727 | 2726 |
| 2728 src+= step; // src points to begin of the 8x8 Block | 2727 if(dc_mask & eq_mask){ |
| 2728 int offset= -8*step; | |
| 2729 int64_t *temp_sums= sums; | |
| 2730 | |
| 2731 asm volatile( | |
| 2732 "movq %2, %%mm0 \n\t" // QP,..., QP | |
| 2733 "pxor %%mm4, %%mm4 \n\t" | |
| 2734 | |
| 2735 "movq (%0), %%mm6 \n\t" | |
| 2736 "movq (%0, %1), %%mm5 \n\t" | |
| 2737 "movq %%mm5, %%mm1 \n\t" | |
| 2738 "movq %%mm6, %%mm2 \n\t" | |
| 2739 "psubusb %%mm6, %%mm5 \n\t" | |
| 2740 "psubusb %%mm1, %%mm2 \n\t" | |
| 2741 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 2742 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
| 2743 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
| 2744 | |
| 2745 "pxor %%mm6, %%mm1 \n\t" | |
| 2746 "pand %%mm0, %%mm1 \n\t" | |
| 2747 "pxor %%mm1, %%mm6 \n\t" | |
| 2748 // 0:QP 6:First | |
| 2749 | |
| 2750 "movq (%0, %1, 8), %%mm5 \n\t" | |
| 2751 "addl %1, %0 \n\t" // %0 points to line 1 not 0 | |
| 2752 "movq (%0, %1, 8), %%mm7 \n\t" | |
| 2753 "movq %%mm5, %%mm1 \n\t" | |
| 2754 "movq %%mm7, %%mm2 \n\t" | |
| 2755 "psubusb %%mm7, %%mm5 \n\t" | |
| 2756 "psubusb %%mm1, %%mm2 \n\t" | |
| 2757 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines | |
| 2758 "movq %2, %%mm0 \n\t" // QP,..., QP | |
| 2759 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 | |
| 2760 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF | |
| 2761 | |
| 2762 "pxor %%mm7, %%mm1 \n\t" | |
| 2763 "pand %%mm0, %%mm1 \n\t" | |
| 2764 "pxor %%mm1, %%mm7 \n\t" | |
| 2765 | |
| 2766 "movq %%mm6, %%mm5 \n\t" | |
| 2767 "punpckhbw %%mm4, %%mm6 \n\t" | |
| 2768 "punpcklbw %%mm4, %%mm5 \n\t" | |
| 2769 // 4:0 5/6:First 7:Last | |
| 2770 | |
| 2771 "movq %%mm5, %%mm0 \n\t" | |
| 2772 "movq %%mm6, %%mm1 \n\t" | |
| 2773 "psllw $2, %%mm0 \n\t" | |
| 2774 "psllw $2, %%mm1 \n\t" | |
| 2775 "paddw "MANGLE(w04)", %%mm0 \n\t" | |
| 2776 "paddw "MANGLE(w04)", %%mm1 \n\t" | |
| 2777 | |
| 2778 #define NEXT\ | |
| 2779 "movq (%0), %%mm2 \n\t"\ | |
| 2780 "movq (%0), %%mm3 \n\t"\ | |
| 2781 "addl %1, %0 \n\t"\ | |
| 2782 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
| 2783 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
| 2784 "paddw %%mm2, %%mm0 \n\t"\ | |
| 2785 "paddw %%mm3, %%mm1 \n\t" | |
| 2786 | |
| 2787 #define PREV\ | |
| 2788 "movq (%0), %%mm2 \n\t"\ | |
| 2789 "movq (%0), %%mm3 \n\t"\ | |
| 2790 "addl %1, %0 \n\t"\ | |
| 2791 "punpcklbw %%mm4, %%mm2 \n\t"\ | |
| 2792 "punpckhbw %%mm4, %%mm3 \n\t"\ | |
| 2793 "psubw %%mm2, %%mm0 \n\t"\ | |
| 2794 "psubw %%mm3, %%mm1 \n\t" | |
| 2795 | |
| 2796 | |
| 2797 NEXT //0 | |
| 2798 NEXT //1 | |
| 2799 NEXT //2 | |
| 2800 "movq %%mm0, (%3) \n\t" | |
| 2801 "movq %%mm1, 8(%3) \n\t" | |
| 2802 | |
| 2803 NEXT //3 | |
| 2804 "psubw %%mm5, %%mm0 \n\t" | |
| 2805 "psubw %%mm6, %%mm1 \n\t" | |
| 2806 "movq %%mm0, 16(%3) \n\t" | |
| 2807 "movq %%mm1, 24(%3) \n\t" | |
| 2808 | |
| 2809 NEXT //4 | |
| 2810 "psubw %%mm5, %%mm0 \n\t" | |
| 2811 "psubw %%mm6, %%mm1 \n\t" | |
| 2812 "movq %%mm0, 32(%3) \n\t" | |
| 2813 "movq %%mm1, 40(%3) \n\t" | |
| 2814 | |
| 2815 NEXT //5 | |
| 2816 "psubw %%mm5, %%mm0 \n\t" | |
| 2817 "psubw %%mm6, %%mm1 \n\t" | |
| 2818 "movq %%mm0, 48(%3) \n\t" | |
| 2819 "movq %%mm1, 56(%3) \n\t" | |
| 2820 | |
| 2821 NEXT //6 | |
| 2822 "psubw %%mm5, %%mm0 \n\t" | |
| 2823 "psubw %%mm6, %%mm1 \n\t" | |
| 2824 "movq %%mm0, 64(%3) \n\t" | |
| 2825 "movq %%mm1, 72(%3) \n\t" | |
| 2826 | |
| 2827 "movq %%mm7, %%mm6 \n\t" | |
| 2828 "punpckhbw %%mm4, %%mm7 \n\t" | |
| 2829 "punpcklbw %%mm4, %%mm6 \n\t" | |
| 2830 | |
| 2831 NEXT //7 | |
| 2832 "movl %4, %0 \n\t" | |
| 2833 "addl %1, %0 \n\t" | |
| 2834 PREV //0 | |
| 2835 "movq %%mm0, 80(%3) \n\t" | |
| 2836 "movq %%mm1, 88(%3) \n\t" | |
| 2837 | |
| 2838 PREV //1 | |
| 2839 "paddw %%mm6, %%mm0 \n\t" | |
| 2840 "paddw %%mm7, %%mm1 \n\t" | |
| 2841 "movq %%mm0, 96(%3) \n\t" | |
| 2842 "movq %%mm1, 104(%3) \n\t" | |
| 2843 | |
| 2844 PREV //2 | |
| 2845 "paddw %%mm6, %%mm0 \n\t" | |
| 2846 "paddw %%mm7, %%mm1 \n\t" | |
| 2847 "movq %%mm0, 112(%3) \n\t" | |
| 2848 "movq %%mm1, 120(%3) \n\t" | |
| 2849 | |
| 2850 PREV //3 | |
| 2851 "paddw %%mm6, %%mm0 \n\t" | |
| 2852 "paddw %%mm7, %%mm1 \n\t" | |
| 2853 "movq %%mm0, 128(%3) \n\t" | |
| 2854 "movq %%mm1, 136(%3) \n\t" | |
| 2855 | |
| 2856 PREV //4 | |
| 2857 "paddw %%mm6, %%mm0 \n\t" | |
| 2858 "paddw %%mm7, %%mm1 \n\t" | |
| 2859 "movq %%mm0, 144(%3) \n\t" | |
| 2860 "movq %%mm1, 152(%3) \n\t" | |
| 2861 | |
| 2862 "movl %4, %0 \n\t" //FIXME | |
| 2863 | |
| 2864 : "+&r"(src) | |
| 2865 : "r" (step), "m" (c->pQPb), "r"(sums), "g"(src) | |
| 2866 ); | |
| 2867 | |
| 2868 src+= step; // src points to begin of the 8x8 Block | |
| 2869 | |
| 2870 asm volatile( | |
| 2871 "movq %4, %%mm6 \n\t" | |
| 2872 "pcmpeqb %%mm5, %%mm5 \n\t" | |
| 2873 "pxor %%mm6, %%mm5 \n\t" | |
| 2874 "pxor %%mm7, %%mm7 \n\t" | |
| 2875 | |
| 2876 "1: \n\t" | |
| 2877 "movq (%1), %%mm0 \n\t" | |
| 2878 "movq 8(%1), %%mm1 \n\t" | |
| 2879 "paddw 32(%1), %%mm0 \n\t" | |
| 2880 "paddw 40(%1), %%mm1 \n\t" | |
| 2881 "movq (%0, %3), %%mm2 \n\t" | |
| 2882 "movq %%mm2, %%mm3 \n\t" | |
| 2883 "movq %%mm2, %%mm4 \n\t" | |
| 2884 "punpcklbw %%mm7, %%mm2 \n\t" | |
| 2885 "punpckhbw %%mm7, %%mm3 \n\t" | |
| 2886 "paddw %%mm2, %%mm0 \n\t" | |
| 2887 "paddw %%mm3, %%mm1 \n\t" | |
| 2888 "paddw %%mm2, %%mm0 \n\t" | |
| 2889 "paddw %%mm3, %%mm1 \n\t" | |
| 2890 "psrlw $4, %%mm0 \n\t" | |
| 2891 "psrlw $4, %%mm1 \n\t" | |
| 2892 "packuswb %%mm1, %%mm0 \n\t" | |
| 2893 "pand %%mm6, %%mm0 \n\t" | |
| 2894 "pand %%mm5, %%mm4 \n\t" | |
| 2895 "por %%mm4, %%mm0 \n\t" | |
| 2896 "movq %%mm0, (%0, %3) \n\t" | |
| 2897 "addl $16, %1 \n\t" | |
| 2898 "addl %2, %0 \n\t" | |
| 2899 " js 1b \n\t" | |
| 2900 | |
| 2901 : "+r"(offset), "+r"(temp_sums) | |
| 2902 : "r" (step), "r"(src - offset), "m"(dc_mask & eq_mask) | |
| 2903 ); | |
| 2904 }else | |
| 2905 src+= step; // src points to begin of the 8x8 Block | |
| 2729 | 2906 |
| 2730 if(eq_mask != -1LL){ | 2907 if(eq_mask != -1LL){ |
| 2908 uint8_t *temp_src= src; | |
| 2731 asm volatile( | 2909 asm volatile( |
| 2732 "pxor %%mm7, %%mm7 \n\t" | 2910 "pxor %%mm7, %%mm7 \n\t" |
| 2733 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars | 2911 "leal -40(%%esp), %%ecx \n\t" // make space for 4 8-byte vars |
| 2734 "andl $0xFFFFFFF8, %%ecx \n\t" // align | 2912 "andl $0xFFFFFFF8, %%ecx \n\t" // align |
| 2735 // 0 1 2 3 4 5 6 7 8 9 | 2913 // 0 1 2 3 4 5 6 7 8 9 |
| 2953 "movq %%mm0, (%0) \n\t" | 3131 "movq %%mm0, (%0) \n\t" |
| 2954 "movq (%0, %1), %%mm0 \n\t" | 3132 "movq (%0, %1), %%mm0 \n\t" |
| 2955 "psubb %%mm1, %%mm0 \n\t" | 3133 "psubb %%mm1, %%mm0 \n\t" |
| 2956 "movq %%mm0, (%0, %1) \n\t" | 3134 "movq %%mm0, (%0, %1) \n\t" |
| 2957 | 3135 |
| 2958 : "+r" (src) | 3136 : "+r" (temp_src) |
| 2959 : "r" (step), "m" (c->pQPb), "m"(eq_mask) | 3137 : "r" (step), "m" (c->pQPb), "m"(eq_mask) |
| 2960 : "%eax", "%ecx" | 3138 : "%eax", "%ecx" |
| 2961 ); | 3139 ); |
| 2962 src-= 3*step; //reverse src change from asm | |
| 2963 } | |
| 2964 | |
| 2965 for(y=0; y<8; y++){ | |
| 2966 if((eq_mask>>(y*8))&1){ | |
| 2967 if((dc_mask>>(y*8))&1){ | |
| 2968 const int first= ABS(src[-1*step] - src[0]) < QP ? src[-1*step] : src[0]; | |
| 2969 const int last= ABS(src[8*step] - src[7*step]) < QP ? src[8*step] : src[7*step]; | |
| 2970 | |
| 2971 int sums[10]; | |
| 2972 sums[0] = 4*first + src[0*step] + src[1*step] + src[2*step] + 4; | |
| 2973 sums[1] = sums[0] - first + src[3*step]; | |
| 2974 sums[2] = sums[1] - first + src[4*step]; | |
| 2975 sums[3] = sums[2] - first + src[5*step]; | |
| 2976 sums[4] = sums[3] - first + src[6*step]; | |
| 2977 sums[5] = sums[4] - src[0*step] + src[7*step]; | |
| 2978 sums[6] = sums[5] - src[1*step] + last; | |
| 2979 sums[7] = sums[6] - src[2*step] + last; | |
| 2980 sums[8] = sums[7] - src[3*step] + last; | |
| 2981 sums[9] = sums[8] - src[4*step] + last; | |
| 2982 | |
| 2983 src[0*step]= (sums[0] + sums[2] + 2*src[0*step])>>4; | |
| 2984 src[1*step]= (sums[1] + sums[3] + 2*src[1*step])>>4; | |
| 2985 src[2*step]= (sums[2] + sums[4] + 2*src[2*step])>>4; | |
| 2986 src[3*step]= (sums[3] + sums[5] + 2*src[3*step])>>4; | |
| 2987 src[4*step]= (sums[4] + sums[6] + 2*src[4*step])>>4; | |
| 2988 src[5*step]= (sums[5] + sums[7] + 2*src[5*step])>>4; | |
| 2989 src[6*step]= (sums[6] + sums[8] + 2*src[6*step])>>4; | |
| 2990 src[7*step]= (sums[7] + sums[9] + 2*src[7*step])>>4; | |
| 2991 } | |
| 2992 } | |
| 2993 | |
| 2994 src += stride; | |
| 2995 } | 3140 } |
| 2996 /*if(step==16){ | 3141 /*if(step==16){ |
| 2997 STOP_TIMER("step16") | 3142 STOP_TIMER("step16") |
| 2998 }else{ | 3143 }else{ |
| 2999 STOP_TIMER("stepX") | 3144 STOP_TIMER("stepX") |
