Mercurial > libavcodec.hg
comparison mpegaudiodec.c @ 2489:2e5ae040e92b libavcodec
optimizing imdct36()
| author | michael |
|---|---|
| date | Tue, 01 Feb 2005 21:27:18 +0000 |
| parents | dfdb6bf4b90f |
| children | c2c642ad6ef4 |
comparison
equal
deleted
inserted
replaced
| 2488:8dd4672b9f74 | 2489:2e5ae040e92b |
|---|---|
| 66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) | 66 #define MUL64(a,b) ((int64_t)(a) * (int64_t)(b)) |
| 67 #define FIX(a) ((int)((a) * FRAC_ONE)) | 67 #define FIX(a) ((int)((a) * FRAC_ONE)) |
| 68 /* WARNING: only correct for posititive numbers */ | 68 /* WARNING: only correct for posititive numbers */ |
| 69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) | 69 #define FIXR(a) ((int)((a) * FRAC_ONE + 0.5)) |
| 70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS) | 70 #define FRAC_RND(a) (((a) + (FRAC_ONE/2)) >> FRAC_BITS) |
| 71 | |
| 72 #define FIXHR(a) ((int)((a) * (1LL<<32) + 0.5)) | |
| 73 //#define MULH(a,b) (((int64_t)(a) * (int64_t)(b))>>32) //gcc 3.4 creates an incredibly bloated mess out of this | |
| 74 static always_inline int MULH(int a, int b){ | |
| 75 return ((int64_t)(a) * (int64_t)(b))>>32; | |
| 76 } | |
| 71 | 77 |
| 72 #if FRAC_BITS <= 15 | 78 #if FRAC_BITS <= 15 |
| 73 typedef int16_t MPA_INT; | 79 typedef int16_t MPA_INT; |
| 74 #else | 80 #else |
| 75 typedef int32_t MPA_INT; | 81 typedef int32_t MPA_INT; |
| 491 csa_table_float[i][0] = cs; | 497 csa_table_float[i][0] = cs; |
| 492 csa_table_float[i][1] = ca; | 498 csa_table_float[i][1] = ca; |
| 493 csa_table_float[i][2] = ca + cs; | 499 csa_table_float[i][2] = ca + cs; |
| 494 csa_table_float[i][3] = ca - cs; | 500 csa_table_float[i][3] = ca - cs; |
| 495 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca)); | 501 // printf("%d %d %d %d\n", FIX(cs), FIX(cs-1), FIX(ca), FIX(cs)-FIX(ca)); |
| 502 // av_log(NULL, AV_LOG_DEBUG,"%f %f %f %f\n", cs, ca, ca+cs, ca-cs); | |
| 496 } | 503 } |
| 497 | 504 |
| 498 /* compute mdct windows */ | 505 /* compute mdct windows */ |
| 499 for(i=0;i<36;i++) { | 506 for(i=0;i<36;i++) { |
| 500 int v; | 507 for(j=0; j<4; j++){ |
| 501 v = FIXR(sin(M_PI * (i + 0.5) / 36.0)); | 508 double d; |
| 502 mdct_win[0][i] = v; | 509 if(j==2) continue; |
| 503 mdct_win[1][i] = v; | 510 |
| 504 mdct_win[3][i] = v; | 511 d= sin(M_PI * (i + 0.5) / 36.0); |
| 505 } | 512 if(j==1){ |
| 506 for(i=0;i<6;i++) { | 513 if (i>=30) d= 0; |
| 507 mdct_win[1][18 + i] = FIXR(1.0); | 514 else if(i>=24) d= sin(M_PI * (i - 18 + 0.5) / 12.0); |
| 508 mdct_win[1][24 + i] = FIXR(sin(M_PI * ((i + 6) + 0.5) / 12.0)); | 515 else if(i>=18) d= 1; |
| 509 mdct_win[1][30 + i] = FIXR(0.0); | 516 }else if(j==3){ |
| 510 | 517 if (i< 6) d= 0; |
| 511 mdct_win[3][i] = FIXR(0.0); | 518 else if(i< 12) d= sin(M_PI * (i - 6 + 0.5) / 12.0); |
| 512 mdct_win[3][6 + i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); | 519 else if(i< 18) d= 1; |
| 513 mdct_win[3][12 + i] = FIXR(1.0); | 520 } |
| 521 //merge last stage of imdct into the window coefficients | |
| 522 if (i/9 == 0) d*= 0.5 / cos(M_PI*(2*( i) +19)/72); | |
| 523 else if(i/9 == 1) d*= 0.5 / cos(M_PI*(2*(17 - i) +19)/72); | |
| 524 else if(i/9 == 2) d*= 0.5 / cos(M_PI*(2*( i) +19)/72); | |
| 525 else d*=-0.5 / cos(M_PI*(2*(17 - i) +19)/72); | |
| 526 mdct_win[j][i] = FIXHR((d / (1<<5))); | |
| 527 // av_log(NULL, AV_LOG_DEBUG, "%2d %d %f\n", i,j,d / (1<<5)); | |
| 528 } | |
| 514 } | 529 } |
| 515 | 530 |
| 516 for(i=0;i<12;i++) | 531 for(i=0;i<12;i++) |
| 517 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); | 532 mdct_win[2][i] = FIXR(sin(M_PI * (i + 0.5) / 12.0)); |
| 518 | 533 |
| 1009 #undef C7 | 1024 #undef C7 |
| 1010 #undef C9 | 1025 #undef C9 |
| 1011 #undef C11 | 1026 #undef C11 |
| 1012 | 1027 |
| 1013 /* cos(pi*i/18) */ | 1028 /* cos(pi*i/18) */ |
| 1014 #define C1 FIXR(0.98480775301220805936) | 1029 #define C1 FIXHR(0.98480775301220805936/2) |
| 1015 #define C2 FIXR(0.93969262078590838405) | 1030 #define C2 FIXHR(0.93969262078590838405/2) |
| 1016 #define C3 FIXR(0.86602540378443864676) | 1031 #define C3 FIXHR(0.86602540378443864676/2) |
| 1017 #define C4 FIXR(0.76604444311897803520) | 1032 #define C4 FIXHR(0.76604444311897803520/2) |
| 1018 #define C5 FIXR(0.64278760968653932632) | 1033 #define C5 FIXHR(0.64278760968653932632/2) |
| 1019 #define C6 FIXR(0.5) | 1034 #define C6 FIXHR(0.5/2) |
| 1020 #define C7 FIXR(0.34202014332566873304) | 1035 #define C7 FIXHR(0.34202014332566873304/2) |
| 1021 #define C8 FIXR(0.17364817766693034885) | 1036 #define C8 FIXHR(0.17364817766693034885/2) |
| 1037 | |
| 1022 | 1038 |
| 1023 /* 0.5 / cos(pi*(2*i+1)/36) */ | 1039 /* 0.5 / cos(pi*(2*i+1)/36) */ |
| 1024 static const int icos36[9] = { | 1040 static const int icos36[9] = { |
| 1025 FIXR(0.50190991877167369479), | 1041 FIXR(0.50190991877167369479), |
| 1026 FIXR(0.51763809020504152469), | 1042 FIXR(0.51763809020504152469), |
| 1030 FIXR(0.87172339781054900991), | 1046 FIXR(0.87172339781054900991), |
| 1031 FIXR(1.18310079157624925896), | 1047 FIXR(1.18310079157624925896), |
| 1032 FIXR(1.93185165257813657349), | 1048 FIXR(1.93185165257813657349), |
| 1033 FIXR(5.73685662283492756461), | 1049 FIXR(5.73685662283492756461), |
| 1034 }; | 1050 }; |
| 1035 | |
| 1036 static const int icos72[18] = { | |
| 1037 /* 0.5 / cos(pi*(2*i+19)/72) */ | |
| 1038 FIXR(0.74009361646113053152), | |
| 1039 FIXR(0.82133981585229078570), | |
| 1040 FIXR(0.93057949835178895673), | |
| 1041 FIXR(1.08284028510010010928), | |
| 1042 FIXR(1.30656296487637652785), | |
| 1043 FIXR(1.66275476171152078719), | |
| 1044 FIXR(2.31011315767264929558), | |
| 1045 FIXR(3.83064878777019433457), | |
| 1046 FIXR(11.46279281302667383546), | |
| 1047 | |
| 1048 /* 0.5 / cos(pi*(2*(i + 18) +19)/72) */ | |
| 1049 FIXR(-0.67817085245462840086), | |
| 1050 FIXR(-0.63023620700513223342), | |
| 1051 FIXR(-0.59284452371708034528), | |
| 1052 FIXR(-0.56369097343317117734), | |
| 1053 FIXR(-0.54119610014619698439), | |
| 1054 FIXR(-0.52426456257040533932), | |
| 1055 FIXR(-0.51213975715725461845), | |
| 1056 FIXR(-0.50431448029007636036), | |
| 1057 FIXR(-0.50047634258165998492), | |
| 1058 }; | |
| 1059 | |
| 1060 /* using Lee like decomposition followed by hand coded 9 points DCT */ | 1051 /* using Lee like decomposition followed by hand coded 9 points DCT */ |
| 1061 static void imdct36(int *out, int *in) | 1052 static void imdct36(int *out, int *buf, int *in, int *win) |
| 1062 { | 1053 { |
| 1063 int i, j, t0, t1, t2, t3, s0, s1, s2, s3; | 1054 int i, j, t0, t1, t2, t3, s0, s1, s2, s3; |
| 1064 int tmp[18], *tmp1, *in1; | 1055 int tmp[18], *tmp1, *in1; |
| 1065 int64_t in3_3, in6_6; | |
| 1066 | 1056 |
| 1067 for(i=17;i>=1;i--) | 1057 for(i=17;i>=1;i--) |
| 1068 in[i] += in[i-1]; | 1058 in[i] += in[i-1]; |
| 1069 for(i=17;i>=3;i-=2) | 1059 for(i=17;i>=3;i-=2) |
| 1070 in[i] += in[i-2]; | 1060 in[i] += in[i-2]; |
| 1071 | 1061 |
| 1072 for(j=0;j<2;j++) { | 1062 for(j=0;j<2;j++) { |
| 1073 tmp1 = tmp + j; | 1063 tmp1 = tmp + j; |
| 1074 in1 = in + j; | 1064 in1 = in + j; |
| 1075 | 1065 #if 0 |
| 1076 in3_3 = MUL64(in1[2*3], C3); | 1066 //more accurate but slower |
| 1077 in6_6 = MUL64(in1[2*6], C6); | 1067 int64_t t0, t1, t2, t3; |
| 1078 | 1068 t2 = in1[2*4] + in1[2*8] - in1[2*2]; |
| 1079 tmp1[0] = FRAC_RND(MUL64(in1[2*1], C1) + in3_3 + | 1069 |
| 1080 MUL64(in1[2*5], C5) + MUL64(in1[2*7], C7)); | 1070 t3 = (in1[2*0] + (int64_t)(in1[2*6]>>1))<<32; |
| 1081 tmp1[2] = in1[2*0] + FRAC_RND(MUL64(in1[2*2], C2) + | 1071 t1 = in1[2*0] - in1[2*6]; |
| 1082 MUL64(in1[2*4], C4) + in6_6 + | 1072 tmp1[ 6] = t1 - (t2>>1); |
| 1083 MUL64(in1[2*8], C8)); | 1073 tmp1[16] = t1 + t2; |
| 1084 tmp1[4] = FRAC_RND(MUL64(in1[2*1] - in1[2*5] - in1[2*7], C3)); | 1074 |
| 1085 tmp1[6] = FRAC_RND(MUL64(in1[2*2] - in1[2*4] - in1[2*8], C6)) - | 1075 t0 = MUL64(2*(in1[2*2] + in1[2*4]), C2); |
| 1086 in1[2*6] + in1[2*0]; | 1076 t1 = MUL64( in1[2*4] - in1[2*8] , -2*C8); |
| 1087 tmp1[8] = FRAC_RND(MUL64(in1[2*1], C5) - in3_3 - | 1077 t2 = MUL64(2*(in1[2*2] + in1[2*8]), -C4); |
| 1088 MUL64(in1[2*5], C7) + MUL64(in1[2*7], C1)); | 1078 |
| 1089 tmp1[10] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C8) - | 1079 tmp1[10] = (t3 - t0 - t2) >> 32; |
| 1090 MUL64(in1[2*4], C2) + in6_6 + | 1080 tmp1[ 2] = (t3 + t0 + t1) >> 32; |
| 1091 MUL64(in1[2*8], C4)); | 1081 tmp1[14] = (t3 + t2 - t1) >> 32; |
| 1092 tmp1[12] = FRAC_RND(MUL64(in1[2*1], C7) - in3_3 + | 1082 |
| 1093 MUL64(in1[2*5], C1) - | 1083 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3); |
| 1094 MUL64(in1[2*7], C5)); | 1084 t2 = MUL64(2*(in1[2*1] + in1[2*5]), C1); |
| 1095 tmp1[14] = in1[2*0] + FRAC_RND(MUL64(-in1[2*2], C4) + | 1085 t3 = MUL64( in1[2*5] - in1[2*7] , -2*C7); |
| 1096 MUL64(in1[2*4], C8) + in6_6 - | 1086 t0 = MUL64(2*in1[2*3], C3); |
| 1097 MUL64(in1[2*8], C2)); | 1087 |
| 1098 tmp1[16] = in1[2*0] - in1[2*2] + in1[2*4] - in1[2*6] + in1[2*8]; | 1088 t1 = MUL64(2*(in1[2*1] + in1[2*7]), -C5); |
| 1089 | |
| 1090 tmp1[ 0] = (t2 + t3 + t0) >> 32; | |
| 1091 tmp1[12] = (t2 + t1 - t0) >> 32; | |
| 1092 tmp1[ 8] = (t3 - t1 - t0) >> 32; | |
| 1093 #else | |
| 1094 t2 = in1[2*4] + in1[2*8] - in1[2*2]; | |
| 1095 | |
| 1096 t3 = in1[2*0] + (in1[2*6]>>1); | |
| 1097 t1 = in1[2*0] - in1[2*6]; | |
| 1098 tmp1[ 6] = t1 - (t2>>1); | |
| 1099 tmp1[16] = t1 + t2; | |
| 1100 | |
| 1101 t0 = MULH(2*(in1[2*2] + in1[2*4]), C2); | |
| 1102 t1 = MULH( in1[2*4] - in1[2*8] , -2*C8); | |
| 1103 t2 = MULH(2*(in1[2*2] + in1[2*8]), -C4); | |
| 1104 | |
| 1105 tmp1[10] = t3 - t0 - t2; | |
| 1106 tmp1[ 2] = t3 + t0 + t1; | |
| 1107 tmp1[14] = t3 + t2 - t1; | |
| 1108 | |
| 1109 tmp1[ 4] = MULH(2*(in1[2*5] + in1[2*7] - in1[2*1]), -C3); | |
| 1110 t2 = MULH(2*(in1[2*1] + in1[2*5]), C1); | |
| 1111 t3 = MULH( in1[2*5] - in1[2*7] , -2*C7); | |
| 1112 t0 = MULH(2*in1[2*3], C3); | |
| 1113 | |
| 1114 t1 = MULH(2*(in1[2*1] + in1[2*7]), -C5); | |
| 1115 | |
| 1116 tmp1[ 0] = t2 + t3 + t0; | |
| 1117 tmp1[12] = t2 + t1 - t0; | |
| 1118 tmp1[ 8] = t3 - t1 - t0; | |
| 1119 #endif | |
| 1099 } | 1120 } |
| 1100 | 1121 |
| 1101 i = 0; | 1122 i = 0; |
| 1102 for(j=0;j<4;j++) { | 1123 for(j=0;j<4;j++) { |
| 1103 t0 = tmp[i]; | 1124 t0 = tmp[i]; |
| 1108 t2 = tmp[i + 1]; | 1129 t2 = tmp[i + 1]; |
| 1109 t3 = tmp[i + 3]; | 1130 t3 = tmp[i + 3]; |
| 1110 s1 = MULL(t3 + t2, icos36[j]); | 1131 s1 = MULL(t3 + t2, icos36[j]); |
| 1111 s3 = MULL(t3 - t2, icos36[8 - j]); | 1132 s3 = MULL(t3 - t2, icos36[8 - j]); |
| 1112 | 1133 |
| 1113 t0 = MULL(s0 + s1, icos72[9 + 8 - j]); | 1134 t0 = (s0 + s1) << 5; |
| 1114 t1 = MULL(s0 - s1, icos72[8 - j]); | 1135 t1 = (s0 - s1) << 5; |
| 1115 out[18 + 9 + j] = t0; | 1136 out[(9 + j)*SBLIMIT] = -MULH(t1, win[9 + j]) + buf[9 + j]; |
| 1116 out[18 + 8 - j] = t0; | 1137 out[(8 - j)*SBLIMIT] = MULH(t1, win[8 - j]) + buf[8 - j]; |
| 1117 out[9 + j] = -t1; | 1138 buf[9 + j] = MULH(t0, win[18 + 9 + j]); |
| 1118 out[8 - j] = t1; | 1139 buf[8 - j] = MULH(t0, win[18 + 8 - j]); |
| 1119 | 1140 |
| 1120 t0 = MULL(s2 + s3, icos72[9+j]); | 1141 t0 = (s2 + s3) << 5; |
| 1121 t1 = MULL(s2 - s3, icos72[j]); | 1142 t1 = (s2 - s3) << 5; |
| 1122 out[18 + 9 + (8 - j)] = t0; | 1143 out[(9 + 8 - j)*SBLIMIT] = -MULH(t1, win[9 + 8 - j]) + buf[9 + 8 - j]; |
| 1123 out[18 + j] = t0; | 1144 out[( j)*SBLIMIT] = MULH(t1, win[ j]) + buf[ j]; |
| 1124 out[9 + (8 - j)] = -t1; | 1145 buf[9 + 8 - j] = MULH(t0, win[18 + 9 + 8 - j]); |
| 1125 out[j] = t1; | 1146 buf[ + j] = MULH(t0, win[18 + j]); |
| 1126 i += 4; | 1147 i += 4; |
| 1127 } | 1148 } |
| 1128 | 1149 |
| 1129 s0 = tmp[16]; | 1150 s0 = tmp[16]; |
| 1130 s1 = MULL(tmp[17], icos36[4]); | 1151 s1 = MULL(tmp[17], icos36[4]); |
| 1131 t0 = MULL(s0 + s1, icos72[9 + 4]); | 1152 t0 = (s0 + s1) << 5; |
| 1132 t1 = MULL(s0 - s1, icos72[4]); | 1153 t1 = (s0 - s1) << 5; |
| 1133 out[18 + 9 + 4] = t0; | 1154 out[(9 + 4)*SBLIMIT] = -MULH(t1, win[9 + 4]) + buf[9 + 4]; |
| 1134 out[18 + 8 - 4] = t0; | 1155 out[(8 - 4)*SBLIMIT] = MULH(t1, win[8 - 4]) + buf[8 - 4]; |
| 1135 out[9 + 4] = -t1; | 1156 buf[9 + 4] = MULH(t0, win[18 + 9 + 4]); |
| 1136 out[8 - 4] = t1; | 1157 buf[8 - 4] = MULH(t0, win[18 + 8 - 4]); |
| 1137 } | 1158 } |
| 1138 | 1159 |
| 1139 /* header decoding. MUST check the header before because no | 1160 /* header decoding. MUST check the header before because no |
| 1140 consistency check is done there. Return 1 if free format found and | 1161 consistency check is done there. Return 1 if free format found and |
| 1141 that the frame size must be computed externally */ | 1162 that the frame size must be computed externally */ |
| 2062 } | 2083 } |
| 2063 | 2084 |
| 2064 buf = mdct_buf; | 2085 buf = mdct_buf; |
| 2065 ptr = g->sb_hybrid; | 2086 ptr = g->sb_hybrid; |
| 2066 for(j=0;j<mdct_long_end;j++) { | 2087 for(j=0;j<mdct_long_end;j++) { |
| 2067 imdct36(out, ptr); | |
| 2068 /* apply window & overlap with previous buffer */ | 2088 /* apply window & overlap with previous buffer */ |
| 2069 out_ptr = sb_samples + j; | 2089 out_ptr = sb_samples + j; |
| 2070 /* select window */ | 2090 /* select window */ |
| 2071 if (g->switch_point && j < 2) | 2091 if (g->switch_point && j < 2) |
| 2072 win1 = mdct_win[0]; | 2092 win1 = mdct_win[0]; |
| 2073 else | 2093 else |
| 2074 win1 = mdct_win[g->block_type]; | 2094 win1 = mdct_win[g->block_type]; |
| 2075 /* select frequency inversion */ | 2095 /* select frequency inversion */ |
| 2076 win = win1 + ((4 * 36) & -(j & 1)); | 2096 win = win1 + ((4 * 36) & -(j & 1)); |
| 2077 for(i=0;i<18;i++) { | 2097 imdct36(out_ptr, buf, ptr, win); |
| 2078 *out_ptr = MULL(out[i], win[i]) + buf[i]; | 2098 out_ptr += 18*SBLIMIT; |
| 2079 buf[i] = MULL(out[i + 18], win[i + 18]); | |
| 2080 out_ptr += SBLIMIT; | |
| 2081 } | |
| 2082 ptr += 18; | 2099 ptr += 18; |
| 2083 buf += 18; | 2100 buf += 18; |
| 2084 } | 2101 } |
| 2085 for(j=mdct_long_end;j<sblimit;j++) { | 2102 for(j=mdct_long_end;j<sblimit;j++) { |
| 2086 for(i=0;i<6;i++) { | 2103 for(i=0;i<6;i++) { |
