Mercurial > libguess
annotate russian_impl.c @ 2:754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
- new UCS-2LE/BE DFAs
- now arabic_impl.c uses arabic DFAs
- dfa common macros have been moved to dfa.h
- minor cleanups
| author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
|---|---|
| date | Wed, 11 Jun 2008 00:11:30 +0900 |
| parents | d9b6ff839eab |
| children |
| rev | line source |
|---|---|
| 0 | 1 /* |
| 2 * This code is derivative of librcd. | |
| 3 * No copyright notice was found. | |
| 4 */ | |
| 5 | |
| 6 #include <stdio.h> | |
| 7 #include <string.h> | |
| 8 | |
| 9 #include "libguess.h" | |
| 10 | |
/* Step value used by is_win_charset2() when a byte pair is absent from an
 * encoding's table ("not found"). Parenthesized so the minus sign cannot
 * merge with a neighbouring operator at the expansion site. */
#define NF_VALUE (-2)

/* Generic helpers. Every argument and the whole expansion are parenthesized
 * so that low-precedence operators in the arguments bind as the caller
 * expects. NOTE: max/min evaluate their arguments more than once; do not pass
 * expressions with side effects. */
#define max(a,b) (((a)>(b))?(a):(b))
#define min(a,b) (((a)<(b))?(a):(b))
#define bit(i) (1<<(i))
| 15 | |
/*
 * Statistics record for one two-byte sequence.
 *
 * The field order is significant: tables pulled in from russian_tab.c may
 * initialise these records positionally (TODO confirm against that file).
 */
struct lng_stat2 {
    unsigned char a;   /* first byte of the pair */
    unsigned char b;   /* second byte of the pair */
    double rate;       /* value used when the pair is neither at a word start nor at a word end */
    double srate;      /* value used when the pair begins a word */
    double erate;      /* value used when the pair ends a word */
};

typedef struct lng_stat2 lng_stat2;
| 23 | |
| 24 #include "russian_tab.c" | |
| 25 | |
| 26 | |
/*
 * Return 1 when ch is one of the characters treated as terminating a word
 * (line breaks, NUL, blanks, common punctuation, closing quote/parenthesis),
 * 0 otherwise.
 */
static int end_symbol(char ch) {
    switch (ch) {
    case '\r':
    case '\n':
    case 0:
    case ' ':
    case '\t':
    case ',':
    case '.':
    case '!':
    case '?':
    case ';':
    case '-':
    case ':':
    case '"':
    case '\'':
    case ')':
        return 1;
    default:
        return 0;
    }
}
| 31 | |
/*
 * Return 1 when ch is one of the characters treated as preceding the start of
 * a word (tab, line breaks, space, opening parenthesis, quotes), 0 otherwise.
 */
static int start_symbol(char ch) {
    switch (ch) {
    case '\t':
    case '\r':
    case '\n':
    case ' ':
    case '(':
    case '"':
    case '\'':
        return 1;
    default:
        return 0;
    }
}
| 0 | 36 |
| 37 typedef const struct lng_stat2 *lng_stat2_ptr; | |
| 38 | |
| 39 static void bfind(const unsigned char *a, lng_stat2_ptr *w, lng_stat2_ptr *k, lng_stat2_ptr *al) { | |
| 40 const struct lng_stat2 *winptr, *koiptr,*altptr; | |
| 41 int ki,wi,ai,d,ws=0,ks=0,as=0; | |
| 42 d=npow2>>1; | |
| 43 wi=d; | |
| 44 ki=d; | |
| 45 ai=d; | |
| 46 winptr=0; | |
| 47 koiptr=0; | |
| 48 altptr=0; | |
| 49 do{ | |
| 50 d>>=1; | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
51 |
| 0 | 52 if(!ws){ |
| 53 if (wi>indexes2) wi-=d; | |
| 54 else { | |
| 55 winptr=enc_win+wi-1; | |
| 56 if(a[0]==winptr->a){ | |
| 57 if(a[1]==winptr->b){ | |
| 58 ws=1; | |
| 59 }else if(a[1]<winptr->b){ | |
| 60 wi-=d; | |
| 61 }else{ //b>win[wi].b | |
| 62 wi+=d; | |
| 63 } | |
| 64 }else if(a[0]<winptr->a){ | |
| 65 wi-=d; | |
| 66 }else{ //a>win[wi].a | |
| 67 wi+=d; | |
| 68 } | |
| 69 } | |
| 70 } | |
| 71 if(!ks){ | |
| 72 if (ki>indexes2) ki-=d; | |
| 73 else { | |
| 74 koiptr=enc_koi+ki-1; | |
| 75 if(a[0]==koiptr->a){ | |
| 76 if(a[1]==koiptr->b){ | |
| 77 ks=1; | |
| 78 }else if(a[1]<koiptr->b){ | |
| 79 ki-=d; | |
| 80 }else{ //b>win[wi].b | |
| 81 ki+=d; | |
| 82 } | |
| 83 }else if(a[0]<koiptr->a){ | |
| 84 ki-=d; | |
| 85 }else{ //a>win[wi].a | |
| 86 ki+=d; | |
| 87 } | |
| 88 } | |
| 89 } | |
| 90 if(!as){ | |
| 91 if (ai>indexes2) ai-=d; | |
| 92 else { | |
| 93 altptr=enc_alt+ai-1; | |
| 94 if(a[0]==altptr->a){ | |
| 95 if(a[1]==altptr->b){ | |
| 96 as=1; | |
| 97 }else if(a[1]<altptr->b){ | |
| 98 ai-=d; | |
| 99 }else{ //b>win[wi].b | |
| 100 ai+=d; | |
| 101 } | |
| 102 }else if(a[0]<altptr->a){ | |
| 103 ai-=d; | |
| 104 }else{ //a>win[wi].a | |
| 105 ai+=d; | |
| 106 } | |
| 107 } | |
| 108 } | |
| 109 }while(d); | |
| 110 if (ws) *w=winptr; | |
| 111 else *w=NULL; | |
| 112 if (ks) *k=koiptr; | |
| 113 else *k=NULL; | |
| 114 if (as) *al=altptr; | |
| 115 else *al=NULL; | |
| 116 } | |
| 117 | |
/*
 * Combine the word-start (s), mid-word (m) and word-end (e) totals of one
 * encoding into a single score: their plain sum, accumulated left to right.
 */
static double calculate(double s, double m, double e) {
    double total = s + m;

    total += e;
    return total;
}
| 121 | |
| 122 static const char *is_win_charset2(const unsigned char *txt, int len){ | |
| 123 const struct lng_stat2 *winptr, *koiptr,*altptr; | |
| 124 double winstep,koistep,altstep,winestep,koiestep,altestep,winsstep,koisstep,altsstep; | |
| 125 double winstat=0,koistat=0,altstat=0,winestat=0,koiestat=0,altestat=0,winsstat=0,koisstat=0,altsstat=0; | |
| 126 long j; | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
127 |
| 0 | 128 #ifdef _AUTO_DEBUG |
| 129 fprintf(stderr,"Word: %s\n",txt); | |
| 130 #endif | |
| 131 for(j=0;j<len-1;j++){ | |
| 132 //skip bottom half of table | |
| 133 if(txt[j]<128 || txt[j+1]<128) continue; | |
| 134 #ifdef _AUTO_DEBUG | |
| 135 fprintf(stderr,"Pair: %c%c",txt[j],txt[j+1]); | |
| 136 #endif | |
| 137 bfind(txt+j,&winptr,&koiptr,&altptr); | |
| 138 | |
| 139 if ((j==0)||(start_symbol(txt[j-1]))) { | |
| 140 if (winptr) winsstep=winptr->srate; | |
| 141 else winsstep=NF_VALUE; | |
| 142 if (koiptr) koisstep=koiptr->srate; | |
| 143 else koisstep=NF_VALUE; | |
| 144 if (altptr) altsstep=altptr->srate; | |
| 145 else altsstep=NF_VALUE; | |
| 146 winestep=0; | |
| 147 koiestep=0; | |
| 148 altestep=0; | |
| 149 winstep=0; | |
| 150 koistep=0; | |
| 151 altstep=0; | |
| 152 #ifdef _AUTO_DEBUG | |
| 153 fprintf(stderr,", Win %lf, Koi %lf, Alt: %lf\n",winsstep,koisstep,altsstep); | |
| 154 #endif | |
| 155 } else if ((j==len-2)||(end_symbol(txt[j+2]))) { | |
| 156 if (winptr) winestep=winptr->erate; | |
| 157 else winestep=NF_VALUE; | |
| 158 if (koiptr) koiestep=koiptr->erate; | |
| 159 else koiestep=NF_VALUE; | |
| 160 if (altptr) altestep=altptr->erate; | |
| 161 else altestep=NF_VALUE; | |
| 162 winsstep=0; | |
| 163 koisstep=0; | |
| 164 altsstep=0; | |
| 165 winstep=0; | |
| 166 koistep=0; | |
| 167 altstep=0; | |
| 168 #ifdef _AUTO_DEBUG | |
| 169 fprintf(stderr,", Win %lf, Koi %lf, Alt %lf\n",winestep,koiestep,altestep); | |
| 170 #endif | |
| 171 } else { | |
| 172 if (winptr) winstep=winptr->rate; | |
| 173 else winstep=NF_VALUE; | |
| 174 if (koiptr) koistep=koiptr->rate; | |
| 175 else koistep=NF_VALUE; | |
| 176 if (altptr) altstep=altptr->rate; | |
| 177 else altstep=NF_VALUE; | |
| 178 winsstep=0; | |
| 179 winestep=0; | |
| 180 koisstep=0; | |
| 181 koiestep=0; | |
| 182 altsstep=0; | |
| 183 altestep=0; | |
| 184 #ifdef _AUTO_DEBUG | |
| 185 fprintf(stderr,", Win %lf, Koi %lf, Alt %lf\n",winstep,koistep,altstep); | |
| 186 #endif | |
| 187 } | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
188 |
| 0 | 189 winstat+=winstep; |
| 190 koistat+=koistep; | |
| 191 altstat+=altstep; | |
| 192 winsstat+=winsstep; | |
| 193 koisstat+=koisstep; | |
| 194 altsstat+=altsstep; | |
| 195 winestat+=winestep; | |
| 196 koiestat+=koiestep; | |
| 197 altestat+=altestep; | |
| 198 } | |
| 199 | |
| 200 #ifdef _AUTO_DEBUG | |
| 201 fprintf(stderr,"Start. Win: %lf, Koi: %lf, Alt: %lf\n",winsstat,koisstat,altsstat); | |
| 202 fprintf(stderr,"Middle. Win: %lf, Koi: %lf, Alt: %lf\n",winstat,koistat,altstat); | |
| 203 fprintf(stderr,"End. Win: %lf, Koi: %lf, Alt: %lf\n",winestat,koiestat,altestat); | |
| 204 fprintf(stderr,"Final. Win: %lf, Koi: %lf, Alt: %lf\n",calculate(winsstat,winstat,winestat),calculate(koisstat,koistat,koiestat),calculate(altsstat,altstat,altestat)); | |
| 205 #endif | |
| 206 if ((calculate(altsstat,altstat,altestat)>calculate(koisstat,koistat,koiestat))&&(calculate(altsstat,altstat,altestat)>calculate(winsstat,winstat,winestat))) return "CP866"; | |
| 207 if (calculate(koisstat,koistat,koiestat)>calculate(winsstat,winstat,winestat)) return "KOI8-R"; | |
| 208 return "CP1251"; | |
| 209 } | |
| 210 | |
/*
 * Guess the character set of a buffer of len bytes.
 *
 * Returns "UTF-8" when dfa_validate_utf8() accepts the buffer; otherwise the
 * result of is_win_charset2() on the same bytes.
 */
const char *guess_ru(const char *buf, int len)
{
    const char *charset;

    if (dfa_validate_utf8(buf, len)) {
        charset = "UTF-8";
    } else {
        const unsigned char *bytes = (const unsigned char *) buf;

        charset = is_win_charset2(bytes, len);
    }

    return charset;
}
| 218 |
