Mercurial > libguess
annotate cjk_impl.c @ 3:70e2c306231e
- implemented dfa utility functions.
- added dfa.c.
- rewrote guess functions for ar, gr, hw and tr scripts with dfa utilities.
- guess functions for cjk scripts too.
| author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
|---|---|
| date | Thu, 12 Jun 2008 20:20:43 +0900 |
| parents | 754a4550c64e |
| children |
| rev | line source |
|---|---|
| 0 | 1 /* |
| 2 * This code is derivative of guess.c of Gauche-0.8.3. | |
| 3 * The following is the original copyright notice. | |
| 4 */ | |
| 5 | |
| 6 /* | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
7 * guess.c - guessing character encoding |
| 0 | 8 * |
| 9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
10 * |
| 0 | 11 * Redistribution and use in source and binary forms, with or without |
| 12 * modification, are permitted provided that the following conditions | |
| 13 * are met: | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
14 * |
| 0 | 15 * 1. Redistributions of source code must retain the above copyright |
| 16 * notice, this list of conditions and the following disclaimer. | |
| 17 * | |
| 18 * 2. Redistributions in binary form must reproduce the above copyright | |
| 19 * notice, this list of conditions and the following disclaimer in the | |
| 20 * documentation and/or other materials provided with the distribution. | |
| 21 * | |
| 22 * 3. Neither the name of the authors nor the names of its contributors | |
| 23 * may be used to endorse or promote products derived from this | |
| 24 * software without specific prior written permission. | |
| 25 * | |
| 26 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 27 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 28 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 29 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| 30 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| 31 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
| 32 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 33 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF | |
| 34 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
| 35 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| 36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 37 * | |
| 38 */ | |
| 39 | |
| 40 #include "libguess.h" | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
41 #include "dfa.h" |
| 0 | 42 |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
43 #include <stdio.h> |
| 0 | 44 |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
45 /* Tie-breaking order used when scores are equal. You can customize the order as: */ |
|
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
46 /* ORDER_** &highest, &second, ... &lowest */ |
|
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
47 #define ORDER_JP &utf8, &sjis, &eucj |
|
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
48 #define ORDER_TW &utf8, &big5 |
|
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
49 #define ORDER_CN &utf8, &gb2312, &gb18030 |
|
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
50 #define ORDER_KR &utf8, &euck, &johab |
| 0 | 51 |
| 52 /* include DFA table generated by guess.scm */ | |
| 53 #include "guess_tab.c" | |
| 54 | |
| 55 int dfa_validate_utf8(const char *buf, int buflen) | |
| 56 { | |
| 57 int i; | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
58 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
| 0 | 59 |
| 60 for (i = 0; i < buflen; i++) { | |
| 61 int c = (unsigned char) buf[i]; | |
| 62 | |
| 63 if (DFA_ALIVE(utf8)) | |
| 64 DFA_NEXT(utf8, c); | |
| 65 else | |
| 66 break; | |
| 67 } | |
| 68 | |
| 69 DFA_NEXT(utf8, '\0'); //Bug #53 | |
| 70 | |
| 71 if(DFA_ALIVE(utf8)) | |
| 72 return 1; | |
|
2
754a4550c64e
- added arabic, greek, hebrew and turkish DFAs
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
0
diff
changeset
|
73 else |
| 0 | 74 return 0; |
| 75 } | |
| 76 | |
| 77 const char *guess_jp(const char *buf, int buflen) | |
| 78 { | |
| 79 int i; | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
80 const char *rv = NULL; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
81 guess_dfa eucj = DFA_INIT(guess_eucj_st, guess_eucj_ar, "EUC-JP"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
82 guess_dfa sjis = DFA_INIT(guess_sjis_st, guess_sjis_ar, "SJIS"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
83 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
| 0 | 84 guess_dfa *top = NULL; |
| 85 | |
| 86 guess_dfa *order[] = { ORDER_JP, NULL }; | |
| 87 | |
| 88 for (i = 0; i < buflen; i++) { | |
| 89 int c = (unsigned char) buf[i]; | |
| 90 | |
| 91 /* special treatment of iso-2022 escape sequence */ | |
| 92 if (c == 0x1b) { | |
| 93 if (i < buflen - 1) { | |
| 94 c = (unsigned char) buf[++i]; | |
| 95 if (c == '$' || c == '(') | |
| 96 return "ISO-2022-JP"; | |
| 97 } | |
| 98 } | |
| 99 | |
| 100 /* special treatment of BOM */ | |
| 101 if (i == 0 && c == 0xff) { | |
| 102 if (i < buflen - 1) { | |
| 103 c = (unsigned char) buf[i + 1]; | |
| 104 if (c == 0xfe) | |
| 105 return UCS_2LE; | |
| 106 } | |
| 107 } | |
| 108 if (i == 0 && c == 0xfe) { | |
| 109 if (i < buflen - 1) { | |
| 110 c = (unsigned char) buf[i + 1]; | |
| 111 if (c == 0xff) | |
| 112 return UCS_2BE; | |
| 113 } | |
| 114 } | |
| 115 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
116 rv = dfa_process(order, c); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
117 if(rv) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
118 return rv; |
| 0 | 119 |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
120 if (dfa_none(order)) { |
| 0 | 121 /* we ran out the possibilities */ |
| 122 return NULL; | |
| 123 } | |
| 124 } | |
| 125 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
126 top = dfa_top(order); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
127 if(top) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
128 return top->name; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
129 else |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
130 return NULL; |
| 0 | 131 } |
| 132 | |
| 133 const char *guess_tw(const char *buf, int buflen) | |
| 134 { | |
| 135 int i; | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
136 const char *rv = NULL; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
137 guess_dfa big5 = DFA_INIT(guess_big5_st, guess_big5_ar, "BIG5"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
138 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
| 0 | 139 guess_dfa *top = NULL; |
| 140 | |
| 141 guess_dfa *order[] = { ORDER_TW, NULL }; | |
| 142 | |
| 143 for (i = 0; i < buflen; i++) { | |
| 144 int c = (unsigned char) buf[i]; | |
| 145 | |
| 146 /* special treatment of iso-2022 escape sequence */ | |
| 147 if (c == 0x1b) { | |
| 148 if (i < buflen - 1) { | |
| 149 c = (unsigned char) buf[++i]; | |
| 150 if (c == '$' || c == '(') | |
| 151 return "ISO-2022-TW"; | |
| 152 } | |
| 153 } | |
| 154 | |
| 155 /* special treatment of BOM */ | |
| 156 if (i == 0 && c == 0xff) { | |
| 157 if (i < buflen - 1) { | |
| 158 c = (unsigned char) buf[i + 1]; | |
| 159 if (c == 0xfe) | |
| 160 return UCS_2LE; | |
| 161 } | |
| 162 } | |
| 163 if (i == 0 && c == 0xfe) { | |
| 164 if (i < buflen - 1) { | |
| 165 c = (unsigned char) buf[i + 1]; | |
| 166 if (c == 0xff) | |
| 167 return UCS_2BE; | |
| 168 } | |
| 169 } | |
| 170 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
171 rv = dfa_process(order, c); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
172 if(rv) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
173 return rv; |
| 0 | 174 |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
175 if (dfa_none(order)) { |
| 0 | 176 /* we ran out the possibilities */ |
| 177 return NULL; | |
| 178 } | |
| 179 } | |
| 180 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
181 top = dfa_top(order); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
182 if (top) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
183 return top->name; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
184 else |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
185 return NULL; |
| 0 | 186 } |
| 187 | |
| 188 const char *guess_cn(const char *buf, int buflen) | |
| 189 { | |
| 190 int i; | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
191 const char *rv = NULL; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
192 guess_dfa gb2312 = DFA_INIT(guess_gb2312_st, guess_gb2312_ar, "GB2312"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
193 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
194 guess_dfa gb18030 = DFA_INIT(guess_gb18030_st, guess_gb18030_ar, "GB18030"); |
| 0 | 195 guess_dfa *top = NULL; |
| 196 | |
| 197 guess_dfa *order[] = { ORDER_CN, NULL }; | |
| 198 | |
| 199 for (i = 0; i < buflen; i++) { | |
| 200 int c = (unsigned char) buf[i]; | |
| 201 int c2; | |
| 202 | |
| 203 /* special treatment of iso-2022 escape sequence */ | |
| 204 if (c == 0x1b) { | |
| 205 if (i < buflen - 1) { | |
| 206 c = (unsigned char) buf[i + 1]; | |
| 207 c2 = (unsigned char) buf[i + 2]; | |
| 208 if (c == '$' && (c2 == ')' || c2 == '+')) | |
| 209 return "ISO-2022-CN"; | |
| 210 } | |
| 211 } | |
| 212 | |
| 213 /* special treatment of BOM */ | |
| 214 if (i == 0 && c == 0xff) { | |
| 215 if (i < buflen - 1) { | |
| 216 c = (unsigned char) buf[i + 1]; | |
| 217 if (c == 0xfe) | |
| 218 return UCS_2LE; | |
| 219 } | |
| 220 } | |
| 221 if (i == 0 && c == 0xfe) { | |
| 222 if (i < buflen - 1) { | |
| 223 c = (unsigned char) buf[i + 1]; | |
| 224 if (c == 0xff) | |
| 225 return UCS_2BE; | |
| 226 } | |
| 227 } | |
| 228 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
229 rv = dfa_process(order, c); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
230 if(rv) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
231 return rv; |
| 0 | 232 |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
233 if (dfa_none(order)) { |
| 0 | 234 /* we ran out the possibilities */ |
| 235 return NULL; | |
| 236 } | |
| 237 } | |
| 238 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
239 top = dfa_top(order); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
240 if(top) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
241 return top->name; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
242 else |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
243 return NULL; |
| 0 | 244 } |
| 245 | |
| 246 const char *guess_kr(const char *buf, int buflen) | |
| 247 { | |
| 248 int i; | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
249 const char *rv = NULL; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
250 guess_dfa euck = DFA_INIT(guess_euck_st, guess_euck_ar, "EUC-KR"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
251 guess_dfa utf8 = DFA_INIT(guess_utf8_st, guess_utf8_ar, "UTF-8"); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
252 guess_dfa johab = DFA_INIT(guess_johab_st, guess_johab_ar, "JOHAB"); |
| 0 | 253 guess_dfa *top = NULL; |
| 254 | |
| 255 guess_dfa *order[] = { ORDER_KR, NULL }; | |
| 256 | |
| 257 for (i = 0; i < buflen; i++) { | |
| 258 int c = (unsigned char) buf[i]; | |
| 259 int c2; | |
| 260 | |
| 261 /* special treatment of iso-2022 escape sequence */ | |
| 262 if (c == 0x1b) { | |
| 263 if (i < buflen - 1) { | |
| 264 c = (unsigned char) buf[i + 1]; | |
| 265 c2 = (unsigned char) buf[i + 2]; | |
| 266 if (c == '$' && c2 == ')') | |
| 267 return "ISO-2022-KR"; | |
| 268 } | |
| 269 } | |
| 270 | |
| 271 /* special treatment of BOM */ | |
| 272 if (i == 0 && c == 0xff) { | |
| 273 if (i < buflen - 1) { | |
| 274 c = (unsigned char) buf[i + 1]; | |
| 275 if (c == 0xfe) | |
| 276 return UCS_2LE; | |
| 277 } | |
| 278 } | |
| 279 if (i == 0 && c == 0xfe) { | |
| 280 if (i < buflen - 1) { | |
| 281 c = (unsigned char) buf[i + 1]; | |
| 282 if (c == 0xff) | |
| 283 return UCS_2BE; | |
| 284 } | |
| 285 } | |
| 286 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
287 rv = dfa_process(order, c); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
288 if(rv) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
289 return rv; |
| 0 | 290 |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
291 if (dfa_none(order)) { |
| 0 | 292 /* we ran out the possibilities */ |
| 293 return NULL; | |
| 294 } | |
| 295 } | |
| 296 | |
|
3
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
297 top = dfa_top(order); |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
298 if(top) |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
299 return top->name; |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
300 else |
|
70e2c306231e
- implemented dfa utility functions.
Yoshiki Yazawa <yaz@cc.rim.or.jp>
parents:
2
diff
changeset
|
301 return NULL; |
| 0 | 302 } |
