Mercurial > libguess
comparison cjk_impl.c @ 2:754a4550c64e
- added Arabic, Greek, Hebrew and Turkish DFAs
- new UCS-2LE/BE DFAs
- arabic_impl.c now uses the Arabic DFAs
- common DFA macros have been moved to dfa.h
- minor cleanups
| author | Yoshiki Yazawa <yaz@cc.rim.or.jp> |
|---|---|
| date | Wed, 11 Jun 2008 00:11:30 +0900 |
| parents | d9b6ff839eab |
| children | 70e2c306231e |
comparison
equal
deleted
inserted
replaced
| 1:04f2be1c8464 | 2:754a4550c64e |
|---|---|
| 2 * This code is derivative of guess.c of Gauche-0.8.3. | 2 * This code is derivative of guess.c of Gauche-0.8.3. |
| 3 * The following is the original copyright notice. | 3 * The following is the original copyright notice. |
| 4 */ | 4 */ |
| 5 | 5 |
| 6 /* | 6 /* |
| 7 * guess.c - guessing character encoding | 7 * guess.c - guessing character encoding |
| 8 * | 8 * |
| 9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. | 9 * Copyright (c) 2000-2003 Shiro Kawai, All rights reserved. |
| 10 * | 10 * |
| 11 * Redistribution and use in source and binary forms, with or without | 11 * Redistribution and use in source and binary forms, with or without |
| 12 * modification, are permitted provided that the following conditions | 12 * modification, are permitted provided that the following conditions |
| 13 * are met: | 13 * are met: |
| 14 * | 14 * |
| 15 * 1. Redistributions of source code must retain the above copyright | 15 * 1. Redistributions of source code must retain the above copyright |
| 16 * notice, this list of conditions and the following disclaimer. | 16 * notice, this list of conditions and the following disclaimer. |
| 17 * | 17 * |
| 18 * 2. Redistributions in binary form must reproduce the above copyright | 18 * 2. Redistributions in binary form must reproduce the above copyright |
| 19 * notice, this list of conditions and the following disclaimer in the | 19 * notice, this list of conditions and the following disclaimer in the |
| 36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | 36 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 37 * | 37 * |
| 38 */ | 38 */ |
| 39 | 39 |
| 40 #include "libguess.h" | 40 #include "libguess.h" |
| 41 | 41 #include "dfa.h" |
| 42 /* take precedence if scores are same. you can customize the order as: */ | |
| 43 /* ORDER_** &highest, &second, ... &lowest */ | |
| 44 #define ORDER_JP &utf8, &sjis, &eucj | |
| 45 #define ORDER_TW &utf8, &big5 | |
| 46 #define ORDER_CN &utf8, &gb2312, &gb18030 | |
| 47 #define ORDER_KR &utf8, &euck, &johab | |
| 48 | 42 |
| 49 /* workaround for that glib's g_convert can't convert | 43 /* workaround for that glib's g_convert can't convert |
| 50 properly from UCS-2BE/LE trailing after BOM. */ | 44 properly from UCS-2BE/LE trailing after BOM. */ |
| 51 #define WITH_G_CONVERT 1 | 45 #define WITH_G_CONVERT 1 |
| 52 /* #undef WITH_G_CONVERT */ | 46 /* #undef WITH_G_CONVERT */ |
| 57 #else | 51 #else |
| 58 const char UCS_2BE[] = "UCS-2BE"; | 52 const char UCS_2BE[] = "UCS-2BE"; |
| 59 const char UCS_2LE[] = "UCS-2LE"; | 53 const char UCS_2LE[] = "UCS-2LE"; |
| 60 #endif | 54 #endif |
| 61 | 55 |
| 62 /* data types */ | 56 /* take precedence if scores are same. you can customize the order as: */ |
| 63 typedef struct guess_arc_rec | 57 /* ORDER_** &highest, &second, ... &lowest */ |
| 64 { | 58 #define ORDER_JP &utf8, &sjis, &eucj |
| 65 unsigned int next; /* next state */ | 59 #define ORDER_TW &utf8, &big5 |
| 66 double score; /* score */ | 60 #define ORDER_CN &utf8, &gb2312, &gb18030 |
| 67 } guess_arc; | 61 #define ORDER_KR &utf8, &euck, &johab |
| 68 | |
| 69 typedef struct guess_dfa_rec | |
| 70 { | |
| 71 signed char (*states)[256]; | |
| 72 guess_arc *arcs; | |
| 73 int state; | |
| 74 double score; | |
| 75 } guess_dfa; | |
| 76 | |
| 77 /* macros */ | |
| 78 #define DFA_INIT(st, ar) \ | |
| 79 { st, ar, 0, 1.0 } | |
| 80 | |
| 81 #define DFA_NEXT(dfa, ch) \ | |
| 82 do { \ | |
| 83 int arc__; \ | |
| 84 if (dfa.state >= 0) { \ | |
| 85 arc__ = dfa.states[dfa.state][ch]; \ | |
| 86 if (arc__ < 0) { \ | |
| 87 dfa.state = -1; \ | |
| 88 } else { \ | |
| 89 dfa.state = dfa.arcs[arc__].next; \ | |
| 90 dfa.score *= dfa.arcs[arc__].score; \ | |
| 91 } \ | |
| 92 } \ | |
| 93 } while (0) | |
| 94 | |
| 95 #define DFA_ALIVE(dfa) (dfa.state >= 0) | |
| 96 | 62 |
| 97 /* include DFA table generated by guess.scm */ | 63 /* include DFA table generated by guess.scm */ |
| 98 #include "guess_tab.c" | 64 #include "guess_tab.c" |
| 99 | 65 |
| 100 | 66 |
| 114 | 80 |
| 115 DFA_NEXT(utf8, '\0'); //Bug #53 | 81 DFA_NEXT(utf8, '\0'); //Bug #53 |
| 116 | 82 |
| 117 if(DFA_ALIVE(utf8)) | 83 if(DFA_ALIVE(utf8)) |
| 118 return 1; | 84 return 1; |
| 119 else | 85 else |
| 120 return 0; | 86 return 0; |
| 121 } | 87 } |
| 122 | 88 |
| 123 const char *guess_jp(const char *buf, int buflen) | 89 const char *guess_jp(const char *buf, int buflen) |
| 124 { | 90 { |
