Mercurial > emacs
comparison src/coding.c @ 87676:2aeceff24280
(detect_coding_iso2022): New arg latin_extra_code_state. Allow Latin
extra codes only when *latin_extra_code_state is nonzero.
(detect_coding_mask): If there is a NULL byte, detect the encoding as
UTF-16 or binary. If there is a Latin extra code, detect the encoding
as ISO-2022 only when no other proper encoding is found.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Wed, 09 Jan 2008 06:05:23 +0000 |
| parents | 107ccd98fa12 |
| children | df9e1c663162 |
comparison
equal
deleted
inserted
replaced
| 87675:e4a11c2d5016 | 87676:2aeceff24280 |
|---|---|
| 1404 CODING_CATEGORY_MASK_ISO_8_1 | 1404 CODING_CATEGORY_MASK_ISO_8_1 |
| 1405 CODING_CATEGORY_MASK_ISO_8_2 | 1405 CODING_CATEGORY_MASK_ISO_8_2 |
| 1406 CODING_CATEGORY_MASK_ISO_7_ELSE | 1406 CODING_CATEGORY_MASK_ISO_7_ELSE |
| 1407 CODING_CATEGORY_MASK_ISO_8_ELSE | 1407 CODING_CATEGORY_MASK_ISO_8_ELSE |
| 1408 are set. If a code which should never appear in ISO2022 is found, | 1408 are set. If a code which should never appear in ISO2022 is found, |
| 1409 returns 0. */ | 1409 returns 0. |
| | 1410 |
| | 1411 If *latin_extra_code_state is zero and Latin extra codes are found, |
| | 1412 set *latin_extra_code_state to 1 and return 0. If it is nonzero, |
| | 1413 accept Latin extra codes. */ |
| 1410 | 1414 |
| 1411 static int | 1415 static int |
| 1412 detect_coding_iso2022 (src, src_end, multibytep) | 1416 detect_coding_iso2022 (src, src_end, multibytep, latin_extra_code_state) |
| 1413 unsigned char *src, *src_end; | 1417 unsigned char *src, *src_end; |
| 1414 int multibytep; | 1418 int multibytep; |
| | 1419 int *latin_extra_code_state; |
| 1415 { | 1420 { |
| 1416 int mask = CODING_CATEGORY_MASK_ISO; | 1421 int mask = CODING_CATEGORY_MASK_ISO; |
| 1417 int mask_found = 0; | 1422 int mask_found = 0; |
| 1418 int reg[4], shift_out = 0, single_shifting = 0; | 1423 int reg[4], shift_out = 0, single_shifting = 0; |
| 1419 int c, c1, charset; | 1424 int c, c1, charset; |
| 1572 single_shifting = 1; | 1577 single_shifting = 1; |
| 1573 } | 1578 } |
| 1574 if (VECTORP (Vlatin_extra_code_table) | 1579 if (VECTORP (Vlatin_extra_code_table) |
| 1575 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1580 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 1576 { | 1581 { |
| | 1582 if (! *latin_extra_code_state) |
| | 1583 { |
| | 1584 *latin_extra_code_state = 1; |
| | 1585 return 0; |
| | 1586 } |
| 1577 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1587 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1578 & CODING_FLAG_ISO_LATIN_EXTRA) | 1588 & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1579 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1589 newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| 1580 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | 1590 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags |
| 1581 & CODING_FLAG_ISO_LATIN_EXTRA) | 1591 & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1598 if (VECTORP (Vlatin_extra_code_table) | 1608 if (VECTORP (Vlatin_extra_code_table) |
| 1599 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 1609 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 1600 { | 1610 { |
| 1601 int newmask = 0; | 1611 int newmask = 0; |
| 1602 | 1612 |
| | 1613 if (! *latin_extra_code_state) |
| | 1614 { |
| | 1615 *latin_extra_code_state = 1; |
| | 1616 return 0; |
| | 1617 } |
| 1603 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags | 1618 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_1]->flags |
| 1604 & CODING_FLAG_ISO_LATIN_EXTRA) | 1619 & CODING_FLAG_ISO_LATIN_EXTRA) |
| 1605 newmask |= CODING_CATEGORY_MASK_ISO_8_1; | 1620 newmask |= CODING_CATEGORY_MASK_ISO_8_1; |
| 1606 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags | 1621 if (coding_system_table[CODING_CATEGORY_IDX_ISO_8_2]->flags |
| 1607 & CODING_FLAG_ISO_LATIN_EXTRA) | 1622 & CODING_FLAG_ISO_LATIN_EXTRA) |
| 4125 { | 4140 { |
| 4126 register unsigned char c; | 4141 register unsigned char c; |
| 4127 unsigned char *src = source, *src_end = source + src_bytes; | 4142 unsigned char *src = source, *src_end = source + src_bytes; |
| 4128 unsigned int mask, utf16_examined_p, iso2022_examined_p; | 4143 unsigned int mask, utf16_examined_p, iso2022_examined_p; |
| 4129 int i; | 4144 int i; |
| | 4145 int null_byte_found; |
| | 4146 int latin_extra_code_state = 1; |
| 4130 | 4147 |
| 4131 /* At first, skip all ASCII characters and control characters except | 4148 /* At first, skip all ASCII characters and control characters except |
| 4132 for three ISO2022 specific control characters. */ | 4149 for three ISO2022 specific control characters. */ |
| 4133 ascii_skip_code[ISO_CODE_SO] = 0; | 4150 ascii_skip_code[ISO_CODE_SO] = 0; |
| 4134 ascii_skip_code[ISO_CODE_SI] = 0; | 4151 ascii_skip_code[ISO_CODE_SI] = 0; |
| 4135 ascii_skip_code[ISO_CODE_ESC] = 0; | 4152 ascii_skip_code[ISO_CODE_ESC] = 0; |
| 4136 | 4153 |
| 4137 label_loop_detect_coding: | 4154 label_loop_detect_coding: |
| 4138 while (src < src_end && ascii_skip_code[*src]) src++; | 4155 null_byte_found = 0; |
| | 4156 while (src < src_end && ascii_skip_code[*src]) |
| | 4157 null_byte_found \|= (! *src++); |
| | 4158 if (! null_byte_found) |
| | 4159 { |
| | 4160 unsigned char *p = src + 1; |
| | 4161 while (p < src_end) |
| | 4162 null_byte_found \|= (! *p++); |
| | 4163 } |
| 4139 *skip = src - source; | 4164 *skip = src - source; |
| 4140 | 4165 |
| 4141 if (src >= src_end) | 4166 if (src >= src_end) |
| 4142 /* We found nothing other than ASCII. There's nothing to do. */ | 4167 /* We found nothing other than ASCII (and NULL byte). There's |
| | 4168 nothing to do. */ |
| 4143 return 0; | 4169 return 0; |
| 4144 | 4170 |
| 4145 c = *src; | 4171 c = *src; |
| 4146 /* The text seems to be encoded in some multilingual coding system. | 4172 /* The text seems to be encoded in some multilingual coding system. |
| 4147 Now, try to find in which coding system the text is encoded. */ | 4173 Now, try to find in which coding system the text is encoded. */ |
| 4148 if (c < 0x80) | 4174 if (! null_byte_found && c < 0x80) |
| 4149 { | 4175 { |
| 4150 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | 4176 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ |
| 4151 /* C is an ISO2022 specific control code of C0. */ | 4177 /* C is an ISO2022 specific control code of C0. */ |
| 4152 mask = detect_coding_iso2022 (src, src_end, multibytep); | 4178 latin_extra_code_state = 1; |
| | 4179 mask = detect_coding_iso2022 (src, src_end, multibytep, |
| | 4180 &latin_extra_code_state); |
| 4153 if (mask == 0) | 4181 if (mask == 0) |
| 4154 { | 4182 { |
| 4155 /* No valid ISO2022 code follows C. Try again. */ | 4183 /* No valid ISO2022 code follows C. Try again. */ |
| 4156 src++; | 4184 src++; |
| 4157 if (c == ISO_CODE_ESC) | 4185 if (c == ISO_CODE_ESC) |
| 4175 int try; | 4203 int try; |
| 4176 | 4204 |
| 4177 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) | 4205 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) |
| 4178 c = src[1] - 0x20; | 4206 c = src[1] - 0x20; |
| 4179 | 4207 |
| 4180 if (c < 0xA0) | 4208 if (null_byte_found) |
| | 4209 { |
| | 4210 try = (CODING_CATEGORY_MASK_UTF_16_BE |
| | 4211 \| CODING_CATEGORY_MASK_UTF_16_LE); |
| | 4212 } |
| | 4213 else if (c < 0xA0) |
| 4181 { | 4214 { |
| 4182 /* C is the first byte of SJIS character code, | 4215 /* C is the first byte of SJIS character code, |
| 4183 or a leading-code of Emacs' internal format (emacs-mule), | 4216 or a leading-code of Emacs' internal format (emacs-mule), |
| 4184 or the first byte of UTF-16. */ | 4217 or the first byte of UTF-16. */ |
| 4185 try = (CODING_CATEGORY_MASK_SJIS | 4218 try = (CODING_CATEGORY_MASK_SJIS |
| 4186 | CODING_CATEGORY_MASK_EMACS_MULE | 4219 | CODING_CATEGORY_MASK_EMACS_MULE |
| 4187 | CODING_CATEGORY_MASK_UTF_16_BE | 4220 | CODING_CATEGORY_MASK_UTF_16_BE |
| 4188 | CODING_CATEGORY_MASK_UTF_16_LE); | 4221 | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4189 | 4222 |
| 4190 /* Or, if C is a special latin extra code, | 4223 /* Or, if C is a special latin extra code, |
| 4191 or is an ISO2022 specific control code of C1 (SS2 or SS3), | 4224 or is an ISO2022 specific control code of C1 (SS2 or SS3), |
| 4192 or is an ISO2022 control-sequence-introducer (CSI), | 4225 or is an ISO2022 control-sequence-introducer (CSI), |
| 4193 we should also consider the possibility of ISO2022 codings. */ | 4226 we should also consider the possibility of ISO2022 codings. */ |
| 4194 if ((VECTORP (Vlatin_extra_code_table) | 4227 if ((latin_extra_code_state |
| | 4228 && VECTORP (Vlatin_extra_code_table) |
| 4195 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) | 4229 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])) |
| 4196 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) | 4230 || (c == ISO_CODE_SS2 || c == ISO_CODE_SS3) |
| 4197 || (c == ISO_CODE_CSI | 4231 || (c == ISO_CODE_CSI |
| 4198 && (src < src_end | 4232 && (src < src_end |
| 4199 && (*src == ']' | 4233 && (*src == ']' |
| 4200 || ((*src == '0' || *src == '1' || *src == '2') | 4234 || ((*src == '0' || *src == '1' || *src == '2') |
| 4201 && src + 1 < src_end | 4235 && src + 1 < src_end |
| 4202 && src[1] == ']'))))) | 4236 && src[1] == ']'))))) |
| 4203 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE | 4237 try |= (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4204 | CODING_CATEGORY_MASK_ISO_8BIT); | 4238 | CODING_CATEGORY_MASK_ISO_8BIT); |
| 4205 } | 4239 } |
| 4206 else | 4240 else |
| 4207 /* C is a character of ISO2022 in graphic plane right, | 4241 /* C is a character of ISO2022 in graphic plane right, |
| 4208 or a SJIS's 1-byte character code (i.e. JISX0201), | 4242 or a SJIS's 1-byte character code (i.e. JISX0201), |
| 4209 or the first byte of BIG5's 2-byte code, | 4243 or the first byte of BIG5's 2-byte code, |
| 4210 or the first byte of UTF-8/16. */ | 4244 or the first byte of UTF-8/16. */ |
| 4211 try = (CODING_CATEGORY_MASK_ISO_8_ELSE | 4245 try = (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 4212 | CODING_CATEGORY_MASK_ISO_8BIT | 4246 | CODING_CATEGORY_MASK_ISO_8BIT |
| 4213 | CODING_CATEGORY_MASK_SJIS | 4247 | CODING_CATEGORY_MASK_SJIS |
| 4214 | CODING_CATEGORY_MASK_BIG5 | 4248 | CODING_CATEGORY_MASK_BIG5 |
| 4215 | CODING_CATEGORY_MASK_UTF_8 | 4249 | CODING_CATEGORY_MASK_UTF_8 |
| 4216 | CODING_CATEGORY_MASK_UTF_16_BE | 4250 | CODING_CATEGORY_MASK_UTF_16_BE |
| 4217 | CODING_CATEGORY_MASK_UTF_16_LE); | 4251 | CODING_CATEGORY_MASK_UTF_16_LE); |
| 4218 | 4252 |
| 4219 /* Or, we may have to consider the possibility of CCL. */ | 4253 /* Or, we may have to consider the possibility of CCL. */ |
| 4220 if (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4254 if (! null_byte_found |
| | 4255 && coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 4221 && (coding_system_table[CODING_CATEGORY_IDX_CCL] | 4256 && (coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 4222 ->spec.ccl.valid_codes)[c]) | 4257 ->spec.ccl.valid_codes)[c]) |
| 4223 try |= CODING_CATEGORY_MASK_CCL; | 4258 try |= CODING_CATEGORY_MASK_CCL; |
| 4224 | 4259 |
| 4225 mask = 0; | 4260 mask = 0; |
| 4226 utf16_examined_p = iso2022_examined_p = 0; | |
| 4227 if (priorities) | 4261 if (priorities) |
| 4228 { | 4262 { |
| | 4263 /* At first try detection with Latin extra codes not-allowed. |
| | 4264 If no proper coding system is found because of Latin extra |
| | 4265 codes, try detection with Latin extra codes allowed. */ |
| | 4266 latin_extra_code_state = 0; |
| | 4267 label_retry: |
| | 4268 utf16_examined_p = iso2022_examined_p = 0; |
| 4229 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | 4269 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) |
| 4230 { | 4270 { |
| 4231 if (!iso2022_examined_p | 4271 if (!iso2022_examined_p |
| 4232 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) | 4272 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) |
| 4233 { | 4273 { |
| 4234 mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4274 mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| | 4275 &latin_extra_code_state); |
| 4235 iso2022_examined_p = 1; | 4276 iso2022_examined_p = 1; |
| 4236 } | 4277 } |
| 4237 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 4278 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
| 4238 mask |= detect_coding_sjis (src, src_end, multibytep); | 4279 mask |= detect_coding_sjis (src, src_end, multibytep); |
| 4239 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) | 4280 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) |
| 4250 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) | 4291 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) |
| 4251 mask |= detect_coding_emacs_mule (src, src_end, multibytep); | 4292 mask |= detect_coding_emacs_mule (src, src_end, multibytep); |
| 4252 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 4293 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
| 4253 mask |= detect_coding_ccl (src, src_end, multibytep); | 4294 mask |= detect_coding_ccl (src, src_end, multibytep); |
| 4254 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 4295 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
| 4255 mask |= CODING_CATEGORY_MASK_RAW_TEXT; | 4296 { |
| | 4297 if (latin_extra_code_state == 1) |
| | 4298 { |
| | 4299 /* Detection of ISO-2022 based coding system |
| | 4300 failed because of Latin extra codes. Before |
| | 4301 falling back to raw-text, try again with |
| | 4302 Latin extra codes allowed. */ |
| | 4303 latin_extra_code_state = 2; |
| | 4304 try = (mask \| CODING_CATEGORY_MASK_ISO_8_ELSE |
| | 4305 \| CODING_CATEGORY_MASK_ISO_8BIT); |
| | 4306 goto label_retry; |
| | 4307 } |
| | 4308 mask \|= CODING_CATEGORY_MASK_RAW_TEXT; |
| | 4309 } |
| 4256 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 4310 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
| 4257 mask |= CODING_CATEGORY_MASK_BINARY; | 4311 { |
| | 4312 if (latin_extra_code_state == 1) |
| | 4313 { |
| | 4314 /* See the above comment. */ |
| | 4315 latin_extra_code_state = 2; |
| | 4316 try = (mask \| CODING_CATEGORY_MASK_ISO_8_ELSE |
| | 4317 \| CODING_CATEGORY_MASK_ISO_8BIT); |
| | 4318 goto label_retry; |
| | 4319 } |
| | 4320 mask \|= CODING_CATEGORY_MASK_BINARY; |
| | 4321 } |
| 4258 if (mask & priorities[i]) | 4322 if (mask & priorities[i]) |
| 4259 return priorities[i]; | 4323 return priorities[i]; |
| 4260 } | 4324 } |
| 4261 return CODING_CATEGORY_MASK_RAW_TEXT; | 4325 return CODING_CATEGORY_MASK_RAW_TEXT; |
| 4262 } | 4326 } |
| 4263 if (try & CODING_CATEGORY_MASK_ISO) | 4327 if (try & CODING_CATEGORY_MASK_ISO) |
| 4264 mask |= detect_coding_iso2022 (src, src_end, multibytep); | 4328 mask |= detect_coding_iso2022 (src, src_end, multibytep, |
| | 4329 &latin_extra_code_state); |
| 4265 if (try & CODING_CATEGORY_MASK_SJIS) | 4330 if (try & CODING_CATEGORY_MASK_SJIS) |
| 4266 mask |= detect_coding_sjis (src, src_end, multibytep); | 4331 mask |= detect_coding_sjis (src, src_end, multibytep); |
| 4267 if (try & CODING_CATEGORY_MASK_BIG5) | 4332 if (try & CODING_CATEGORY_MASK_BIG5) |
| 4268 mask |= detect_coding_big5 (src, src_end, multibytep); | 4333 mask |= detect_coding_big5 (src, src_end, multibytep); |
| 4269 if (try & CODING_CATEGORY_MASK_UTF_8) | 4334 if (try & CODING_CATEGORY_MASK_UTF_8) |
