comparison src/coding.c @ 89331:1892a75ffcac

(CATEGORY_MASK_RAW_TEXT): New macro. (detect_coding_utf_8, detect_coding_utf_16) (detect_coding_emacs_mule, detect_coding_iso_2022) (detect_coding_sjis, detect_coding_big5) (detect_coding_ccl, detect_coding_charset): Change argument MASK to DETECT_INFO. Update DETECT_INFO and return 1 if the byte sequence is valid in this coding system. Callers changed. (MAX_ANNOTATION_LENGTH): New macro. (ADD_ANNOTATION_DATA): New macro. (ADD_COMPOSITION_DATA): Argument changed. Callers changed. Call ADD_ANNOTATION_DATA. The format of annotation data changed. (ADD_CHARSET_DATA): New macro. (emacs_mule_char): New argument ID. Callers changed. (decode_coding_emacs_mule, decode_coding_iso_2022) (decode_coding_sjis, decode_coding_big5, decode_coding_charset): Produce charset annotation data in coding->charbuf. (encode_coding_emacs_mule, encode_coding_iso_2022): Pay attention to charset annotation data in coding->charbuf. (setup_coding_system): Add CODING_ANNOTATE_CHARSET_MASK coding->common_flags if the coding system is iso-2022 based and uses designation. (produce_composition): Adjusted for the new annotation data format. (produce_charset): New function. (produce_annotation): Handle charset annotation. (handle_composition_annotation, handle_charset_annotation): New functions. (consume_chars): Handle charset annotation. Utilize the above two functions. (encode_coding_object): If SRC_OBJECT and DST_OBJECT are the same buffer, get the deleted text as a string and set coding->src_object to that string. (detect_coding, detect_coding_system): Use the new struct coding_detection_info.
author Kenichi Handa <handa@m17n.org>
date Mon, 06 Jan 2003 11:37:17 +0000
parents 1fd77c471ee6
children 4cc9e57fcabc
comparison
equal deleted inserted replaced
89330:ee0338e83a2b 89331:1892a75ffcac
142 142
143 143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions *** 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145 145
146 These functions check if a byte sequence specified as a source in 146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX. Return 1 if the data contains 147 CODING conforms to the format of XXX, and update the members of
148 a byte sequence which can be decoded into non-ASCII characters by 148 DETECT_INFO.
149 the coding system. Otherwize (i.e. the data contains only ASCII 149
150 characters or invalid sequence) return 0. 150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 It also resets some bits of an integer pointed by MASK. The macros
153 CATEGORY_MASK_XXX specifies each bit of this integer.
154 151
155 Below is the template of these functions. */ 152 Below is the template of these functions. */
156 153
157 #if 0 154 #if 0
158 static int 155 static int
159 detect_coding_XXX (coding, mask) 156 detect_coding_XXX (coding, detect_info)
160 struct coding_system *coding; 157 struct coding_system *coding;
161 int *mask; 158 struct coding_detection_info *detect_info;
162 { 159 {
163 unsigned char *src = coding->source; 160 unsigned char *src = coding->source;
164 unsigned char *src_end = coding->source + coding->src_bytes; 161 unsigned char *src_end = coding->source + coding->src_bytes;
165 int multibytep = coding->src_multibyte; 162 int multibytep = coding->src_multibyte;
166 int c; 163 int consumed_chars = 0;
167 int found = 0; 164 int found = 0;
168 ...; 165 ...;
169 166
170 while (1) 167 while (1)
171 { 168 {
172 /* Get one byte from the source. If the source is exhausted, jump 169 /* Get one byte from the source. If the source is exhausted, jump
173 to no_more_source:. */ 170 to no_more_source:. */
174 ONE_MORE_BYTE (c); 171 ONE_MORE_BYTE (c);
175 /* Check if it conforms to XXX. If not, break the loop. */ 172
176 } 173 if (! __C_conforms_to_XXX___ (c))
177 /* As the data is invalid for XXX, reset a proper bits. */ 174 break;
178 *mask &= ~CODING_CATEGORY_XXX; 175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
179 return 0; 180 return 0;
181
180 no_more_source: 182 no_more_source:
181 /* The source is exhausted. */ 183 /* The source is exhausted successfully. */
182 if (!found) 184 detect_info->found |= found;
183 /* ASCII characters only. */
184 return 0;
185 /* Some data should be decoded into non-ASCII characters. */
186 *mask &= CODING_CATEGORY_XXX;
187 return 1; 185 return 1;
188 } 186 }
189 #endif 187 #endif
190 188
191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions *** 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
406 /* Two special coding systems. */ 404 /* Two special coding systems. */
407 Lisp_Object Vsjis_coding_system; 405 Lisp_Object Vsjis_coding_system;
408 Lisp_Object Vbig5_coding_system; 406 Lisp_Object Vbig5_coding_system;
409 407
410 408
411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *)); 409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
412 static void decode_coding_utf_8 P_ ((struct coding_system *)); 411 static void decode_coding_utf_8 P_ ((struct coding_system *));
413 static int encode_coding_utf_8 P_ ((struct coding_system *)); 412 static int encode_coding_utf_8 P_ ((struct coding_system *));
414 413
415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *)); 414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *)); 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *)); 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418 418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *)); 419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
420 static void decode_coding_iso_2022 P_ ((struct coding_system *)); 421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
421 static int encode_coding_iso_2022 P_ ((struct coding_system *)); 422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
422 423
423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *)); 424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
424 static void decode_coding_emacs_mule P_ ((struct coding_system *)); 426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
425 static int encode_coding_emacs_mule P_ ((struct coding_system *)); 427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
426 428
427 static int detect_coding_sjis P_ ((struct coding_system *, int *)); 429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
428 static void decode_coding_sjis P_ ((struct coding_system *)); 431 static void decode_coding_sjis P_ ((struct coding_system *));
429 static int encode_coding_sjis P_ ((struct coding_system *)); 432 static int encode_coding_sjis P_ ((struct coding_system *));
430 433
431 static int detect_coding_big5 P_ ((struct coding_system *, int *)); 434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
432 static void decode_coding_big5 P_ ((struct coding_system *)); 436 static void decode_coding_big5 P_ ((struct coding_system *));
433 static int encode_coding_big5 P_ ((struct coding_system *)); 437 static int encode_coding_big5 P_ ((struct coding_system *));
434 438
435 static int detect_coding_ccl P_ ((struct coding_system *, int *)); 439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
436 static void decode_coding_ccl P_ ((struct coding_system *)); 441 static void decode_coding_ccl P_ ((struct coding_system *));
437 static int encode_coding_ccl P_ ((struct coding_system *)); 442 static int encode_coding_ccl P_ ((struct coding_system *));
438 443
439 static void decode_coding_raw_text P_ ((struct coding_system *)); 444 static void decode_coding_raw_text P_ ((struct coding_system *));
440 static int encode_coding_raw_text P_ ((struct coding_system *)); 445 static int encode_coding_raw_text P_ ((struct coding_system *));
629 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset) 634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
630 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis) 635 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
631 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5) 636 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
632 #define CATEGORY_MASK_CCL (1 << coding_category_ccl) 637 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
633 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule) 638 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
639 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
634 640
635 /* This value is returned if detect_coding_mask () finds nothing other 641 /* This value is returned if detect_coding_mask () finds nothing other
636 than ASCII characters. */ 642 than ASCII characters. */
637 #define CATEGORY_MASK_ANY \ 643 #define CATEGORY_MASK_ANY \
638 (CATEGORY_MASK_ISO_7 \ 644 (CATEGORY_MASK_ISO_7 \
1000 coding_set_destination (coding); 1006 coding_set_destination (coding);
1001 dst = coding->destination + offset; 1007 dst = coding->destination + offset;
1002 return dst; 1008 return dst;
1003 } 1009 }
1004 1010
1011 /** Macros for annotations. */
1012
1013 /* Maximum length of annotation data (sum of annotations for
1014 composition and charset). */
1015 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
1016
1017 /* Annotation data is stored in the array coding->charbuf in this
1018 format:
1019 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
1020 LENGTH is the number of elements in the annotation.
1021 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1022 FROM and TO specify the range of text annotated. They are relative
1023 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
1024
1025 The format of the following elements depends on ANNOTATION_MASK.
1026
1027 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1028 follow:
1029 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1030 METHOD is one of enum composition_method.
1031 Optional COMPOSITION-COMPONENTS are characters and composition
1032 rules.
1033
1034 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1035 follows. */
1036
1037 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1038 do { \
1039 *(buf)++ = -(len); \
1040 *(buf)++ = (mask); \
1041 *(buf)++ = (from); \
1042 *(buf)++ = (to); \
1043 coding->annotated = 1; \
1044 } while (0);
1045
1046 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1047 do { \
1048 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1049 *buf++ = method; \
1050 } while (0)
1051
1052
1053 #define ADD_CHARSET_DATA(buf, from, to, id) \
1054 do { \
1055 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1056 *buf++ = id; \
1057 } while (0)
1058
1005 1059
1006 /*** 2. Emacs' internal format (emacs-utf-8) ***/ 1060 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1007 1061
1008 1062
1009 1063
1010 1064
1011 /*** 3. UTF-8 ***/ 1065 /*** 3. UTF-8 ***/
1012 1066
1013 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1067 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1014 Check if a text is encoded in UTF-8. If it is, return 1068 Check if a text is encoded in UTF-8. If it is, return 1, else
1015 CATEGORY_MASK_UTF_8, else return 0. */ 1069 return 0. */
1016 1070
1017 #define UTF_8_1_OCTET_P(c) ((c) < 0x80) 1071 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1018 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) 1072 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1019 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) 1073 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1020 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) 1074 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1021 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) 1075 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1022 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) 1076 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1023 1077
1024 static int 1078 static int
1025 detect_coding_utf_8 (coding, mask) 1079 detect_coding_utf_8 (coding, detect_info)
1026 struct coding_system *coding; 1080 struct coding_system *coding;
1027 int *mask; 1081 struct coding_detection_info *detect_info;
1028 { 1082 {
1029 unsigned char *src = coding->source, *src_base = src; 1083 unsigned char *src = coding->source, *src_base = src;
1030 unsigned char *src_end = coding->source + coding->src_bytes; 1084 unsigned char *src_end = coding->source + coding->src_bytes;
1031 int multibytep = coding->src_multibyte; 1085 int multibytep = coding->src_multibyte;
1032 int consumed_chars = 0; 1086 int consumed_chars = 0;
1033 int found = 0; 1087 int found = 0;
1034 int incomplete; 1088 int incomplete;
1035 1089
1090 detect_info->checked |= CATEGORY_MASK_UTF_8;
1036 /* A coding system of this category is always ASCII compatible. */ 1091 /* A coding system of this category is always ASCII compatible. */
1037 src += coding->head_ascii; 1092 src += coding->head_ascii;
1038 1093
1039 while (1) 1094 while (1)
1040 { 1095 {
1048 ONE_MORE_BYTE (c1); 1103 ONE_MORE_BYTE (c1);
1049 if (! UTF_8_EXTRA_OCTET_P (c1)) 1104 if (! UTF_8_EXTRA_OCTET_P (c1))
1050 break; 1105 break;
1051 if (UTF_8_2_OCTET_LEADING_P (c)) 1106 if (UTF_8_2_OCTET_LEADING_P (c))
1052 { 1107 {
1053 found++; 1108 found = CATEGORY_MASK_UTF_8;
1054 continue; 1109 continue;
1055 } 1110 }
1056 ONE_MORE_BYTE (c2); 1111 ONE_MORE_BYTE (c2);
1057 if (! UTF_8_EXTRA_OCTET_P (c2)) 1112 if (! UTF_8_EXTRA_OCTET_P (c2))
1058 break; 1113 break;
1059 if (UTF_8_3_OCTET_LEADING_P (c)) 1114 if (UTF_8_3_OCTET_LEADING_P (c))
1060 { 1115 {
1061 found++; 1116 found = CATEGORY_MASK_UTF_8;
1062 continue; 1117 continue;
1063 } 1118 }
1064 ONE_MORE_BYTE (c3); 1119 ONE_MORE_BYTE (c3);
1065 if (! UTF_8_EXTRA_OCTET_P (c3)) 1120 if (! UTF_8_EXTRA_OCTET_P (c3))
1066 break; 1121 break;
1067 if (UTF_8_4_OCTET_LEADING_P (c)) 1122 if (UTF_8_4_OCTET_LEADING_P (c))
1068 { 1123 {
1069 found++; 1124 found = CATEGORY_MASK_UTF_8;
1070 continue; 1125 continue;
1071 } 1126 }
1072 ONE_MORE_BYTE (c4); 1127 ONE_MORE_BYTE (c4);
1073 if (! UTF_8_EXTRA_OCTET_P (c4)) 1128 if (! UTF_8_EXTRA_OCTET_P (c4))
1074 break; 1129 break;
1075 if (UTF_8_5_OCTET_LEADING_P (c)) 1130 if (UTF_8_5_OCTET_LEADING_P (c))
1076 { 1131 {
1077 found++; 1132 found = CATEGORY_MASK_UTF_8;
1078 continue; 1133 continue;
1079 } 1134 }
1080 break; 1135 break;
1081 } 1136 }
1082 *mask &= ~CATEGORY_MASK_UTF_8; 1137 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1083 return 0; 1138 return 0;
1084 1139
1085 no_more_source: 1140 no_more_source:
1086 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 1141 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1087 { 1142 {
1088 *mask &= ~CATEGORY_MASK_UTF_8; 1143 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1089 return 0; 1144 return 0;
1090 } 1145 }
1091 return found; 1146 detect_info->found |= found;
1147 return 1;
1092 } 1148 }
1093 1149
1094 1150
1095 static void 1151 static void
1096 decode_coding_utf_8 (coding) 1152 decode_coding_utf_8 (coding)
1267 return 0; 1323 return 0;
1268 } 1324 }
1269 1325
1270 1326
1271 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1327 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1272 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or 1328 Check if a text is encoded in one of UTF-16 based coding systems.
1273 Little Endian (otherwise). If it is, return 1329 If it is, return 1, else return 0. */
1274 CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1275 else return 0. */
1276 1330
1277 #define UTF_16_HIGH_SURROGATE_P(val) \ 1331 #define UTF_16_HIGH_SURROGATE_P(val) \
1278 (((val) & 0xFC00) == 0xD800) 1332 (((val) & 0xFC00) == 0xD800)
1279 1333
1280 #define UTF_16_LOW_SURROGATE_P(val) \ 1334 #define UTF_16_LOW_SURROGATE_P(val) \
1285 || ((val) == 0xFFFF) \ 1339 || ((val) == 0xFFFF) \
1286 || UTF_16_LOW_SURROGATE_P (val)) 1340 || UTF_16_LOW_SURROGATE_P (val))
1287 1341
1288 1342
1289 static int 1343 static int
1290 detect_coding_utf_16 (coding, mask) 1344 detect_coding_utf_16 (coding, detect_info)
1291 struct coding_system *coding; 1345 struct coding_system *coding;
1292 int *mask; 1346 struct coding_detection_info *detect_info;
1293 { 1347 {
1294 unsigned char *src = coding->source, *src_base = src; 1348 unsigned char *src = coding->source, *src_base = src;
1295 unsigned char *src_end = coding->source + coding->src_bytes; 1349 unsigned char *src_end = coding->source + coding->src_bytes;
1296 int multibytep = coding->src_multibyte; 1350 int multibytep = coding->src_multibyte;
1297 int consumed_chars = 0; 1351 int consumed_chars = 0;
1298 int c1, c2; 1352 int c1, c2;
1299 1353
1300 *mask &= ~CATEGORY_MASK_UTF_16; 1354 detect_info->checked |= CATEGORY_MASK_UTF_16;
1301 1355
1356 if (coding->mode & CODING_MODE_LAST_BLOCK
1357 && (coding->src_bytes & 1))
1358 {
1359 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1360 return 0;
1361 }
1302 ONE_MORE_BYTE (c1); 1362 ONE_MORE_BYTE (c1);
1303 ONE_MORE_BYTE (c2); 1363 ONE_MORE_BYTE (c2);
1304 1364
1305 if ((c1 == 0xFF) && (c2 == 0xFE)) 1365 if ((c1 == 0xFF) && (c2 == 0xFE))
1306 *mask |= CATEGORY_MASK_UTF_16_LE; 1366 {
1367 detect_info->found |= CATEGORY_MASK_UTF_16_LE;
1368 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1369 }
1307 else if ((c1 == 0xFE) && (c2 == 0xFF)) 1370 else if ((c1 == 0xFE) && (c2 == 0xFF))
1308 *mask |= CATEGORY_MASK_UTF_16_BE; 1371 {
1309 else 1372 detect_info->found |= CATEGORY_MASK_UTF_16_BE;
1310 *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG; 1373 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1374 }
1375 no_more_source:
1311 return 1; 1376 return 1;
1312
1313 no_more_source:
1314 return 0;
1315 } 1377 }
1316 1378
1317 static void 1379 static void
1318 decode_coding_utf_16 (coding) 1380 decode_coding_utf_16 (coding)
1319 struct coding_system *coding; 1381 struct coding_system *coding;
1557 */ 1619 */
1558 1620
1559 char emacs_mule_bytes[256]; 1621 char emacs_mule_bytes[256];
1560 1622
1561 int 1623 int
1562 emacs_mule_char (coding, src, nbytes, nchars) 1624 emacs_mule_char (coding, src, nbytes, nchars, id)
1563 struct coding_system *coding; 1625 struct coding_system *coding;
1564 unsigned char *src; 1626 unsigned char *src;
1565 int *nbytes, *nchars; 1627 int *nbytes, *nchars, *id;
1566 { 1628 {
1567 unsigned char *src_end = coding->source + coding->src_bytes; 1629 unsigned char *src_end = coding->source + coding->src_bytes;
1568 int multibytep = coding->src_multibyte; 1630 int multibytep = coding->src_multibyte;
1569 unsigned char *src_base = src; 1631 unsigned char *src_base = src;
1570 struct charset *charset; 1632 struct charset *charset;
1625 c = DECODE_CHAR (charset, code); 1687 c = DECODE_CHAR (charset, code);
1626 if (c < 0) 1688 if (c < 0)
1627 goto invalid_code; 1689 goto invalid_code;
1628 *nbytes = src - src_base; 1690 *nbytes = src - src_base;
1629 *nchars = consumed_chars; 1691 *nchars = consumed_chars;
1692 if (id)
1693 *id = charset->id;
1630 return c; 1694 return c;
1631 1695
1632 no_more_source: 1696 no_more_source:
1633 return -2; 1697 return -2;
1634 1698
1636 return -1; 1700 return -1;
1637 } 1701 }
1638 1702
1639 1703
1640 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 1704 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1641 Check if a text is encoded in `emacs-mule'. */ 1705 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1706 else return 0. */
1642 1707
1643 static int 1708 static int
1644 detect_coding_emacs_mule (coding, mask) 1709 detect_coding_emacs_mule (coding, detect_info)
1645 struct coding_system *coding; 1710 struct coding_system *coding;
1646 int *mask; 1711 struct coding_detection_info *detect_info;
1647 { 1712 {
1648 unsigned char *src = coding->source, *src_base = src; 1713 unsigned char *src = coding->source, *src_base = src;
1649 unsigned char *src_end = coding->source + coding->src_bytes; 1714 unsigned char *src_end = coding->source + coding->src_bytes;
1650 int multibytep = coding->src_multibyte; 1715 int multibytep = coding->src_multibyte;
1651 int consumed_chars = 0; 1716 int consumed_chars = 0;
1652 int c; 1717 int c;
1653 int found = 0; 1718 int found = 0;
1654 int incomplete; 1719 int incomplete;
1655 1720
1721 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1656 /* A coding system of this category is always ASCII compatible. */ 1722 /* A coding system of this category is always ASCII compatible. */
1657 src += coding->head_ascii; 1723 src += coding->head_ascii;
1658 1724
1659 while (1) 1725 while (1)
1660 { 1726 {
1678 } 1744 }
1679 while (c >= 0xA0); 1745 while (c >= 0xA0);
1680 1746
1681 if (src - src_base <= 4) 1747 if (src - src_base <= 4)
1682 break; 1748 break;
1683 found = 1; 1749 found = CATEGORY_MASK_EMACS_MULE;
1684 if (c == 0x80) 1750 if (c == 0x80)
1685 goto repeat; 1751 goto repeat;
1686 } 1752 }
1687 1753
1688 if (c < 0x80) 1754 if (c < 0x80)
1700 ONE_MORE_BYTE (c); 1766 ONE_MORE_BYTE (c);
1701 } 1767 }
1702 while (c >= 0xA0); 1768 while (c >= 0xA0);
1703 if (src - src_base != emacs_mule_bytes[*src_base]) 1769 if (src - src_base != emacs_mule_bytes[*src_base])
1704 break; 1770 break;
1705 found = 1; 1771 found = CATEGORY_MASK_EMACS_MULE;
1706 } 1772 }
1707 } 1773 }
1708 *mask &= ~CATEGORY_MASK_EMACS_MULE; 1774 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1709 return 0; 1775 return 0;
1710 1776
1711 no_more_source: 1777 no_more_source:
1712 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 1778 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1713 { 1779 {
1714 *mask &= ~CATEGORY_MASK_EMACS_MULE; 1780 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1715 return 0; 1781 return 0;
1716 } 1782 }
1717 return found; 1783 detect_info->found |= found;
1784 return 1;
1718 } 1785 }
1719 1786
1720 1787
1721 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ 1788 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1722 1789
1733 int c; \ 1800 int c; \
1734 int nbytes, nchars; \ 1801 int nbytes, nchars; \
1735 \ 1802 \
1736 if (src == src_end) \ 1803 if (src == src_end) \
1737 break; \ 1804 break; \
1738 c = emacs_mule_char (coding, src, &nbytes, &nchars); \ 1805 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1739 if (c < 0) \ 1806 if (c < 0) \
1740 { \ 1807 { \
1741 if (c == -2) \ 1808 if (c == -2) \
1742 break; \ 1809 break; \
1743 goto invalid_code; \ 1810 goto invalid_code; \
1790 goto invalid_code; \ 1857 goto invalid_code; \
1791 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \ 1858 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1792 } while (0) 1859 } while (0)
1793 1860
1794 1861
1795 #define ADD_COMPOSITION_DATA(buf, method, nchars) \
1796 do { \
1797 *buf++ = -5; \
1798 *buf++ = coding->produced_char + char_offset; \
1799 *buf++ = CODING_ANNOTATE_COMPOSITION_MASK; \
1800 *buf++ = method; \
1801 *buf++ = nchars; \
1802 } while (0)
1803
1804
1805 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \ 1862 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1806 do { \ 1863 do { \
1807 /* Emacs 21 style format. The first three bytes at SRC are \ 1864 /* Emacs 21 style format. The first three bytes at SRC are \
1808 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \ 1865 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1809 the byte length of this composition information, CHARS is the \ 1866 the byte length of this composition information, CHARS is the \
1810 number of characters composed by this composition. */ \ 1867 number of characters composed by this composition. */ \
1811 enum composition_method method = c - 0xF2; \ 1868 enum composition_method method = c - 0xF2; \
1812 int *charbuf_base = charbuf; \ 1869 int *charbuf_base = charbuf; \
1870 int from, to; \
1813 int consumed_chars_limit; \ 1871 int consumed_chars_limit; \
1814 int nbytes, nchars; \ 1872 int nbytes, nchars; \
1815 \ 1873 \
1816 ONE_MORE_BYTE (c); \ 1874 ONE_MORE_BYTE (c); \
1817 nbytes = c - 0xA0; \ 1875 nbytes = c - 0xA0; \
1818 if (nbytes < 3) \ 1876 if (nbytes < 3) \
1819 goto invalid_code; \ 1877 goto invalid_code; \
1820 ONE_MORE_BYTE (c); \ 1878 ONE_MORE_BYTE (c); \
1821 nchars = c - 0xA0; \ 1879 nchars = c - 0xA0; \
1822 ADD_COMPOSITION_DATA (charbuf, method, nchars); \ 1880 from = coding->produced + char_offset; \
1881 to = from + nchars; \
1882 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1823 consumed_chars_limit = consumed_chars_base + nbytes; \ 1883 consumed_chars_limit = consumed_chars_base + nbytes; \
1824 if (method != COMPOSITION_RELATIVE) \ 1884 if (method != COMPOSITION_RELATIVE) \
1825 { \ 1885 { \
1826 int i = 0; \ 1886 int i = 0; \
1827 while (consumed_chars < consumed_chars_limit) \ 1887 while (consumed_chars < consumed_chars_limit) \
1841 1901
1842 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \ 1902 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1843 do { \ 1903 do { \
1844 /* Emacs 20 style format for relative composition. */ \ 1904 /* Emacs 20 style format for relative composition. */ \
1845 /* Store multibyte form of characters to be composed. */ \ 1905 /* Store multibyte form of characters to be composed. */ \
1906 enum composition_method method = COMPOSITION_RELATIVE; \
1846 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ 1907 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1847 int *buf = components; \ 1908 int *buf = components; \
1848 int i, j; \ 1909 int i, j; \
1910 int from, to; \
1849 \ 1911 \
1850 src = src_base; \ 1912 src = src_base; \
1851 ONE_MORE_BYTE (c); /* skip 0x80 */ \ 1913 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1852 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ 1914 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1853 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ 1915 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1854 if (i < 2) \ 1916 if (i < 2) \
1855 goto invalid_code; \ 1917 goto invalid_code; \
1856 ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i); \ 1918 from = coding->produced_char + char_offset; \
1919 to = from + i; \
1920 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1857 for (j = 0; j < i; j++) \ 1921 for (j = 0; j < i; j++) \
1858 *charbuf++ = components[j]; \ 1922 *charbuf++ = components[j]; \
1859 } while (0) 1923 } while (0)
1860 1924
1861 1925
1862 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \ 1926 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1863 do { \ 1927 do { \
1864 /* Emacs 20 style format for rule-base composition. */ \ 1928 /* Emacs 20 style format for rule-base composition. */ \
1865 /* Store multibyte form of characters to be composed. */ \ 1929 /* Store multibyte form of characters to be composed. */ \
1930 enum composition_method method = COMPOSITION_WITH_RULE; \
1866 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \ 1931 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1867 int *buf = components; \ 1932 int *buf = components; \
1868 int i, j; \ 1933 int i, j; \
1934 int from, to; \
1869 \ 1935 \
1870 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \ 1936 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1871 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \ 1937 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1872 { \ 1938 { \
1873 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \ 1939 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1875 } \ 1941 } \
1876 if (i < 1 || (buf - components) % 2 == 0) \ 1942 if (i < 1 || (buf - components) % 2 == 0) \
1877 goto invalid_code; \ 1943 goto invalid_code; \
1878 if (charbuf + i + (i / 2) + 1 < charbuf_end) \ 1944 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1879 goto no_more_source; \ 1945 goto no_more_source; \
1880 ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i); \ 1946 from = coding->produced_char + char_offset; \
1947 to = from + i; \
1948 ADD_COMPOSITION_DATA (buf, from, to, method); \
1881 for (j = 0; j < i; j++) \ 1949 for (j = 0; j < i; j++) \
1882 *charbuf++ = components[j]; \ 1950 *charbuf++ = components[j]; \
1883 for (j = 0; j < i; j += 2) \ 1951 for (j = 0; j < i; j += 2) \
1884 *charbuf++ = components[j]; \ 1952 *charbuf++ = components[j]; \
1885 } while (0) 1953 } while (0)
1891 { 1959 {
1892 unsigned char *src = coding->source + coding->consumed; 1960 unsigned char *src = coding->source + coding->consumed;
1893 unsigned char *src_end = coding->source + coding->src_bytes; 1961 unsigned char *src_end = coding->source + coding->src_bytes;
1894 unsigned char *src_base; 1962 unsigned char *src_base;
1895 int *charbuf = coding->charbuf; 1963 int *charbuf = coding->charbuf;
1896 int *charbuf_end = charbuf + coding->charbuf_size; 1964 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1897 int consumed_chars = 0, consumed_chars_base; 1965 int consumed_chars = 0, consumed_chars_base;
1898 int char_offset = 0;
1899 int multibytep = coding->src_multibyte; 1966 int multibytep = coding->src_multibyte;
1900 Lisp_Object attrs, eol_type, charset_list; 1967 Lisp_Object attrs, eol_type, charset_list;
1968 int char_offset = coding->produced_char;
1969 int last_offset = char_offset;
1970 int last_id = charset_ascii;
1901 1971
1902 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 1972 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1903 1973
1904 while (1) 1974 while (1)
1905 { 1975 {
1933 *charbuf++ = c; 2003 *charbuf++ = c;
1934 char_offset++; 2004 char_offset++;
1935 } 2005 }
1936 else if (c == 0x80) 2006 else if (c == 0x80)
1937 { 2007 {
1938 if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1939 break;
1940 ONE_MORE_BYTE (c); 2008 ONE_MORE_BYTE (c);
1941 if (c - 0xF2 >= COMPOSITION_RELATIVE 2009 if (c - 0xF2 >= COMPOSITION_RELATIVE
1942 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS) 2010 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1943 DECODE_EMACS_MULE_21_COMPOSITION (c); 2011 DECODE_EMACS_MULE_21_COMPOSITION (c);
1944 else if (c < 0xC0) 2012 else if (c < 0xC0)
1945 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c); 2013 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1946 else if (c == 0xFF) 2014 else if (c == 0xFF)
1947 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c); 2015 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1948 else 2016 else
1949 goto invalid_code; 2017 goto invalid_code;
1950 coding->annotated = 1;
1951 } 2018 }
1952 else if (c < 0xA0 && emacs_mule_bytes[c] > 1) 2019 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1953 { 2020 {
1954 int nbytes, nchars; 2021 int nbytes, nchars;
2022 int id;
2023
1955 src = src_base; 2024 src = src_base;
1956 consumed_chars = consumed_chars_base; 2025 consumed_chars = consumed_chars_base;
1957 c = emacs_mule_char (coding, src, &nbytes, &nchars); 2026 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1958 if (c < 0) 2027 if (c < 0)
1959 { 2028 {
1960 if (c == -2) 2029 if (c == -2)
1961 break; 2030 break;
1962 goto invalid_code; 2031 goto invalid_code;
1963 } 2032 }
2033 if (last_id != id)
2034 {
2035 if (last_id != charset_ascii)
2036 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2037 last_id = id;
2038 last_offset = char_offset;
2039 }
1964 *charbuf++ = c; 2040 *charbuf++ = c;
1965 src += nbytes; 2041 src += nbytes;
1966 consumed_chars += nchars; 2042 consumed_chars += nchars;
1967 char_offset++; 2043 char_offset++;
1968 } 2044 }
1971 invalid_code: 2047 invalid_code:
1972 src = src_base; 2048 src = src_base;
1973 consumed_chars = consumed_chars_base; 2049 consumed_chars = consumed_chars_base;
1974 ONE_MORE_BYTE (c); 2050 ONE_MORE_BYTE (c);
1975 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 2051 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2052 char_offset++;
1976 coding->errors++; 2053 coding->errors++;
1977 } 2054 }
1978 2055
1979 no_more_source: 2056 no_more_source:
2057 if (last_id != charset_ascii)
2058 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
1980 coding->consumed_char += consumed_chars_base; 2059 coding->consumed_char += consumed_chars_base;
1981 coding->consumed = src_base - coding->source; 2060 coding->consumed = src_base - coding->source;
1982 coding->charbuf_used = charbuf - coding->charbuf; 2061 coding->charbuf_used = charbuf - coding->charbuf;
1983 } 2062 }
1984 2063
2009 unsigned char *dst_end = coding->destination + coding->dst_bytes; 2088 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2010 int safe_room = 8; 2089 int safe_room = 8;
2011 int produced_chars = 0; 2090 int produced_chars = 0;
2012 Lisp_Object attrs, eol_type, charset_list; 2091 Lisp_Object attrs, eol_type, charset_list;
2013 int c; 2092 int c;
2093 int preferred_charset_id = -1;
2014 2094
2015 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 2095 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2016 2096
2017 while (charbuf < charbuf_end) 2097 while (charbuf < charbuf_end)
2018 { 2098 {
2019 ASSURE_DESTINATION (safe_room); 2099 ASSURE_DESTINATION (safe_room);
2020 c = *charbuf++; 2100 c = *charbuf++;
2101
2102 if (c < 0)
2103 {
2104 /* Handle an annotation. */
2105 switch (*charbuf)
2106 {
2107 case CODING_ANNOTATE_COMPOSITION_MASK:
2108 /* Not yet implemented. */
2109 break;
2110 case CODING_ANNOTATE_CHARSET_MASK:
2111 preferred_charset_id = charbuf[3];
2112 if (preferred_charset_id >= 0
2113 && NILP (Fmemq (make_number (preferred_charset_id),
2114 charset_list)))
2115 preferred_charset_id = -1;
2116 break;
2117 default:
2118 abort ();
2119 }
2120 charbuf += -c - 1;
2121 continue;
2122 }
2123
2021 if (ASCII_CHAR_P (c)) 2124 if (ASCII_CHAR_P (c))
2022 EMIT_ONE_ASCII_BYTE (c); 2125 EMIT_ONE_ASCII_BYTE (c);
2023 else if (CHAR_BYTE8_P (c)) 2126 else if (CHAR_BYTE8_P (c))
2024 { 2127 {
2025 c = CHAR_TO_BYTE8 (c); 2128 c = CHAR_TO_BYTE8 (c);
2031 unsigned code; 2134 unsigned code;
2032 int dimension; 2135 int dimension;
2033 int emacs_mule_id; 2136 int emacs_mule_id;
2034 unsigned char leading_codes[2]; 2137 unsigned char leading_codes[2];
2035 2138
2036 charset = char_charset (c, charset_list, &code); 2139 if (preferred_charset_id >= 0)
2140 {
2141 charset = CHARSET_FROM_ID (preferred_charset_id);
2142 if (! CHAR_CHARSET_P (c, charset))
2143 charset = char_charset (c, charset_list, NULL);
2144 }
2145 else
2146 charset = char_charset (c, charset_list, &code);
2037 if (! charset) 2147 if (! charset)
2038 { 2148 {
2039 c = coding->default_char; 2149 c = coding->default_char;
2040 if (ASCII_CHAR_P (c)) 2150 if (ASCII_CHAR_P (c))
2041 { 2151 {
2317 ASET (attrs, coding_attr_safe_charsets, safe_charsets); 2427 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2318 } 2428 }
2319 2429
2320 2430
2321 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 2431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2322 Check if a text is encoded in ISO2022. If it is, returns an 2432 Check if a text is encoded in one of ISO-2022 based coding systems.
2323 integer in which appropriate flag bits any of: 2433 If it is, return 1, else return 0. */
2324 CATEGORY_MASK_ISO_7
2325 CATEGORY_MASK_ISO_7_TIGHT
2326 CATEGORY_MASK_ISO_8_1
2327 CATEGORY_MASK_ISO_8_2
2328 CATEGORY_MASK_ISO_7_ELSE
2329 CATEGORY_MASK_ISO_8_ELSE
2330 are set. If a code which should never appear in ISO2022 is found,
2331 returns 0. */
2332 2434
2333 static int 2435 static int
2334 detect_coding_iso_2022 (coding, mask) 2436 detect_coding_iso_2022 (coding, detect_info)
2335 struct coding_system *coding; 2437 struct coding_system *coding;
2336 int *mask; 2438 struct coding_detection_info *detect_info;
2337 { 2439 {
2338 unsigned char *src = coding->source, *src_base = src; 2440 unsigned char *src = coding->source, *src_base = src;
2339 unsigned char *src_end = coding->source + coding->src_bytes; 2441 unsigned char *src_end = coding->source + coding->src_bytes;
2340 int multibytep = coding->src_multibyte; 2442 int multibytep = coding->src_multibyte;
2341 int mask_iso = CATEGORY_MASK_ISO; 2443 int single_shifting = 0;
2342 int mask_found = 0, mask_8bit_found = 0;
2343 int reg[4], shift_out = 0, single_shifting = 0;
2344 int id; 2444 int id;
2345 int c, c1; 2445 int c, c1;
2346 int consumed_chars = 0; 2446 int consumed_chars = 0;
2347 int i; 2447 int i;
2448 int rejected = 0;
2449 int found = 0;
2450
2451 detect_info->checked |= CATEGORY_MASK_ISO;
2348 2452
2349 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++) 2453 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2350 { 2454 {
2351 struct coding_system *this = &(coding_categories[i]); 2455 struct coding_system *this = &(coding_categories[i]);
2352 Lisp_Object attrs, val; 2456 Lisp_Object attrs, val;
2361 } 2465 }
2362 2466
2363 /* A coding system of this category is always ASCII compatible. */ 2467 /* A coding system of this category is always ASCII compatible. */
2364 src += coding->head_ascii; 2468 src += coding->head_ascii;
2365 2469
2366 reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1; 2470 while (rejected != CATEGORY_MASK_ISO)
2367 while (mask_iso && src < src_end)
2368 { 2471 {
2369 ONE_MORE_BYTE (c); 2472 ONE_MORE_BYTE (c);
2370 switch (c) 2473 switch (c)
2371 { 2474 {
2372 case ISO_CODE_ESC: 2475 case ISO_CODE_ESC:
2380 ONE_MORE_BYTE (c1); 2483 ONE_MORE_BYTE (c1);
2381 if (c1 < ' ' || c1 >= 0x80 2484 if (c1 < ' ' || c1 >= 0x80
2382 || (id = iso_charset_table[0][c >= ','][c1]) < 0) 2485 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2383 /* Invalid designation sequence. Just ignore. */ 2486 /* Invalid designation sequence. Just ignore. */
2384 break; 2487 break;
2385 reg[(c - '(') % 4] = id;
2386 } 2488 }
2387 else if (c == '$') 2489 else if (c == '$')
2388 { 2490 {
2389 /* Designation sequence for a charset of dimension 2. */ 2491 /* Designation sequence for a charset of dimension 2. */
2390 ONE_MORE_BYTE (c); 2492 ONE_MORE_BYTE (c);
2391 if (c >= '@' && c <= 'B') 2493 if (c >= '@' && c <= 'B')
2392 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ 2494 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2393 reg[0] = id = iso_charset_table[1][0][c]; 2495 id = iso_charset_table[1][0][c];
2394 else if (c >= '(' && c <= '/') 2496 else if (c >= '(' && c <= '/')
2395 { 2497 {
2396 ONE_MORE_BYTE (c1); 2498 ONE_MORE_BYTE (c1);
2397 if (c1 < ' ' || c1 >= 0x80 2499 if (c1 < ' ' || c1 >= 0x80
2398 || (id = iso_charset_table[1][c >= ','][c1]) < 0) 2500 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2399 /* Invalid designation sequence. Just ignore. */ 2501 /* Invalid designation sequence. Just ignore. */
2400 break; 2502 break;
2401 reg[(c - '(') % 4] = id;
2402 } 2503 }
2403 else 2504 else
2404 /* Invalid designation sequence. Just ignore. */ 2505 /* Invalid designation sequence. Just ignore it. */
2405 break; 2506 break;
2406 } 2507 }
2407 else if (c == 'N' || c == 'O') 2508 else if (c == 'N' || c == 'O')
2408 { 2509 {
2409 /* ESC <Fe> for SS2 or SS3. */ 2510 /* ESC <Fe> for SS2 or SS3. */
2410 mask_iso &= CATEGORY_MASK_ISO_7_ELSE; 2511 single_shifting = 1;
2512 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2411 break; 2513 break;
2412 } 2514 }
2413 else if (c >= '0' && c <= '4') 2515 else if (c >= '0' && c <= '4')
2414 { 2516 {
2415 /* ESC <Fp> for start/end composition. */ 2517 /* ESC <Fp> for start/end composition. */
2416 mask_found |= CATEGORY_MASK_ISO; 2518 found |= CATEGORY_MASK_ISO;
2417 break; 2519 break;
2418 } 2520 }
2419 else 2521 else
2420 { 2522 {
2421 /* Invalid escape sequence. */ 2523 /* Invalid escape sequence. Just ignore it. */
2422 mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2423 break; 2524 break;
2424 } 2525 }
2425 2526
2426 /* We found a valid designation sequence for CHARSET. */ 2527 /* We found a valid designation sequence for CHARSET. */
2427 mask_iso &= ~CATEGORY_MASK_ISO_8BIT; 2528 rejected |= CATEGORY_MASK_ISO_8BIT;
2428 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7], 2529 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2429 id)) 2530 id))
2430 mask_found |= CATEGORY_MASK_ISO_7; 2531 found |= CATEGORY_MASK_ISO_7;
2431 else 2532 else
2432 mask_iso &= ~CATEGORY_MASK_ISO_7; 2533 rejected |= CATEGORY_MASK_ISO_7;
2433 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight], 2534 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2434 id)) 2535 id))
2435 mask_found |= CATEGORY_MASK_ISO_7_TIGHT; 2536 found |= CATEGORY_MASK_ISO_7_TIGHT;
2436 else 2537 else
2437 mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT; 2538 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2438 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else], 2539 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2439 id)) 2540 id))
2440 mask_found |= CATEGORY_MASK_ISO_7_ELSE; 2541 found |= CATEGORY_MASK_ISO_7_ELSE;
2441 else 2542 else
2442 mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE; 2543 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2443 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else], 2544 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2444 id)) 2545 id))
2445 mask_found |= CATEGORY_MASK_ISO_8_ELSE; 2546 found |= CATEGORY_MASK_ISO_8_ELSE;
2446 else 2547 else
2447 mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE; 2548 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2448 break; 2549 break;
2449 2550
2450 case ISO_CODE_SO: 2551 case ISO_CODE_SO:
2552 case ISO_CODE_SI:
2553 /* Locking shift out/in. */
2451 if (inhibit_iso_escape_detection) 2554 if (inhibit_iso_escape_detection)
2452 break; 2555 break;
2453 single_shifting = 0; 2556 single_shifting = 0;
2454 if (shift_out == 0 2557 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2455 && (reg[1] >= 0 2558 found |= CATEGORY_MASK_ISO_ELSE;
2456 || SHIFT_OUT_OK (coding_category_iso_7_else)
2457 || SHIFT_OUT_OK (coding_category_iso_8_else)))
2458 {
2459 /* Locking shift out. */
2460 mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2461 mask_found |= CATEGORY_MASK_ISO_ELSE;
2462 }
2463 break; 2559 break;
2464 2560
2465 case ISO_CODE_SI: 2561 case ISO_CODE_CSI:
2562 /* Control sequence introducer. */
2563 single_shifting = 0;
2564 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2565 found |= CATEGORY_MASK_ISO_8_ELSE;
2566 goto check_extra_latin;
2567
2568
2569 case ISO_CODE_SS2:
2570 case ISO_CODE_SS3:
2571 /* Single shift. */
2466 if (inhibit_iso_escape_detection) 2572 if (inhibit_iso_escape_detection)
2467 break; 2573 break;
2468 single_shifting = 0; 2574 single_shifting = 1;
2469 if (shift_out == 1) 2575 rejected |= CATEGORY_MASK_ISO_7BIT;
2470 { 2576 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2471 /* Locking shift in. */ 2577 & CODING_ISO_FLAG_SINGLE_SHIFT)
2472 mask_iso &= ~CATEGORY_MASK_ISO_7BIT; 2578 found |= CATEGORY_MASK_ISO_8_1;
2473 mask_found |= CATEGORY_MASK_ISO_ELSE; 2579 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2474 } 2580 & CODING_ISO_FLAG_SINGLE_SHIFT)
2475 break; 2581 found |= CATEGORY_MASK_ISO_8_2;
2476 2582 goto check_extra_latin;
2477 case ISO_CODE_CSI:
2478 single_shifting = 0;
2479 case ISO_CODE_SS2:
2480 case ISO_CODE_SS3:
2481 {
2482 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2483
2484 mask_8bit_found = 1;
2485 if (inhibit_iso_escape_detection)
2486 break;
2487 if (c != ISO_CODE_CSI)
2488 {
2489 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2490 & CODING_ISO_FLAG_SINGLE_SHIFT)
2491 newmask |= CATEGORY_MASK_ISO_8_1;
2492 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2493 & CODING_ISO_FLAG_SINGLE_SHIFT)
2494 newmask |= CATEGORY_MASK_ISO_8_2;
2495 single_shifting = 1;
2496 }
2497 if (VECTORP (Vlatin_extra_code_table)
2498 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2499 {
2500 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2501 & CODING_ISO_FLAG_LATIN_EXTRA)
2502 newmask |= CATEGORY_MASK_ISO_8_1;
2503 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2504 & CODING_ISO_FLAG_LATIN_EXTRA)
2505 newmask |= CATEGORY_MASK_ISO_8_2;
2506 }
2507 mask_iso &= newmask;
2508 mask_found |= newmask;
2509 }
2510 break;
2511 2583
2512 default: 2584 default:
2513 if (c < 0x80) 2585 if (c < 0x80)
2514 { 2586 {
2515 single_shifting = 0; 2587 single_shifting = 0;
2516 break; 2588 break;
2517 } 2589 }
2518 else if (c < 0xA0) 2590 if (c >= 0xA0)
2519 { 2591 {
2520 single_shifting = 0; 2592 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2521 mask_8bit_found = 1; 2593 found |= CATEGORY_MASK_ISO_8_1;
2522 if (VECTORP (Vlatin_extra_code_table)
2523 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2524 {
2525 int newmask = 0;
2526
2527 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2528 & CODING_ISO_FLAG_LATIN_EXTRA)
2529 newmask |= CATEGORY_MASK_ISO_8_1;
2530 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2531 & CODING_ISO_FLAG_LATIN_EXTRA)
2532 newmask |= CATEGORY_MASK_ISO_8_2;
2533 mask_iso &= newmask;
2534 mask_found |= newmask;
2535 }
2536 else
2537 return 0;
2538 }
2539 else
2540 {
2541 mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2542 | CATEGORY_MASK_ISO_7_ELSE);
2543 mask_found |= CATEGORY_MASK_ISO_8_1;
2544 mask_8bit_found = 1;
2545 /* Check the length of succeeding codes of the range 2594 /* Check the length of succeeding codes of the range
2546 0xA0..0xFF. If the byte length is odd, we exclude 2595 0xA0..0xFF. If the byte length is even, we include
2547 CATEGORY_MASK_ISO_8_2. We can check this only 2596 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2548 when we are not single shifting. */ 2597 only when we are not single shifting. */
2549 if (!single_shifting 2598 if (! single_shifting
2550 && mask_iso & CATEGORY_MASK_ISO_8_2) 2599 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2551 { 2600 {
2552 int i = 1; 2601 int i = 1;
2553 while (src < src_end) 2602 while (src < src_end)
2554 { 2603 {
2555 ONE_MORE_BYTE (c); 2604 ONE_MORE_BYTE (c);
2557 break; 2606 break;
2558 i++; 2607 i++;
2559 } 2608 }
2560 2609
2561 if (i & 1 && src < src_end) 2610 if (i & 1 && src < src_end)
2562 mask_iso &= ~CATEGORY_MASK_ISO_8_2; 2611 rejected |= CATEGORY_MASK_ISO_8_2;
2563 else 2612 else
2564 mask_found |= CATEGORY_MASK_ISO_8_2; 2613 found |= CATEGORY_MASK_ISO_8_2;
2565 } 2614 }
2615 break;
2566 } 2616 }
2567 break; 2617 check_extra_latin:
2568 } 2618 single_shifting = 0;
2569 } 2619 if (! VECTORP (Vlatin_extra_code_table)
2620 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2621 {
2622 rejected = CATEGORY_MASK_ISO;
2623 break;
2624 }
2625 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2626 & CODING_ISO_FLAG_LATIN_EXTRA)
2627 found |= CATEGORY_MASK_ISO_8_1;
2628 else
2629 rejected |= CATEGORY_MASK_ISO_8_1;
2630 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2631 & CODING_ISO_FLAG_LATIN_EXTRA)
2632 found |= CATEGORY_MASK_ISO_8_2;
2633 else
2634 rejected |= CATEGORY_MASK_ISO_8_2;
2635 }
2636 }
2637 detect_info->rejected |= CATEGORY_MASK_ISO;
2638 return 0;
2639
2570 no_more_source: 2640 no_more_source:
2571 if (!mask_iso) 2641 detect_info->rejected |= rejected;
2572 { 2642 detect_info->found |= (found & ~rejected);
2573 *mask &= ~CATEGORY_MASK_ISO;
2574 return 0;
2575 }
2576 if (!mask_found)
2577 return 0;
2578 *mask &= ~CATEGORY_MASK_ISO;
2579 *mask |= mask_iso & mask_found;
2580 if (! mask_8bit_found)
2581 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2582 return 1; 2643 return 1;
2583 } 2644 }
2584 2645
2585 2646
2586 /* Set designation state into CODING. */ 2647 /* Set designation state into CODING. */
2692 int nchars = (component_len > 0 ? component_idx - component_len \ 2753 int nchars = (component_len > 0 ? component_idx - component_len \
2693 : method == COMPOSITION_RELATIVE ? component_idx \ 2754 : method == COMPOSITION_RELATIVE ? component_idx \
2694 : (component_idx + 1) / 2); \ 2755 : (component_idx + 1) / 2); \
2695 int i; \ 2756 int i; \
2696 int *saved_charbuf = charbuf; \ 2757 int *saved_charbuf = charbuf; \
2758 int from = coding->produced_char + char_offset; \
2759 int to = from + nchars; \
2697 \ 2760 \
2698 ADD_COMPOSITION_DATA (charbuf, method, nchars); \ 2761 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2699 if (method != COMPOSITION_RELATIVE) \ 2762 if (method != COMPOSITION_RELATIVE) \
2700 { \ 2763 { \
2701 if (component_len == 0) \ 2764 if (component_len == 0) \
2702 for (i = 0; i < component_idx; i++) \ 2765 for (i = 0; i < component_idx; i++) \
2703 *charbuf++ = components[i]; \ 2766 *charbuf++ = components[i]; \
2750 { 2813 {
2751 unsigned char *src = coding->source + coding->consumed; 2814 unsigned char *src = coding->source + coding->consumed;
2752 unsigned char *src_end = coding->source + coding->src_bytes; 2815 unsigned char *src_end = coding->source + coding->src_bytes;
2753 unsigned char *src_base; 2816 unsigned char *src_base;
2754 int *charbuf = coding->charbuf; 2817 int *charbuf = coding->charbuf;
2755 int *charbuf_end = charbuf + coding->charbuf_size - 4; 2818 int *charbuf_end
2819 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2756 int consumed_chars = 0, consumed_chars_base; 2820 int consumed_chars = 0, consumed_chars_base;
2757 int char_offset = 0;
2758 int multibytep = coding->src_multibyte; 2821 int multibytep = coding->src_multibyte;
2759 /* Charsets invoked to graphic plane 0 and 1 respectively. */ 2822 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2760 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0); 2823 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2761 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); 2824 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2762 struct charset *charset; 2825 struct charset *charset;
2772 enum composition_method method; 2835 enum composition_method method;
2773 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1]; 2836 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2774 int component_idx; 2837 int component_idx;
2775 int component_len; 2838 int component_len;
2776 Lisp_Object attrs, eol_type, charset_list; 2839 Lisp_Object attrs, eol_type, charset_list;
2840 int char_offset = coding->produced_char;
2841 int last_offset = char_offset;
2842 int last_id = charset_ascii;
2777 2843
2778 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 2844 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2779 setup_iso_safe_charsets (attrs); 2845 setup_iso_safe_charsets (attrs);
2780 2846
2781 while (1) 2847 while (1)
3049 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1); 3115 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3050 continue; 3116 continue;
3051 } 3117 }
3052 } 3118 }
3053 3119
3120 if (charset->id != charset_ascii
3121 && last_id != charset->id)
3122 {
3123 if (last_id != charset_ascii)
3124 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3125 last_id = charset->id;
3126 last_offset = char_offset;
3127 }
3128
3054 /* Now we know CHARSET and 1st position code C1 of a character. 3129 /* Now we know CHARSET and 1st position code C1 of a character.
3055 Produce a decoded character while getting 2nd position code 3130 Produce a decoded character while getting 2nd position code
3056 C2 if necessary. */ 3131 C2 if necessary. */
3057 c1 &= 0x7F; 3132 c1 &= 0x7F;
3058 if (CHARSET_DIMENSION (charset) > 1) 3133 if (CHARSET_DIMENSION (charset) > 1)
3080 { 3155 {
3081 if (ASCII_BYTE_P (*src_base)) 3156 if (ASCII_BYTE_P (*src_base))
3082 *charbuf++ = *src_base; 3157 *charbuf++ = *src_base;
3083 else 3158 else
3084 *charbuf++ = BYTE8_TO_CHAR (*src_base); 3159 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3160 char_offset++;
3085 } 3161 }
3086 } 3162 }
3087 else if (composition_state == COMPOSING_NO) 3163 else if (composition_state == COMPOSING_NO)
3088 { 3164 {
3089 *charbuf++ = c; 3165 *charbuf++ = c;
3103 MAYBE_FINISH_COMPOSITION (); 3179 MAYBE_FINISH_COMPOSITION ();
3104 src = src_base; 3180 src = src_base;
3105 consumed_chars = consumed_chars_base; 3181 consumed_chars = consumed_chars_base;
3106 ONE_MORE_BYTE (c); 3182 ONE_MORE_BYTE (c);
3107 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3183 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3184 char_offset++;
3108 coding->errors++; 3185 coding->errors++;
3109 } 3186 }
3110 3187
3111 no_more_source: 3188 no_more_source:
3189 if (last_id != charset_ascii)
3190 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3112 coding->consumed_char += consumed_chars_base; 3191 coding->consumed_char += consumed_chars_base;
3113 coding->consumed = src_base - coding->source; 3192 coding->consumed = src_base - coding->source;
3114 coding->charbuf_used = charbuf - coding->charbuf; 3193 coding->charbuf_used = charbuf - coding->charbuf;
3115 } 3194 }
3116 3195
3528 && CODING_ISO_BOL (coding)); 3607 && CODING_ISO_BOL (coding));
3529 int produced_chars = 0; 3608 int produced_chars = 0;
3530 Lisp_Object attrs, eol_type, charset_list; 3609 Lisp_Object attrs, eol_type, charset_list;
3531 int ascii_compatible; 3610 int ascii_compatible;
3532 int c; 3611 int c;
3612 int preferred_charset_id = -1;
3533 3613
3534 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 3614 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3535 setup_iso_safe_charsets (attrs); 3615 setup_iso_safe_charsets (attrs);
3616 /* Charset list may have been changed. */
3617 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3536 coding->safe_charsets 3618 coding->safe_charsets
3537 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data; 3619 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3538 3620
3539 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)); 3621 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3540 3622
3552 /* We are sure that designation sequences are all ASCII bytes. */ 3634 /* We are sure that designation sequences are all ASCII bytes. */
3553 produced_chars += dst - dst_prev; 3635 produced_chars += dst - dst_prev;
3554 } 3636 }
3555 3637
3556 c = *charbuf++; 3638 c = *charbuf++;
3639
3640 if (c < 0)
3641 {
3642 /* Handle an annotation. */
3643 switch (*charbuf)
3644 {
3645 case CODING_ANNOTATE_COMPOSITION_MASK:
3646 /* Not yet implemented. */
3647 break;
3648 case CODING_ANNOTATE_CHARSET_MASK:
3649 preferred_charset_id = charbuf[3];
3650 if (preferred_charset_id >= 0
3651 && NILP (Fmemq (make_number (preferred_charset_id),
3652 charset_list)))
3653 preferred_charset_id = -1;
3654 break;
3655 default:
3656 abort ();
3657 }
3658 charbuf += -c - 1;
3659 continue;
3660 }
3557 3661
3558 /* Now encode the character C. */ 3662 /* Now encode the character C. */
3559 if (c < 0x20 || c == 0x7F) 3663 if (c < 0x20 || c == 0x7F)
3560 { 3664 {
3561 if (c == '\n' 3665 if (c == '\n'
3593 c = CHAR_TO_BYTE8 (c); 3697 c = CHAR_TO_BYTE8 (c);
3594 EMIT_ONE_BYTE (c); 3698 EMIT_ONE_BYTE (c);
3595 } 3699 }
3596 else 3700 else
3597 { 3701 {
3598 struct charset *charset = char_charset (c, charset_list, NULL); 3702 struct charset *charset;
3599 3703
3704 if (preferred_charset_id >= 0)
3705 {
3706 charset = CHARSET_FROM_ID (preferred_charset_id);
3707 if (! CHAR_CHARSET_P (c, charset))
3708 charset = char_charset (c, charset_list, NULL);
3709 }
3710 else
3711 charset = char_charset (c, charset_list, NULL);
3600 if (!charset) 3712 if (!charset)
3601 { 3713 {
3602 if (coding->mode & CODING_MODE_SAFE_ENCODING) 3714 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3603 { 3715 {
3604 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION; 3716 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3667 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3779 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3668 Check if a text is encoded in SJIS. If it is, return 3780 Check if a text is encoded in SJIS. If it is, return
3669 CATEGORY_MASK_SJIS, else return 0. */ 3781 CATEGORY_MASK_SJIS, else return 0. */
3670 3782
3671 static int 3783 static int
3672 detect_coding_sjis (coding, mask) 3784 detect_coding_sjis (coding, detect_info)
3673 struct coding_system *coding; 3785 struct coding_system *coding;
3674 int *mask; 3786 struct coding_detection_info *detect_info;
3675 { 3787 {
3676 unsigned char *src = coding->source, *src_base = src; 3788 unsigned char *src = coding->source, *src_base = src;
3677 unsigned char *src_end = coding->source + coding->src_bytes; 3789 unsigned char *src_end = coding->source + coding->src_bytes;
3678 int multibytep = coding->src_multibyte; 3790 int multibytep = coding->src_multibyte;
3679 int consumed_chars = 0; 3791 int consumed_chars = 0;
3680 int found = 0; 3792 int found = 0;
3681 int c; 3793 int c;
3682 int incomplete; 3794 int incomplete;
3683 3795
3796 detect_info->checked |= CATEGORY_MASK_SJIS;
3684 /* A coding system of this category is always ASCII compatible. */ 3797 /* A coding system of this category is always ASCII compatible. */
3685 src += coding->head_ascii; 3798 src += coding->head_ascii;
3686 3799
3687 while (1) 3800 while (1)
3688 { 3801 {
3694 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) 3807 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3695 { 3808 {
3696 ONE_MORE_BYTE (c); 3809 ONE_MORE_BYTE (c);
3697 if (c < 0x40 || c == 0x7F || c > 0xFC) 3810 if (c < 0x40 || c == 0x7F || c > 0xFC)
3698 break; 3811 break;
3699 found = 1; 3812 found = CATEGORY_MASK_SJIS;
3700 } 3813 }
3701 else if (c >= 0xA0 && c < 0xE0) 3814 else if (c >= 0xA0 && c < 0xE0)
3702 found = 1; 3815 found = CATEGORY_MASK_SJIS;
3703 else 3816 else
3704 break; 3817 break;
3705 } 3818 }
3706 *mask &= ~CATEGORY_MASK_SJIS; 3819 detect_info->rejected |= CATEGORY_MASK_SJIS;
3707 return 0; 3820 return 0;
3708 3821
3709 no_more_source: 3822 no_more_source:
3710 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 3823 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3711 { 3824 {
3712 *mask &= ~CATEGORY_MASK_SJIS; 3825 detect_info->rejected |= CATEGORY_MASK_SJIS;
3713 return 0; 3826 return 0;
3714 } 3827 }
3715 return found; 3828 detect_info->found |= found;
3829 return 1;
3716 } 3830 }
3717 3831
3718 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3832 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3719 Check if a text is encoded in BIG5. If it is, return 3833 Check if a text is encoded in BIG5. If it is, return
3720 CATEGORY_MASK_BIG5, else return 0. */ 3834 CATEGORY_MASK_BIG5, else return 0. */
3721 3835
3722 static int 3836 static int
3723 detect_coding_big5 (coding, mask) 3837 detect_coding_big5 (coding, detect_info)
3724 struct coding_system *coding; 3838 struct coding_system *coding;
3725 int *mask; 3839 struct coding_detection_info *detect_info;
3726 { 3840 {
3727 unsigned char *src = coding->source, *src_base = src; 3841 unsigned char *src = coding->source, *src_base = src;
3728 unsigned char *src_end = coding->source + coding->src_bytes; 3842 unsigned char *src_end = coding->source + coding->src_bytes;
3729 int multibytep = coding->src_multibyte; 3843 int multibytep = coding->src_multibyte;
3730 int consumed_chars = 0; 3844 int consumed_chars = 0;
3731 int found = 0; 3845 int found = 0;
3732 int c; 3846 int c;
3733 int incomplete; 3847 int incomplete;
3734 3848
3849 detect_info->checked |= CATEGORY_MASK_BIG5;
3735 /* A coding system of this category is always ASCII compatible. */ 3850 /* A coding system of this category is always ASCII compatible. */
3736 src += coding->head_ascii; 3851 src += coding->head_ascii;
3737 3852
3738 while (1) 3853 while (1)
3739 { 3854 {
3745 if (c >= 0xA1) 3860 if (c >= 0xA1)
3746 { 3861 {
3747 ONE_MORE_BYTE (c); 3862 ONE_MORE_BYTE (c);
3748 if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) 3863 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3749 return 0; 3864 return 0;
3750 found = 1; 3865 found = CATEGORY_MASK_BIG5;
3751 } 3866 }
3752 else 3867 else
3753 break; 3868 break;
3754 } 3869 }
3755 *mask &= ~CATEGORY_MASK_BIG5; 3870 detect_info->rejected |= CATEGORY_MASK_BIG5;
3756 return 0; 3871 return 0;
3757 3872
3758 no_more_source: 3873 no_more_source:
3759 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) 3874 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3760 { 3875 {
3761 *mask &= ~CATEGORY_MASK_BIG5; 3876 detect_info->rejected |= CATEGORY_MASK_BIG5;
3762 return 0; 3877 return 0;
3763 } 3878 }
3764 return found; 3879 detect_info->found |= found;
3880 return 1;
3765 } 3881 }
3766 3882
3767 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 3883 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3768 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ 3884 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3769 3885
3773 { 3889 {
3774 unsigned char *src = coding->source + coding->consumed; 3890 unsigned char *src = coding->source + coding->consumed;
3775 unsigned char *src_end = coding->source + coding->src_bytes; 3891 unsigned char *src_end = coding->source + coding->src_bytes;
3776 unsigned char *src_base; 3892 unsigned char *src_base;
3777 int *charbuf = coding->charbuf; 3893 int *charbuf = coding->charbuf;
3778 int *charbuf_end = charbuf + coding->charbuf_size; 3894 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3779 int consumed_chars = 0, consumed_chars_base; 3895 int consumed_chars = 0, consumed_chars_base;
3780 int multibytep = coding->src_multibyte; 3896 int multibytep = coding->src_multibyte;
3781 struct charset *charset_roman, *charset_kanji, *charset_kana; 3897 struct charset *charset_roman, *charset_kanji, *charset_kana;
3782 Lisp_Object attrs, eol_type, charset_list, val; 3898 Lisp_Object attrs, eol_type, charset_list, val;
3899 int char_offset = coding->produced_char;
3900 int last_offset = char_offset;
3901 int last_id = charset_ascii;
3783 3902
3784 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 3903 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3785 3904
3786 val = charset_list; 3905 val = charset_list;
3787 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); 3906 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3840 /* SJIS -> JISX0201-Kana */ 3959 /* SJIS -> JISX0201-Kana */
3841 c &= 0x7F; 3960 c &= 0x7F;
3842 charset = charset_kana; 3961 charset = charset_kana;
3843 } 3962 }
3844 } 3963 }
3964 if (charset->id != charset_ascii
3965 && last_id != charset->id)
3966 {
3967 if (last_id != charset_ascii)
3968 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3969 last_id = charset->id;
3970 last_offset = char_offset;
3971 }
3845 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); 3972 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3846 } 3973 }
3847 *charbuf++ = c; 3974 *charbuf++ = c;
3975 char_offset++;
3848 continue; 3976 continue;
3849 3977
3850 invalid_code: 3978 invalid_code:
3851 src = src_base; 3979 src = src_base;
3852 consumed_chars = consumed_chars_base; 3980 consumed_chars = consumed_chars_base;
3853 ONE_MORE_BYTE (c); 3981 ONE_MORE_BYTE (c);
3854 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 3982 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3983 char_offset++;
3855 coding->errors++; 3984 coding->errors++;
3856 } 3985 }
3857 3986
3858 no_more_source: 3987 no_more_source:
3988 if (last_id != charset_ascii)
3989 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3859 coding->consumed_char += consumed_chars_base; 3990 coding->consumed_char += consumed_chars_base;
3860 coding->consumed = src_base - coding->source; 3991 coding->consumed = src_base - coding->source;
3861 coding->charbuf_used = charbuf - coding->charbuf; 3992 coding->charbuf_used = charbuf - coding->charbuf;
3862 } 3993 }
3863 3994
3867 { 3998 {
3868 unsigned char *src = coding->source + coding->consumed; 3999 unsigned char *src = coding->source + coding->consumed;
3869 unsigned char *src_end = coding->source + coding->src_bytes; 4000 unsigned char *src_end = coding->source + coding->src_bytes;
3870 unsigned char *src_base; 4001 unsigned char *src_base;
3871 int *charbuf = coding->charbuf; 4002 int *charbuf = coding->charbuf;
3872 int *charbuf_end = charbuf + coding->charbuf_size; 4003 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3873 int consumed_chars = 0, consumed_chars_base; 4004 int consumed_chars = 0, consumed_chars_base;
3874 int multibytep = coding->src_multibyte; 4005 int multibytep = coding->src_multibyte;
3875 struct charset *charset_roman, *charset_big5; 4006 struct charset *charset_roman, *charset_big5;
3876 Lisp_Object attrs, eol_type, charset_list, val; 4007 Lisp_Object attrs, eol_type, charset_list, val;
4008 int char_offset = coding->produced_char;
4009 int last_offset = char_offset;
4010 int last_id = charset_ascii;
3877 4011
3878 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 4012 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3879 val = charset_list; 4013 val = charset_list;
3880 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); 4014 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3881 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val))); 4015 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3921 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE) 4055 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3922 goto invalid_code; 4056 goto invalid_code;
3923 c = c << 8 | c1; 4057 c = c << 8 | c1;
3924 charset = charset_big5; 4058 charset = charset_big5;
3925 } 4059 }
4060 if (charset->id != charset_ascii
4061 && last_id != charset->id)
4062 {
4063 if (last_id != charset_ascii)
4064 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4065 last_id = charset->id;
4066 last_offset = char_offset;
4067 }
3926 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); 4068 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3927 } 4069 }
3928 4070
3929 *charbuf++ = c; 4071 *charbuf++ = c;
4072 char_offset++;
3930 continue; 4073 continue;
3931 4074
3932 invalid_code: 4075 invalid_code:
3933 src = src_base; 4076 src = src_base;
3934 consumed_chars = consumed_chars_base; 4077 consumed_chars = consumed_chars_base;
3935 ONE_MORE_BYTE (c); 4078 ONE_MORE_BYTE (c);
3936 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 4079 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4080 char_offset++;
3937 coding->errors++; 4081 coding->errors++;
3938 } 4082 }
3939 4083
3940 no_more_source: 4084 no_more_source:
4085 if (last_id != charset_ascii)
4086 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3941 coding->consumed_char += consumed_chars_base; 4087 coding->consumed_char += consumed_chars_base;
3942 coding->consumed = src_base - coding->source; 4088 coding->consumed = src_base - coding->source;
3943 coding->charbuf_used = charbuf - coding->charbuf; 4089 coding->charbuf_used = charbuf - coding->charbuf;
3944 } 4090 }
3945 4091
4104 Check if a text is encoded in a coding system of which 4250 Check if a text is encoded in a coding system of which
4105 encoder/decoder are written in CCL program. If it is, return 4251 encoder/decoder are written in CCL program. If it is, return
4106 CATEGORY_MASK_CCL, else return 0. */ 4252 CATEGORY_MASK_CCL, else return 0. */
4107 4253
4108 static int 4254 static int
4109 detect_coding_ccl (coding, mask) 4255 detect_coding_ccl (coding, detect_info)
4110 struct coding_system *coding; 4256 struct coding_system *coding;
4111 int *mask; 4257 struct coding_detection_info *detect_info;
4112 { 4258 {
4113 unsigned char *src = coding->source, *src_base = src; 4259 unsigned char *src = coding->source, *src_base = src;
4114 unsigned char *src_end = coding->source + coding->src_bytes; 4260 unsigned char *src_end = coding->source + coding->src_bytes;
4115 int multibytep = coding->src_multibyte; 4261 int multibytep = coding->src_multibyte;
4116 int consumed_chars = 0; 4262 int consumed_chars = 0;
4117 int found = 0; 4263 int found = 0;
4118 unsigned char *valids = CODING_CCL_VALIDS (coding); 4264 unsigned char *valids = CODING_CCL_VALIDS (coding);
4119 int head_ascii = coding->head_ascii; 4265 int head_ascii = coding->head_ascii;
4120 Lisp_Object attrs; 4266 Lisp_Object attrs;
4121 4267
4268 detect_info->checked |= CATEGORY_MASK_CCL;
4269
4122 coding = &coding_categories[coding_category_ccl]; 4270 coding = &coding_categories[coding_category_ccl];
4123 attrs = CODING_ID_ATTRS (coding->id); 4271 attrs = CODING_ID_ATTRS (coding->id);
4124 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) 4272 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4125 src += head_ascii; 4273 src += head_ascii;
4126 4274
4128 { 4276 {
4129 int c; 4277 int c;
4130 ONE_MORE_BYTE (c); 4278 ONE_MORE_BYTE (c);
4131 if (! valids[c]) 4279 if (! valids[c])
4132 break; 4280 break;
4133 if (!found && valids[c] > 1) 4281 if ((valids[c] > 1))
4134 found = 1; 4282 found = CATEGORY_MASK_CCL;
4135 } 4283 }
4136 *mask &= ~CATEGORY_MASK_CCL; 4284 detect_info->rejected |= CATEGORY_MASK_CCL;
4137 return 0; 4285 return 0;
4138 4286
4139 no_more_source: 4287 no_more_source:
4140 return found; 4288 detect_info->found |= found;
4289 return 1;
4141 } 4290 }
4142 4291
4143 static void 4292 static void
4144 decode_coding_ccl (coding) 4293 decode_coding_ccl (coding)
4145 struct coding_system *coding; 4294 struct coding_system *coding;
4373 coding->produced_char += produced_chars; 4522 coding->produced_char += produced_chars;
4374 coding->produced = dst - coding->destination; 4523 coding->produced = dst - coding->destination;
4375 return 0; 4524 return 0;
4376 } 4525 }
4377 4526
4527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4528 Check if a text is encoded in a charset-based coding system. If it
4529 is, return 1, else return 0. */
4530
4378 static int 4531 static int
4379 detect_coding_charset (coding, mask) 4532 detect_coding_charset (coding, detect_info)
4380 struct coding_system *coding; 4533 struct coding_system *coding;
4381 int *mask; 4534 struct coding_detection_info *detect_info;
4382 { 4535 {
4383 unsigned char *src = coding->source, *src_base = src; 4536 unsigned char *src = coding->source, *src_base = src;
4384 unsigned char *src_end = coding->source + coding->src_bytes; 4537 unsigned char *src_end = coding->source + coding->src_bytes;
4385 int multibytep = coding->src_multibyte; 4538 int multibytep = coding->src_multibyte;
4386 int consumed_chars = 0; 4539 int consumed_chars = 0;
4387 Lisp_Object attrs, valids; 4540 Lisp_Object attrs, valids;
4388 int found = 0; 4541 int found = 0;
4389 4542
4543 detect_info->checked |= CATEGORY_MASK_CHARSET;
4544
4390 coding = &coding_categories[coding_category_charset]; 4545 coding = &coding_categories[coding_category_charset];
4391 attrs = CODING_ID_ATTRS (coding->id); 4546 attrs = CODING_ID_ATTRS (coding->id);
4392 valids = AREF (attrs, coding_attr_charset_valids); 4547 valids = AREF (attrs, coding_attr_charset_valids);
4393 4548
4394 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))) 4549 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4400 4555
4401 ONE_MORE_BYTE (c); 4556 ONE_MORE_BYTE (c);
4402 if (NILP (AREF (valids, c))) 4557 if (NILP (AREF (valids, c)))
4403 break; 4558 break;
4404 if (c >= 0x80) 4559 if (c >= 0x80)
4405 found = 1; 4560 found = CATEGORY_MASK_CHARSET;
4406 } 4561 }
4407 *mask &= ~CATEGORY_MASK_CHARSET; 4562 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4408 return 0; 4563 return 0;
4409 4564
4410 no_more_source: 4565 no_more_source:
4411 return (found || NILP (CODING_ATTR_ASCII_COMPAT (attrs))); 4566 detect_info->found |= found;
4567 return 1;
4412 } 4568 }
4413 4569
4414 static void 4570 static void
4415 decode_coding_charset (coding) 4571 decode_coding_charset (coding)
4416 struct coding_system *coding; 4572 struct coding_system *coding;
4417 { 4573 {
4418 unsigned char *src = coding->source + coding->consumed; 4574 unsigned char *src = coding->source + coding->consumed;
4419 unsigned char *src_end = coding->source + coding->src_bytes; 4575 unsigned char *src_end = coding->source + coding->src_bytes;
4420 unsigned char *src_base; 4576 unsigned char *src_base;
4421 int *charbuf = coding->charbuf; 4577 int *charbuf = coding->charbuf;
4422 int *charbuf_end = charbuf + coding->charbuf_size; 4578 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4423 int consumed_chars = 0, consumed_chars_base; 4579 int consumed_chars = 0, consumed_chars_base;
4424 int multibytep = coding->src_multibyte; 4580 int multibytep = coding->src_multibyte;
4425 Lisp_Object attrs, eol_type, charset_list, valids; 4581 Lisp_Object attrs, eol_type, charset_list, valids;
4582 int char_offset = coding->produced_char;
4583 int last_offset = char_offset;
4584 int last_id = charset_ascii;
4426 4585
4427 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 4586 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4428 valids = AREF (attrs, coding_attr_charset_valids); 4587 valids = AREF (attrs, coding_attr_charset_valids);
4429 4588
4430 while (1) 4589 while (1)
4501 val = XCDR (val); 4660 val = XCDR (val);
4502 } 4661 }
4503 } 4662 }
4504 if (c < 0) 4663 if (c < 0)
4505 goto invalid_code; 4664 goto invalid_code;
4665 if (charset->id != charset_ascii
4666 && last_id != charset->id)
4667 {
4668 if (last_id != charset_ascii)
4669 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4670 last_id = charset->id;
4671 last_offset = char_offset;
4672 }
4506 } 4673 }
4507 *charbuf++ = c; 4674 *charbuf++ = c;
4675 char_offset++;
4508 continue; 4676 continue;
4509 4677
4510 invalid_code: 4678 invalid_code:
4511 src = src_base; 4679 src = src_base;
4512 consumed_chars = consumed_chars_base; 4680 consumed_chars = consumed_chars_base;
4513 ONE_MORE_BYTE (c); 4681 ONE_MORE_BYTE (c);
4514 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c); 4682 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4683 char_offset++;
4515 coding->errors++; 4684 coding->errors++;
4516 } 4685 }
4517 4686
4518 no_more_source: 4687 no_more_source:
4688 if (last_id != charset_ascii)
4689 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4519 coding->consumed_char += consumed_chars_base; 4690 coding->consumed_char += consumed_chars_base;
4520 coding->consumed = src_base - coding->source; 4691 coding->consumed = src_base - coding->source;
4521 coding->charbuf_used = charbuf - coding->charbuf; 4692 coding->charbuf_used = charbuf - coding->charbuf;
4522 } 4693 }
4523 4694
4630 } 4801 }
4631 else if (EQ (coding_type, Qiso_2022)) 4802 else if (EQ (coding_type, Qiso_2022))
4632 { 4803 {
4633 int i; 4804 int i;
4634 int flags = XINT (AREF (attrs, coding_attr_iso_flags)); 4805 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4806 enum coding_category category = XINT (CODING_ATTR_CATEGORY (attrs));
4635 4807
4636 /* Invoke graphic register 0 to plane 0. */ 4808 /* Invoke graphic register 0 to plane 0. */
4637 CODING_ISO_INVOCATION (coding, 0) = 0; 4809 CODING_ISO_INVOCATION (coding, 0) = 0;
4638 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */ 4810 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4639 CODING_ISO_INVOCATION (coding, 1) 4811 CODING_ISO_INVOCATION (coding, 1)
4653 coding->common_flags 4825 coding->common_flags
4654 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK 4826 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4655 | CODING_REQUIRE_FLUSHING_MASK); 4827 | CODING_REQUIRE_FLUSHING_MASK);
4656 if (flags & CODING_ISO_FLAG_COMPOSITION) 4828 if (flags & CODING_ISO_FLAG_COMPOSITION)
4657 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK; 4829 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4830 if (flags & CODING_ISO_FLAG_DESIGNATION)
4831 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4658 if (flags & CODING_ISO_FLAG_FULL_SUPPORT) 4832 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4659 { 4833 {
4660 setup_iso_safe_charsets (attrs); 4834 setup_iso_safe_charsets (attrs);
4661 val = CODING_ATTR_SAFE_CHARSETS (attrs); 4835 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4662 coding->max_charset_id = XSTRING (val)->size - 1; 4836 coding->max_charset_id = XSTRING (val)->size - 1;
4928 #define EOL_SEEN_NONE 0 5102 #define EOL_SEEN_NONE 0
4929 #define EOL_SEEN_LF 1 5103 #define EOL_SEEN_LF 1
4930 #define EOL_SEEN_CR 2 5104 #define EOL_SEEN_CR 2
4931 #define EOL_SEEN_CRLF 4 5105 #define EOL_SEEN_CRLF 4
4932 5106
4933 /* Detect how end-of-line of a text of length CODING->src_bytes 5107 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
4934 pointed by CODING->source is encoded. Return one of 5108 SOURCE is encoded. If CATEGORY is one of
4935 EOL_SEEN_XXX. */ 5109 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5110 two-byte, else they are encoded by one-byte.
5111
5112 Return one of EOL_SEEN_XXX. */
4936 5113
4937 #define MAX_EOL_CHECK_COUNT 3 5114 #define MAX_EOL_CHECK_COUNT 3
4938 5115
4939 static int 5116 static int
4940 detect_eol (source, src_bytes, category) 5117 detect_eol (source, src_bytes, category)
5055 5232
5056 /* If we have not yet decided the text encoding type, detect it 5233 /* If we have not yet decided the text encoding type, detect it
5057 now. */ 5234 now. */
5058 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 5235 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5059 { 5236 {
5060 int mask = CATEGORY_MASK_ANY;
5061 int c, i; 5237 int c, i;
5062 5238
5063 for (src = coding->source; src < src_end; src++) 5239 for (src = coding->source; src < src_end; src++)
5064 { 5240 {
5065 c = *src; 5241 c = *src;
5070 } 5246 }
5071 coding->head_ascii = src - (coding->source + coding->consumed); 5247 coding->head_ascii = src - (coding->source + coding->consumed);
5072 5248
5073 if (coding->head_ascii < coding->src_bytes) 5249 if (coding->head_ascii < coding->src_bytes)
5074 { 5250 {
5075 int detected = 0; 5251 struct coding_detection_info detect_info;
5076 5252 enum coding_category category;
5253 struct coding_system *this;
5254
5255 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5077 for (i = 0; i < coding_category_raw_text; i++) 5256 for (i = 0; i < coding_category_raw_text; i++)
5078 { 5257 {
5079 enum coding_category category = coding_priorities[i]; 5258 category = coding_priorities[i];
5080 struct coding_system *this = coding_categories + category; 5259 this = coding_categories + category;
5081
5082 if (this->id < 0) 5260 if (this->id < 0)
5083 { 5261 {
5084 /* No coding system of this category is defined. */ 5262 /* No coding system of this category is defined. */
5085 mask &= ~(1 << category); 5263 detect_info.rejected |= (1 << category);
5086 } 5264 }
5087 else if (category >= coding_category_raw_text 5265 else if (category >= coding_category_raw_text)
5088 || detected & (1 << category))
5089 continue; 5266 continue;
5090 else 5267 else if (detect_info.checked & (1 << category))
5091 { 5268 {
5092 detected |= detected_mask[category]; 5269 if (detect_info.found & (1 << category))
5093 if ((*(this->detector)) (coding, &mask) 5270 break;
5094 && (mask & (1 << category)))
5095 {
5096 mask = 1 << category;
5097 break;
5098 }
5099 } 5271 }
5272 else if ((*(this->detector)) (coding, &detect_info)
5273 && detect_info.found & (1 << category))
5274 break;
5100 } 5275 }
5101 if (! mask) 5276 if (i < coding_category_raw_text)
5277 setup_coding_system (CODING_ID_NAME (this->id), coding);
5278 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5102 setup_coding_system (Qraw_text, coding); 5279 setup_coding_system (Qraw_text, coding);
5103 else if (mask != CATEGORY_MASK_ANY) 5280 else if (detect_info.rejected)
5104 for (i = 0; i < coding_category_raw_text; i++) 5281 for (i = 0; i < coding_category_raw_text; i++)
5105 { 5282 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5106 enum coding_category category = coding_priorities[i]; 5283 {
5107 struct coding_system *this = coding_categories + category; 5284 this = coding_categories + coding_priorities[i];
5108 5285 setup_coding_system (CODING_ID_NAME (this->id), coding);
5109 if (mask & (1 << category)) 5286 break;
5110 { 5287 }
5111 setup_coding_system (CODING_ID_NAME (this->id), coding);
5112 break;
5113 }
5114 }
5115 } 5288 }
5116 } 5289 }
5117 5290
5118 attrs = CODING_ID_ATTRS (coding->id); 5291 attrs = CODING_ID_ATTRS (coding->id);
5119 coding_type = CODING_ATTR_TYPE (attrs); 5292 coding_type = CODING_ATTR_TYPE (attrs);
5406 coding->produced += produced; 5579 coding->produced += produced;
5407 coding->produced_char += produced_chars; 5580 coding->produced_char += produced_chars;
5408 return produced_chars; 5581 return produced_chars;
5409 } 5582 }
5410 5583
5411 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ] 5584 /* Compose text in CODING->object according to the annotation data at
5412 or 5585 CHARBUF. CHARBUF is an array:
5413 [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ] 5586 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5414 */ 5587 */
5415 5588
5416 static INLINE void 5589 static INLINE void
5417 produce_composition (coding, charbuf) 5590 produce_composition (coding, charbuf)
5418 struct coding_system *coding; 5591 struct coding_system *coding;
5419 int *charbuf; 5592 int *charbuf;
5420 { 5593 {
5421 Lisp_Object buffer;
5422 int len; 5594 int len;
5423 EMACS_INT pos; 5595 EMACS_INT from, to;
5424 enum composition_method method; 5596 enum composition_method method;
5425 int cmp_len;
5426 Lisp_Object components; 5597 Lisp_Object components;
5427 5598
5428 buffer = coding->dst_object;
5429 len = -charbuf[0]; 5599 len = -charbuf[0];
5430 pos = coding->dst_pos + charbuf[1]; 5600 from = coding->dst_pos + charbuf[2];
5431 method = (enum composition_method) (charbuf[3]); 5601 to = coding->dst_pos + charbuf[3];
5432 cmp_len = charbuf[4]; 5602 method = (enum composition_method) (charbuf[4]);
5433 5603
5434 if (method == COMPOSITION_RELATIVE) 5604 if (method == COMPOSITION_RELATIVE)
5435 components = Qnil; 5605 components = Qnil;
5436 else 5606 else
5437 { 5607 {
5443 for (i = 0; i < len; i++) 5613 for (i = 0; i < len; i++)
5444 args[i] = make_number (charbuf[i]); 5614 args[i] = make_number (charbuf[i]);
5445 components = (method == COMPOSITION_WITH_ALTCHARS 5615 components = (method == COMPOSITION_WITH_ALTCHARS
5446 ? Fstring (len, args) : Fvector (len, args)); 5616 ? Fstring (len, args) : Fvector (len, args));
5447 } 5617 }
5448 compose_text (pos, pos + cmp_len, components, Qnil, Qnil); 5618 compose_text (from, to, components, Qnil, coding->dst_object);
5449 } 5619 }
5450 5620
5451 static int * 5621
5452 save_composition_data (buf, buf_end, prop) 5622 /* Put `charset' property on text in CODING->object according to
5453 int *buf, *buf_end; 5623 the annotation data at CHARBUF. CHARBUF is an array:
5454 Lisp_Object prop; 5624 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5455 { 5625 */
5456 enum composition_method method = COMPOSITION_METHOD (prop); 5626
5457 int cmp_len = COMPOSITION_LENGTH (prop); 5627 static INLINE void
5458 5628 produce_charset (coding, charbuf)
5459 if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end) 5629 struct coding_system *coding;
5460 return NULL; 5630 int *charbuf;
5461 5631 {
5462 buf[1] = CODING_ANNOTATE_COMPOSITION_MASK; 5632 EMACS_INT from = coding->dst_pos + charbuf[2];
5463 buf[2] = method; 5633 EMACS_INT to = coding->dst_pos + charbuf[3];
5464 buf[3] = cmp_len; 5634 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5465 5635
5466 if (method == COMPOSITION_RELATIVE) 5636 Fput_text_property (make_number (from), make_number (to),
5467 buf[0] = 4; 5637 Qcharset, CHARSET_NAME (charset),
5468 else 5638 coding->dst_object);
5469 { 5639 }
5470 Lisp_Object components; 5640
5471 int len, i;
5472
5473 components = COMPOSITION_COMPONENTS (prop);
5474 if (VECTORP (components))
5475 {
5476 len = XVECTOR (components)->size;
5477 for (i = 0; i < len; i++)
5478 buf[4 + i] = XINT (AREF (components, i));
5479 }
5480 else if (STRINGP (components))
5481 {
5482 int i_byte;
5483
5484 len = XSTRING (components)->size;
5485 i = i_byte = 0;
5486 while (i < len)
5487 FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5488 }
5489 else if (INTEGERP (components))
5490 {
5491 len = 1;
5492 buf[4] = XINT (components);
5493 }
5494 else if (CONSP (components))
5495 {
5496 for (len = 0; CONSP (components);
5497 len++, components = XCDR (components))
5498 buf[4 + len] = XINT (XCAR (components));
5499 }
5500 else
5501 abort ();
5502 buf[0] = 4 + len;
5503 }
5504 return (buf + buf[0]);
5505 }
5506 5641
5507 #define CHARBUF_SIZE 0x4000 5642 #define CHARBUF_SIZE 0x4000
5508 5643
5509 #define ALLOC_CONVERSION_WORK_AREA(coding) \ 5644 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5510 do { \ 5645 do { \
5532 struct coding_system *coding; 5667 struct coding_system *coding;
5533 { 5668 {
5534 int *charbuf = coding->charbuf; 5669 int *charbuf = coding->charbuf;
5535 int *charbuf_end = charbuf + coding->charbuf_used; 5670 int *charbuf_end = charbuf + coding->charbuf_used;
5536 5671
5672 if (NILP (coding->dst_object))
5673 return;
5674
5537 while (charbuf < charbuf_end) 5675 while (charbuf < charbuf_end)
5538 { 5676 {
5539 if (*charbuf >= 0) 5677 if (*charbuf >= 0)
5540 charbuf++; 5678 charbuf++;
5541 else 5679 else
5542 { 5680 {
5543 int len = -*charbuf; 5681 int len = -*charbuf;
5544 switch (charbuf[2]) 5682 switch (charbuf[1])
5545 { 5683 {
5546 case CODING_ANNOTATE_COMPOSITION_MASK: 5684 case CODING_ANNOTATE_COMPOSITION_MASK:
5547 produce_composition (coding, charbuf); 5685 produce_composition (coding, charbuf);
5686 break;
5687 case CODING_ANNOTATE_CHARSET_MASK:
5688 produce_charset (coding, charbuf);
5548 break; 5689 break;
5549 default: 5690 default:
5550 abort (); 5691 abort ();
5551 } 5692 }
5552 charbuf += len; 5693 charbuf += len;
5667 } 5808 }
5668 5809
5669 return coding->result; 5810 return coding->result;
5670 } 5811 }
5671 5812
5813
5814 /* Extract an annotation data from a composition starting at POS and
5815 ending before LIMIT of CODING->src_object (buffer or string), store
5816 the data in BUF, set *STOP to a starting position of the next
5817 composition (if any) or to LIMIT, and return the address of the
5818 next element of BUF.
5819
5820 If such an annotation is not found, set *STOP to a starting
5821 position of a composition after POS (if any) or to LIMIT, and
5822 return BUF. */
5823
5824 static INLINE int *
5825 handle_composition_annotation (pos, limit, coding, buf, stop)
5826 EMACS_INT pos, limit;
5827 struct coding_system *coding;
5828 int *buf;
5829 EMACS_INT *stop;
5830 {
5831 EMACS_INT start, end;
5832 Lisp_Object prop;
5833
5834 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5835 || end > limit)
5836 *stop = limit;
5837 else if (start > pos)
5838 *stop = start;
5839 else
5840 {
5841 if (start == pos)
5842 {
5843 /* We found a composition. Store the corresponding
5844 annotation data in BUF. */
5845 int *head = buf;
5846 enum composition_method method = COMPOSITION_METHOD (prop);
5847 int nchars = COMPOSITION_LENGTH (prop);
5848
5849 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5850 if (method != COMPOSITION_RELATIVE)
5851 {
5852 Lisp_Object components;
5853 int len, i, i_byte;
5854
5855 components = COMPOSITION_COMPONENTS (prop);
5856 if (VECTORP (components))
5857 {
5858 len = XVECTOR (components)->size;
5859 for (i = 0; i < len; i++)
5860 *buf++ = XINT (AREF (components, i));
5861 }
5862 else if (STRINGP (components))
5863 {
5864 len = XSTRING (components)->size;
5865 i = i_byte = 0;
5866 while (i < len)
5867 {
5868 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5869 buf++;
5870 }
5871 }
5872 else if (INTEGERP (components))
5873 {
5874 len = 1;
5875 *buf++ = XINT (components);
5876 }
5877 else if (CONSP (components))
5878 {
5879 for (len = 0; CONSP (components);
5880 len++, components = XCDR (components))
5881 *buf++ = XINT (XCAR (components));
5882 }
5883 else
5884 abort ();
5885 *head -= len;
5886 }
5887 }
5888
5889 if (find_composition (end, limit, &start, &end, &prop,
5890 coding->src_object)
5891 && end <= limit)
5892 *stop = start;
5893 else
5894 *stop = limit;
5895 }
5896 return buf;
5897 }
5898
5899
5900 /* Extract an annotation data from a text property `charset' at POS of
5901 CODING->src_object (buffer or string), store the data in BUF, set
5902 *STOP to the position where the value of `charset' property changes
5903 (limiting by LIMIT), and return the address of the next element of
5904 BUF.
5905
5906 If the property value is nil, set *STOP to the position where the
5907 property value is non-nil (limiting by LIMIT), and return BUF. */
5908
5909 static INLINE int *
5910 handle_charset_annotation (pos, limit, coding, buf, stop)
5911 EMACS_INT pos, limit;
5912 struct coding_system *coding;
5913 int *buf;
5914 EMACS_INT *stop;
5915 {
5916 Lisp_Object val, next;
5917 int id;
5918
5919 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5920 if (! NILP (val) && CHARSETP (val))
5921 id = XINT (CHARSET_SYMBOL_ID (val));
5922 else
5923 id = -1;
5924 ADD_CHARSET_DATA (buf, 0, 0, id);
5925 next = Fnext_single_property_change (make_number (pos), Qcharset,
5926 coding->src_object,
5927 make_number (limit));
5928 *stop = XINT (next);
5929 return buf;
5930 }
5931
5932
5672 static void 5933 static void
5673 consume_chars (coding) 5934 consume_chars (coding)
5674 struct coding_system *coding; 5935 struct coding_system *coding;
5675 { 5936 {
5676 int *buf = coding->charbuf; 5937 int *buf = coding->charbuf;
5677 /* -1 is to compensate for CRLF. */ 5938 int *buf_end = coding->charbuf + coding->charbuf_size;
5678 int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5679 const unsigned char *src = coding->source + coding->consumed; 5939 const unsigned char *src = coding->source + coding->consumed;
5680 int pos = coding->src_pos + coding->consumed_char; 5940 EMACS_INT pos = coding->src_pos + coding->consumed_char;
5681 int end_pos = coding->src_pos + coding->src_chars; 5941 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
5682 int multibytep = coding->src_multibyte; 5942 int multibytep = coding->src_multibyte;
5683 Lisp_Object eol_type; 5943 Lisp_Object eol_type;
5684 int c; 5944 int c;
5685 int start, end, stop; 5945 EMACS_INT stop, stop_composition, stop_charset;
5686 Lisp_Object object, prop; 5946 int id;
5687 5947
5688 eol_type = CODING_ID_EOL_TYPE (coding->id); 5948 eol_type = CODING_ID_EOL_TYPE (coding->id);
5689 if (VECTORP (eol_type)) 5949 if (VECTORP (eol_type))
5690 eol_type = Qunix; 5950 eol_type = Qunix;
5691 5951
5692 object = coding->src_object;
5693
5694 /* Note: composition handling is not yet implemented. */ 5952 /* Note: composition handling is not yet implemented. */
5695 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK; 5953 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5696 5954
5697 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK 5955 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
5698 && find_composition (pos, end_pos, &start, &end, &prop, object) 5956 stop = stop_composition = pos;
5699 && end <= end_pos
5700 && (start >= pos
5701 || (find_composition (end, end_pos, &start, &end, &prop, object)
5702 && end <= end_pos)))
5703 stop = start;
5704 else 5957 else
5705 stop = end_pos; 5958 stop = stop_composition = end_pos;
5706 5959 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
5960 stop = stop_charset = pos;
5961 else
5962 stop_charset = end_pos;
5963
5964 /* Compensate for CRLF and annotation. */
5965 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
5707 while (buf < buf_end) 5966 while (buf < buf_end)
5708 { 5967 {
5709 if (pos == stop) 5968 if (pos == stop)
5710 { 5969 {
5711 int *p; 5970 int *p;
5712 5971
5713 if (pos == end_pos) 5972 if (pos == end_pos)
5714 break; 5973 break;
5715 p = save_composition_data (buf, buf_end, prop); 5974 if (pos == stop_composition)
5716 if (p == NULL) 5975 buf = handle_composition_annotation (pos, end_pos, coding,
5717 break; 5976 buf, &stop_composition);
5718 buf = p; 5977 if (pos == stop_charset)
5719 if (find_composition (end, end_pos, &start, &end, &prop, object) 5978 buf = handle_charset_annotation (pos, end_pos, coding,
5720 && end <= end_pos) 5979 buf, &stop_charset);
5721 stop = start; 5980 stop = (stop_composition < stop_charset
5722 else 5981 ? stop_composition : stop_charset);
5723 stop = end_pos;
5724 } 5982 }
5725 5983
5726 if (! multibytep) 5984 if (! multibytep)
5727 c = *src++; 5985 c = *src++;
5728 else 5986 else
6160 coding->src_pos_byte = from_byte; 6418 coding->src_pos_byte = from_byte;
6161 } 6419 }
6162 else if (BUFFERP (src_object)) 6420 else if (BUFFERP (src_object))
6163 { 6421 {
6164 set_buffer_internal (XBUFFER (src_object)); 6422 set_buffer_internal (XBUFFER (src_object));
6165 if (from != GPT)
6166 move_gap_both (from, from_byte);
6167 if (EQ (src_object, dst_object)) 6423 if (EQ (src_object, dst_object))
6168 { 6424 {
6169 del_range_both (from, from_byte, to, to_byte, 1); 6425 coding->src_object = del_range_1 (from, to, 1, 1);
6170 coding->src_pos = -chars; 6426 coding->src_pos = 0;
6171 coding->src_pos_byte = -bytes; 6427 coding->src_pos_byte = 0;
6172 } 6428 }
6173 else 6429 else
6174 { 6430 {
6431 if (from < GPT && to >= GPT)
6432 move_gap_both (from, from_byte);
6175 coding->src_pos = from; 6433 coding->src_pos = from;
6176 coding->src_pos_byte = from_byte; 6434 coding->src_pos_byte = from_byte;
6177 } 6435 }
6178 } 6436 }
6179 6437
6318 int multibytep; 6576 int multibytep;
6319 Lisp_Object coding_system; 6577 Lisp_Object coding_system;
6320 { 6578 {
6321 unsigned char *src_end = src + src_bytes; 6579 unsigned char *src_end = src + src_bytes;
6322 int mask = CATEGORY_MASK_ANY; 6580 int mask = CATEGORY_MASK_ANY;
6323 int detected = 0;
6324 int c, i;
6325 Lisp_Object attrs, eol_type; 6581 Lisp_Object attrs, eol_type;
6326 Lisp_Object val; 6582 Lisp_Object val;
6327 struct coding_system coding; 6583 struct coding_system coding;
6328 int id; 6584 int id;
6585 struct coding_detection_info detect_info;
6329 6586
6330 if (NILP (coding_system)) 6587 if (NILP (coding_system))
6331 coding_system = Qundecided; 6588 coding_system = Qundecided;
6332 setup_coding_system (coding_system, &coding); 6589 setup_coding_system (coding_system, &coding);
6333 attrs = CODING_ID_ATTRS (coding.id); 6590 attrs = CODING_ID_ATTRS (coding.id);
6338 coding.src_bytes = src_bytes; 6595 coding.src_bytes = src_bytes;
6339 coding.src_multibyte = multibytep; 6596 coding.src_multibyte = multibytep;
6340 coding.consumed = 0; 6597 coding.consumed = 0;
6341 coding.mode |= CODING_MODE_LAST_BLOCK; 6598 coding.mode |= CODING_MODE_LAST_BLOCK;
6342 6599
6600 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6601
6343 /* At first, detect text-format if necessary. */ 6602 /* At first, detect text-format if necessary. */
6344 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) 6603 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6345 { 6604 {
6605 enum coding_category category;
6606 struct coding_system *this;
6607 int c, i;
6608
6346 for (; src < src_end; src++) 6609 for (; src < src_end; src++)
6347 { 6610 {
6348 c = *src; 6611 c = *src;
6349 if (c & 0x80 6612 if (c & 0x80
6350 || (c < 0x20 && (c == ISO_CODE_ESC 6613 || (c < 0x20 && (c == ISO_CODE_ESC
6355 coding.head_ascii = src - coding.source; 6618 coding.head_ascii = src - coding.source;
6356 6619
6357 if (src < src_end) 6620 if (src < src_end)
6358 for (i = 0; i < coding_category_raw_text; i++) 6621 for (i = 0; i < coding_category_raw_text; i++)
6359 { 6622 {
6360 enum coding_category category = coding_priorities[i]; 6623 category = coding_priorities[i];
6361 struct coding_system *this = coding_categories + category; 6624 this = coding_categories + category;
6362 6625
6363 if (this->id < 0) 6626 if (this->id < 0)
6364 { 6627 {
6365 /* No coding system of this category is defined. */ 6628 /* No coding system of this category is defined. */
6366 mask &= ~(1 << category); 6629 detect_info.rejected |= (1 << category);
6367 } 6630 }
6368 else if (category >= coding_category_raw_text 6631 else if (category >= coding_category_raw_text)
6369 || detected & (1 << category))
6370 continue; 6632 continue;
6633 else if (detect_info.checked & (1 << category))
6634 {
6635 if (highest
6636 && (detect_info.found & (1 << category)))
6637 break;
6638 }
6371 else 6639 else
6372 { 6640 {
6373 detected |= detected_mask[category]; 6641 if ((*(this->detector)) (&coding, &detect_info)
6374 if ((*(coding_categories[category].detector)) (&coding, &mask)
6375 && highest 6642 && highest
6376 && (mask & (1 << category))) 6643 && (detect_info.found & (1 << category)))
6377 { 6644 break;
6378 mask = 1 << category;
6379 break;
6380 }
6381 } 6645 }
6382 } 6646 }
6383 6647
6384 if (!mask) 6648
6385 { 6649 if (detect_info.rejected == CATEGORY_MASK_ANY)
6650 {
6651 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6386 id = coding_categories[coding_category_raw_text].id; 6652 id = coding_categories[coding_category_raw_text].id;
6387 val = Fcons (make_number (id), Qnil); 6653 val = Fcons (make_number (id), Qnil);
6388 } 6654 }
6389 else if (mask == CATEGORY_MASK_ANY) 6655 else if (! detect_info.rejected && ! detect_info.found)
6390 { 6656 {
6657 detect_info.found = CATEGORY_MASK_ANY;
6391 id = coding_categories[coding_category_undecided].id; 6658 id = coding_categories[coding_category_undecided].id;
6392 val = Fcons (make_number (id), Qnil); 6659 val = Fcons (make_number (id), Qnil);
6393 } 6660 }
6394 else if (highest) 6661 else if (highest)
6395 { 6662 {
6396 for (i = 0; i < coding_category_raw_text; i++) 6663 if (detect_info.found)
6397 if (mask & (1 << coding_priorities[i])) 6664 {
6398 { 6665 detect_info.found = 1 << category;
6399 id = coding_categories[coding_priorities[i]].id; 6666 val = Fcons (make_number (this->id), Qnil);
6400 val = Fcons (make_number (id), Qnil); 6667 }
6401 break; 6668 else
6402 } 6669 for (i = 0; i < coding_category_raw_text; i++)
6403 } 6670 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6671 {
6672 detect_info.found = 1 << coding_priorities[i];
6673 id = coding_categories[coding_priorities[i]].id;
6674 val = Fcons (make_number (id), Qnil);
6675 break;
6676 }
6677 }
6404 else 6678 else
6405 { 6679 {
6680 int mask = detect_info.rejected | detect_info.found;
6681 int found = 0;
6406 val = Qnil; 6682 val = Qnil;
6683
6407 for (i = coding_category_raw_text - 1; i >= 0; i--) 6684 for (i = coding_category_raw_text - 1; i >= 0; i--)
6408 if (mask & (1 << coding_priorities[i])) 6685 {
6409 { 6686 category = coding_priorities[i];
6410 id = coding_categories[coding_priorities[i]].id; 6687 if (! (mask & (1 << category)))
6411 val = Fcons (make_number (id), val); 6688 {
6412 } 6689 found |= 1 << category;
6690 id = coding_categories[category].id;
6691 val = Fcons (make_number (id), val);
6692 }
6693 }
6694 for (i = coding_category_raw_text - 1; i >= 0; i--)
6695 {
6696 category = coding_priorities[i];
6697 if (detect_info.found & (1 << category))
6698 {
6699 id = coding_categories[category].id;
6700 val = Fcons (make_number (id), val);
6701 }
6702 }
6703 detect_info.found |= found;
6413 } 6704 }
6414 } 6705 }
6415 else 6706 else
6416 { 6707 {
6417 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); 6708 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6418 val = Fcons (make_number (coding.id), Qnil); 6709 val = Fcons (make_number (coding.id), Qnil);
6419 } 6710 }
6420 6711
6421 /* Then, detect eol-format if necessary. */ 6712 /* Then, detect eol-format if necessary. */
6422 { 6713 {
6423 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; 6714 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6424 Lisp_Object tail; 6715 Lisp_Object tail;
6425 6716
6426 if (VECTORP (eol_type)) 6717 if (VECTORP (eol_type))
6427 { 6718 {
6428 if (mask & ~CATEGORY_MASK_UTF_16) 6719 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6429 normal_eol = detect_eol (coding.source, src_bytes, 6720 normal_eol = detect_eol (coding.source, src_bytes,
6430 coding_category_raw_text); 6721 coding_category_raw_text);
6431 if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG)) 6722 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6723 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6432 utf_16_be_eol = detect_eol (coding.source, src_bytes, 6724 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6433 coding_category_utf_16_be); 6725 coding_category_utf_16_be);
6434 if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG)) 6726 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6727 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6435 utf_16_le_eol = detect_eol (coding.source, src_bytes, 6728 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6436 coding_category_utf_16_le); 6729 coding_category_utf_16_le);
6437 } 6730 }
6438 else 6731 else
6439 { 6732 {