comparison src/coding.c @ 89193:311d061195ef

(detect_coding_utf_8): Check incomplete byte sequence. Don't update *mask when correctly detected. (detect_coding_utf_16): Likewise. (detect_coding_emacs_mule): Likewise. (detect_coding_iso_2022): Likewise. (detect_coding_sjis): Likewise. (detect_coding_big5): Likewise. (detect_coding_ccl): Likewise. (decode_coding_sjis): Fix decoding of katakana-jisx0201. (detect_eol): Delete the argument CODING, and add the argument CATEGORY. (detect_coding): Adjusted for the changes above. (detect_coding_system): Likewise.
author Kenichi Handa <handa@m17n.org>
date Thu, 10 Oct 2002 09:05:37 +0000
parents 88a9e962e183
children c232917f49f7
comparison
equal deleted inserted replaced
89192:f6651a1271ae 89193:311d061195ef
1029 unsigned char *src = coding->source, *src_base = src; 1029 unsigned char *src = coding->source, *src_base = src;
1030 unsigned char *src_end = coding->source + coding->src_bytes; 1030 unsigned char *src_end = coding->source + coding->src_bytes;
1031 int multibytep = coding->src_multibyte; 1031 int multibytep = coding->src_multibyte;
1032 int consumed_chars = 0; 1032 int consumed_chars = 0;
1033 int found = 0; 1033 int found = 0;
1034 int incomplete;
1034 1035
1035 /* A coding system of this category is always ASCII compatible. */ 1036 /* A coding system of this category is always ASCII compatible. */
1036 src += coding->head_ascii; 1037 src += coding->head_ascii;
1037 1038
1038 while (1) 1039 while (1)
1039 { 1040 {
1040 int c, c1, c2, c3, c4; 1041 int c, c1, c2, c3, c4;
1041 1042
1043 incomplete = 0;
1042 ONE_MORE_BYTE (c); 1044 ONE_MORE_BYTE (c);
1043 if (UTF_8_1_OCTET_P (c)) 1045 if (UTF_8_1_OCTET_P (c))
1044 continue; 1046 continue;
1047 incomplete = 1;
1045 ONE_MORE_BYTE (c1); 1048 ONE_MORE_BYTE (c1);
1046 if (! UTF_8_EXTRA_OCTET_P (c1)) 1049 if (! UTF_8_EXTRA_OCTET_P (c1))
1047 break; 1050 break;
1048 if (UTF_8_2_OCTET_LEADING_P (c)) 1051 if (UTF_8_2_OCTET_LEADING_P (c))
1049 { 1052 {
1078 } 1081 }
1079 *mask &= ~CATEGORY_MASK_UTF_8; 1082 *mask &= ~CATEGORY_MASK_UTF_8;
1080 return 0; 1083 return 0;
1081 1084
1082 no_more_source: 1085 no_more_source:
1083 if (! found) 1086 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1084 return 0; 1087 {
1085 *mask &= CATEGORY_MASK_UTF_8; 1088 *mask &= ~CATEGORY_MASK_UTF_8;
1086 return 1; 1089 return 0;
1090 }
1091 return found;
1087 } 1092 }
1088 1093
1089 1094
1090 static void 1095 static void
1091 decode_coding_utf_8 (coding) 1096 decode_coding_utf_8 (coding)
1287 unsigned char *src_end = coding->source + coding->src_bytes; 1292 unsigned char *src_end = coding->source + coding->src_bytes;
1288 int multibytep = coding->src_multibyte; 1293 int multibytep = coding->src_multibyte;
1289 int consumed_chars = 0; 1294 int consumed_chars = 0;
1290 int c1, c2; 1295 int c1, c2;
1291 1296
1297 *mask &= ~CATEGORY_MASK_UTF_16;
1298
1292 ONE_MORE_BYTE (c1); 1299 ONE_MORE_BYTE (c1);
1293 ONE_MORE_BYTE (c2); 1300 ONE_MORE_BYTE (c2);
1294 1301
1295 if ((c1 == 0xFF) && (c2 == 0xFE)) 1302 if ((c1 == 0xFF) && (c2 == 0xFE))
1296 { 1303 *mask |= CATEGORY_MASK_UTF_16_LE;
1297 *mask &= CATEGORY_MASK_UTF_16_LE;
1298 return 1;
1299 }
1300 else if ((c1 == 0xFE) && (c2 == 0xFF)) 1304 else if ((c1 == 0xFE) && (c2 == 0xFF))
1301 { 1305 *mask |= CATEGORY_MASK_UTF_16_BE;
1302 *mask &= CATEGORY_MASK_UTF_16_BE; 1306 else
1303 return 1; 1307 *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG;
1304 } 1308 return 1;
1309
1305 no_more_source: 1310 no_more_source:
1306 return 0; 1311 return 0;
1307 } 1312 }
1308 1313
1309 static void 1314 static void
1641 unsigned char *src_end = coding->source + coding->src_bytes; 1646 unsigned char *src_end = coding->source + coding->src_bytes;
1642 int multibytep = coding->src_multibyte; 1647 int multibytep = coding->src_multibyte;
1643 int consumed_chars = 0; 1648 int consumed_chars = 0;
1644 int c; 1649 int c;
1645 int found = 0; 1650 int found = 0;
1651 int incomplete;
1646 1652
1647 /* A coding system of this category is always ASCII compatible. */ 1653 /* A coding system of this category is always ASCII compatible. */
1648 src += coding->head_ascii; 1654 src += coding->head_ascii;
1649 1655
1650 while (1) 1656 while (1)
1651 { 1657 {
1658 incomplete = 0;
1652 ONE_MORE_BYTE (c); 1659 ONE_MORE_BYTE (c);
1660 incomplete = 1;
1653 1661
1654 if (c == 0x80) 1662 if (c == 0x80)
1655 { 1663 {
1656 /* Perhaps the start of a composite character. We simply skip 1664 /* Perhaps the start of a composite character. We simply skip
1657 it because analyzing it is too heavy for detecting. But, 1665 it because analyzing it is too heavy for detecting. But,
1696 } 1704 }
1697 *mask &= ~CATEGORY_MASK_EMACS_MULE; 1705 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1698 return 0; 1706 return 0;
1699 1707
1700 no_more_source: 1708 no_more_source:
1701 if (!found) 1709 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1702 return 0; 1710 {
1703 *mask &= CATEGORY_MASK_EMACS_MULE; 1711 *mask &= ~CATEGORY_MASK_EMACS_MULE;
1704 return 1; 1712 return 0;
1713 }
1714 return found;
1705 } 1715 }
1706 1716
1707 1717
1708 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ 1718 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1709 1719
2463 case ISO_CODE_SS2: 2473 case ISO_CODE_SS2:
2464 case ISO_CODE_SS3: 2474 case ISO_CODE_SS3:
2465 { 2475 {
2466 int newmask = CATEGORY_MASK_ISO_8_ELSE; 2476 int newmask = CATEGORY_MASK_ISO_8_ELSE;
2467 2477
2478 mask_8bit_found = 1;
2468 if (inhibit_iso_escape_detection) 2479 if (inhibit_iso_escape_detection)
2469 break; 2480 break;
2470 if (c != ISO_CODE_CSI) 2481 if (c != ISO_CODE_CSI)
2471 { 2482 {
2472 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) 2483 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2556 *mask &= ~CATEGORY_MASK_ISO; 2567 *mask &= ~CATEGORY_MASK_ISO;
2557 return 0; 2568 return 0;
2558 } 2569 }
2559 if (!mask_found) 2570 if (!mask_found)
2560 return 0; 2571 return 0;
2561 *mask &= mask_iso & mask_found; 2572 *mask &= ~CATEGORY_MASK_ISO;
2573 *mask |= mask_iso & mask_found;
2562 if (! mask_8bit_found) 2574 if (! mask_8bit_found)
2563 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); 2575 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2564 return 1; 2576 return 1;
2565 } 2577 }
2566 2578
3656 unsigned char *src_end = coding->source + coding->src_bytes; 3668 unsigned char *src_end = coding->source + coding->src_bytes;
3657 int multibytep = coding->src_multibyte; 3669 int multibytep = coding->src_multibyte;
3658 int consumed_chars = 0; 3670 int consumed_chars = 0;
3659 int found = 0; 3671 int found = 0;
3660 int c; 3672 int c;
3673 int incomplete;
3661 3674
3662 /* A coding system of this category is always ASCII compatible. */ 3675 /* A coding system of this category is always ASCII compatible. */
3663 src += coding->head_ascii; 3676 src += coding->head_ascii;
3664 3677
3665 while (1) 3678 while (1)
3666 { 3679 {
3680 incomplete = 0;
3667 ONE_MORE_BYTE (c); 3681 ONE_MORE_BYTE (c);
3682 incomplete = 1;
3668 if (c < 0x80) 3683 if (c < 0x80)
3669 continue; 3684 continue;
3670 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) 3685 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3671 { 3686 {
3672 ONE_MORE_BYTE (c); 3687 ONE_MORE_BYTE (c);
3681 } 3696 }
3682 *mask &= ~CATEGORY_MASK_SJIS; 3697 *mask &= ~CATEGORY_MASK_SJIS;
3683 return 0; 3698 return 0;
3684 3699
3685 no_more_source: 3700 no_more_source:
3686 if (!found) 3701 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3687 return 0; 3702 {
3688 *mask &= CATEGORY_MASK_SJIS; 3703 *mask &= ~CATEGORY_MASK_SJIS;
3689 return 1; 3704 return 0;
3705 }
3706 return found;
3690 } 3707 }
3691 3708
3692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". 3709 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3693 Check if a text is encoded in BIG5. If it is, return 3710 Check if a text is encoded in BIG5. If it is, return
3694 CATEGORY_MASK_BIG5, else return 0. */ 3711 CATEGORY_MASK_BIG5, else return 0. */
3702 unsigned char *src_end = coding->source + coding->src_bytes; 3719 unsigned char *src_end = coding->source + coding->src_bytes;
3703 int multibytep = coding->src_multibyte; 3720 int multibytep = coding->src_multibyte;
3704 int consumed_chars = 0; 3721 int consumed_chars = 0;
3705 int found = 0; 3722 int found = 0;
3706 int c; 3723 int c;
3724 int incomplete;
3707 3725
3708 /* A coding system of this category is always ASCII compatible. */ 3726 /* A coding system of this category is always ASCII compatible. */
3709 src += coding->head_ascii; 3727 src += coding->head_ascii;
3710 3728
3711 while (1) 3729 while (1)
3712 { 3730 {
3731 incomplete = 0;
3713 ONE_MORE_BYTE (c); 3732 ONE_MORE_BYTE (c);
3733 incomplete = 1;
3714 if (c < 0x80) 3734 if (c < 0x80)
3715 continue; 3735 continue;
3716 if (c >= 0xA1) 3736 if (c >= 0xA1)
3717 { 3737 {
3718 ONE_MORE_BYTE (c); 3738 ONE_MORE_BYTE (c);
3725 } 3745 }
3726 *mask &= ~CATEGORY_MASK_BIG5; 3746 *mask &= ~CATEGORY_MASK_BIG5;
3727 return 0; 3747 return 0;
3728 3748
3729 no_more_source: 3749 no_more_source:
3730 if (!found) 3750 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3731 return 0; 3751 {
3732 *mask &= CATEGORY_MASK_BIG5; 3752 *mask &= ~CATEGORY_MASK_BIG5;
3733 return 1; 3753 return 0;
3754 }
3755 return found;
3734 } 3756 }
3735 3757
3736 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". 3758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3737 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ 3759 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */
3738 3760
3752 3774
3753 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 3775 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3754 3776
3755 val = charset_list; 3777 val = charset_list;
3756 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); 3778 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3757 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); 3779 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3758 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))); 3780 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3759 3781
3760 while (1) 3782 while (1)
3761 { 3783 {
3762 int c, c1; 3784 int c, c1;
3763 3785
3800 c = (c << 8) | c1; 3822 c = (c << 8) | c1;
3801 SJIS_TO_JIS (c); 3823 SJIS_TO_JIS (c);
3802 charset = charset_kanji; 3824 charset = charset_kanji;
3803 } 3825 }
3804 else 3826 else
3805 /* SJIS -> JISX0201-Kana */ 3827 {
3806 charset = charset_kana; 3828 /* SJIS -> JISX0201-Kana */
3829 c &= 0x7F;
3830 charset = charset_kana;
3831 }
3807 } 3832 }
3808 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); 3833 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3809 } 3834 }
3810 *charbuf++ = c; 3835 *charbuf++ = c;
3811 continue; 3836 continue;
4095 } 4120 }
4096 *mask &= ~CATEGORY_MASK_CCL; 4121 *mask &= ~CATEGORY_MASK_CCL;
4097 return 0; 4122 return 0;
4098 4123
4099 no_more_source: 4124 no_more_source:
4100 if (!found) 4125 return found;
4101 return 0;
4102 *mask &= CATEGORY_MASK_CCL;
4103 return 1;
4104 } 4126 }
4105 4127
4106 static void 4128 static void
4107 decode_coding_ccl (coding) 4129 decode_coding_ccl (coding)
4108 struct coding_system *coding; 4130 struct coding_system *coding;
4366 } 4388 }
4367 *mask &= ~CATEGORY_MASK_CHARSET; 4389 *mask &= ~CATEGORY_MASK_CHARSET;
4368 return 0; 4390 return 0;
4369 4391
4370 no_more_source: 4392 no_more_source:
4371 *mask &= CATEGORY_MASK_CHARSET;
4372 return 1; 4393 return 1;
4373 } 4394 }
4374 4395
4375 static void 4396 static void
4376 decode_coding_charset (coding) 4397 decode_coding_charset (coding)
4892 EOL_SEEN_XXX. */ 4913 EOL_SEEN_XXX. */
4893 4914
4894 #define MAX_EOL_CHECK_COUNT 3 4915 #define MAX_EOL_CHECK_COUNT 3
4895 4916
4896 static int 4917 static int
4897 detect_eol (coding, source, src_bytes) 4918 detect_eol (source, src_bytes, category)
4898 struct coding_system *coding;
4899 unsigned char *source; 4919 unsigned char *source;
4900 EMACS_INT src_bytes; 4920 EMACS_INT src_bytes;
4901 { 4921 enum coding_category category;
4902 Lisp_Object attrs, coding_type; 4922 {
4903 unsigned char *src = source, *src_end = src + src_bytes; 4923 unsigned char *src = source, *src_end = src + src_bytes;
4904 unsigned char c; 4924 unsigned char c;
4905 int total = 0; 4925 int total = 0;
4906 int eol_seen = EOL_SEEN_NONE; 4926 int eol_seen = EOL_SEEN_NONE;
4907 4927
4908 attrs = CODING_ID_ATTRS (coding->id); 4928 if ((1 << category) & CATEGORY_MASK_UTF_16)
4909 coding_type = CODING_ATTR_TYPE (attrs);
4910
4911 if (EQ (coding_type, Qccl))
4912 { 4929 {
4913 int msb, lsb; 4930 int msb, lsb;
4914 4931
4915 msb = coding->spec.utf_16.endian == utf_16_little_endian; 4932 msb = category == (coding_category_utf_16_le
4933 | coding_category_utf_16_le_nosig);
4916 lsb = 1 - msb; 4934 lsb = 1 - msb;
4917 4935
4918 while (src + 1 < src_end) 4936 while (src + 1 < src_end)
4919 { 4937 {
4920 c = src[lsb]; 4938 c = src[lsb];
5037 for (i = 0; i < coding_category_raw_text; i++) 5055 for (i = 0; i < coding_category_raw_text; i++)
5038 { 5056 {
5039 enum coding_category category = coding_priorities[i]; 5057 enum coding_category category = coding_priorities[i];
5040 struct coding_system *this = coding_categories + category; 5058 struct coding_system *this = coding_categories + category;
5041 5059
5042 if (category >= coding_category_raw_text
5043 || detected & (1 << category))
5044 continue;
5045
5046 if (this->id < 0) 5060 if (this->id < 0)
5047 { 5061 {
5048 /* No coding system of this category is defined. */ 5062 /* No coding system of this category is defined. */
5049 mask &= ~(1 << category); 5063 mask &= ~(1 << category);
5050 } 5064 }
5065 else if (category >= coding_category_raw_text
5066 || detected & (1 << category))
5067 continue;
5051 else 5068 else
5052 { 5069 {
5053 detected |= detected_mask[category]; 5070 detected |= detected_mask[category];
5054 if ((*(this->detector)) (coding, &mask)) 5071 if ((*(this->detector)) (coding, &mask)
5072 && (mask & (1 << category)))
5055 break; 5073 break;
5056 } 5074 }
5057 } 5075 }
5058 if (! mask) 5076 if (! mask)
5059 setup_coding_system (Qraw_text, coding); 5077 setup_coding_system (Qraw_text, coding);
5079 detection is impossible for a CCL based coding system, in which 5097 detection is impossible for a CCL based coding system, in which
5080 case, we detect the EOL type after decoding. */ 5098 case, we detect the EOL type after decoding. */
5081 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) 5099 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5082 && ! EQ (coding_type, Qccl)) 5100 && ! EQ (coding_type, Qccl))
5083 { 5101 {
5084 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes); 5102 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5103 XINT (CODING_ATTR_CATEGORY (attrs)));
5085 5104
5086 if (eol_seen != EOL_SEEN_NONE) 5105 if (eol_seen != EOL_SEEN_NONE)
5087 adjust_coding_eol_type (coding, eol_seen); 5106 adjust_coding_eol_type (coding, eol_seen);
5088 } 5107 }
5089 } 5108 }
6243 while (1) 6262 while (1)
6244 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); 6263 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6245 } 6264 }
6246 6265
6247 6266
6267 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6268 HIGHEST is nonzero, return the coding system of the highest
6269 priority among the detected coding systems. Otherwise return a
6270 list of detected coding systems sorted by their priorities. If
6271 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6272 multibyte form but contain only ASCII and eight-bit chars.
6273 Otherwise, the bytes are raw bytes.
6274
6275 CODING-SYSTEM controls the detection as below:
6276
6277 If it is nil, detect both text-format and eol-format. If the
6278 text-format part of CODING-SYSTEM is already specified
6279 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6280 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6281 detect only text-format. */
6282
6248 Lisp_Object 6283 Lisp_Object
6249 detect_coding_system (src, src_bytes, highest, multibytep, coding_system) 6284 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6250 unsigned char *src; 6285 unsigned char *src;
6251 int src_bytes, highest; 6286 int src_bytes, highest;
6252 int multibytep; 6287 int multibytep;
6257 int detected = 0; 6292 int detected = 0;
6258 int c, i; 6293 int c, i;
6259 Lisp_Object attrs, eol_type; 6294 Lisp_Object attrs, eol_type;
6260 Lisp_Object val; 6295 Lisp_Object val;
6261 struct coding_system coding; 6296 struct coding_system coding;
6297 int id;
6262 6298
6263 if (NILP (coding_system)) 6299 if (NILP (coding_system))
6264 coding_system = Qundecided; 6300 coding_system = Qundecided;
6265 setup_coding_system (coding_system, &coding); 6301 setup_coding_system (coding_system, &coding);
6266 attrs = CODING_ID_ATTRS (coding.id); 6302 attrs = CODING_ID_ATTRS (coding.id);
6267 eol_type = CODING_ID_EOL_TYPE (coding.id); 6303 eol_type = CODING_ID_EOL_TYPE (coding.id);
6304 coding_system = CODING_ATTR_BASE_NAME (attrs);
6268 6305
6269 coding.source = src; 6306 coding.source = src;
6270 coding.src_bytes = src_bytes; 6307 coding.src_bytes = src_bytes;
6271 coding.src_multibyte = multibytep; 6308 coding.src_multibyte = multibytep;
6272 coding.consumed = 0; 6309 coding.consumed = 0;
6273 6310 coding.mode |= CODING_MODE_LAST_BLOCK;
6274 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided) 6311
6275 { 6312 /* At first, detect text-format if necessary. */
6276 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); 6313 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6277 } 6314 {
6278 else
6279 {
6280 coding_system = Qnil;
6281 for (; src < src_end; src++) 6315 for (; src < src_end; src++)
6282 { 6316 {
6283 c = *src; 6317 c = *src;
6284 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC 6318 if (c & 0x80
6285 || c == ISO_CODE_SI 6319 || (c < 0x20 && (c == ISO_CODE_ESC
6286 || c == ISO_CODE_SO))) 6320 || c == ISO_CODE_SI
6321 || c == ISO_CODE_SO
6322 /* Most UTF-16 text contains '\0'. */
6323 || !c)))
6287 break; 6324 break;
6288 } 6325 }
6289 coding.head_ascii = src - coding.source; 6326 coding.head_ascii = src - coding.source;
6290 6327
6291 if (src < src_end) 6328 if (src < src_end)
6292 for (i = 0; i < coding_category_raw_text; i++) 6329 for (i = 0; i < coding_category_raw_text; i++)
6293 { 6330 {
6294 enum coding_category category = coding_priorities[i]; 6331 enum coding_category category = coding_priorities[i];
6295 struct coding_system *this = coding_categories + category; 6332 struct coding_system *this = coding_categories + category;
6296 6333
6297 if (category >= coding_category_raw_text
6298 || detected & (1 << category))
6299 continue;
6300
6301 if (this->id < 0) 6334 if (this->id < 0)
6302 { 6335 {
6303 /* No coding system of this category is defined. */ 6336 /* No coding system of this category is defined. */
6304 mask &= ~(1 << category); 6337 mask &= ~(1 << category);
6305 } 6338 }
6339 else if (category >= coding_category_raw_text
6340 || detected & (1 << category))
6341 continue;
6306 else 6342 else
6307 { 6343 {
6308 detected |= detected_mask[category]; 6344 detected |= detected_mask[category];
6309 if ((*(coding_categories[category].detector)) (&coding, &mask) 6345 if ((*(coding_categories[category].detector)) (&coding, &mask)
6310 && highest) 6346 && highest
6347 && (mask & (1 << category)))
6311 { 6348 {
6312 mask &= detected_mask[category]; 6349 mask = 1 << category;
6313 break; 6350 break;
6314 } 6351 }
6315 } 6352 }
6316 } 6353 }
6317 } 6354
6318 6355 if (!mask)
6319 if (!mask) 6356 {
6320 val = Fcons (make_number (coding_category_raw_text), Qnil); 6357 id = coding_categories[coding_category_raw_text].id;
6321 else if (mask == CATEGORY_MASK_ANY) 6358 val = Fcons (make_number (id), Qnil);
6322 val = Fcons (make_number (coding_category_undecided), Qnil); 6359 }
6323 else if (highest) 6360 else if (mask == CATEGORY_MASK_ANY)
6324 { 6361 {
6325 for (i = 0; i < coding_category_raw_text; i++) 6362 id = coding_categories[coding_category_undecided].id;
6326 if (mask & (1 << coding_priorities[i])) 6363 val = Fcons (make_number (id), Qnil);
6327 { 6364 }
6328 val = Fcons (make_number (coding_priorities[i]), Qnil); 6365 else if (highest)
6329 break; 6366 {
6330 } 6367 for (i = 0; i < coding_category_raw_text; i++)
6331 } 6368 if (mask & (1 << coding_priorities[i]))
6369 {
6370 id = coding_categories[coding_priorities[i]].id;
6371 val = Fcons (make_number (id), Qnil);
6372 break;
6373 }
6374 }
6375 else
6376 {
6377 val = Qnil;
6378 for (i = coding_category_raw_text - 1; i >= 0; i--)
6379 if (mask & (1 << coding_priorities[i]))
6380 {
6381 id = coding_categories[coding_priorities[i]].id;
6382 val = Fcons (make_number (id), val);
6383 }
6384 }
6385 }
6332 else 6386 else
6333 { 6387 {
6334 val = Qnil; 6388 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6335 for (i = coding_category_raw_text - 1; i >= 0; i--) 6389 val = Fcons (make_number (coding.id), Qnil);
6336 if (mask & (1 << coding_priorities[i])) 6390 }
6337 val = Fcons (make_number (coding_priorities[i]), val); 6391
6338 } 6392 /* Then, detect eol-format if necessary. */
6339
6340 { 6393 {
6341 int one_byte_eol = -1, two_byte_eol = -1; 6394 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6342 Lisp_Object tail; 6395 Lisp_Object tail;
6396
6397 if (VECTORP (eol_type))
6398 {
6399 if (mask & ~CATEGORY_MASK_UTF_16)
6400 normal_eol = detect_eol (coding.source, src_bytes,
6401 coding_category_raw_text);
6402 if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG))
6403 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6404 coding_category_utf_16_be);
6405 if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG))
6406 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6407 coding_category_utf_16_le);
6408 }
6409 else
6410 {
6411 if (EQ (eol_type, Qunix))
6412 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6413 else if (EQ (eol_type, Qdos))
6414 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6415 else
6416 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6417 }
6343 6418
6344 for (tail = val; CONSP (tail); tail = XCDR (tail)) 6419 for (tail = val; CONSP (tail); tail = XCDR (tail))
6345 { 6420 {
6346 struct coding_system *this 6421 enum coding_category category;
6347 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6348 : &coding);
6349 int this_eol; 6422 int this_eol;
6350 6423
6351 attrs = CODING_ID_ATTRS (this->id); 6424 id = XINT (XCAR (tail));
6352 eol_type = CODING_ID_EOL_TYPE (this->id); 6425 attrs = CODING_ID_ATTRS (id);
6353 XSETCAR (tail, CODING_ID_NAME (this->id)); 6426 category = XINT (CODING_ATTR_CATEGORY (attrs));
6427 eol_type = CODING_ID_EOL_TYPE (id);
6354 if (VECTORP (eol_type)) 6428 if (VECTORP (eol_type))
6355 { 6429 {
6356 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16)) 6430 if (category == coding_category_utf_16_be
6357 { 6431 || category == coding_category_utf_16_be_nosig)
6358 if (two_byte_eol < 0) 6432 this_eol = utf_16_be_eol;
6359 two_byte_eol = detect_eol (this, coding.source, src_bytes); 6433 else if (category == coding_category_utf_16_le
6360 this_eol = two_byte_eol; 6434 || category == coding_category_utf_16_le_nosig)
6361 } 6435 this_eol = utf_16_le_eol;
6362 else 6436 else
6363 { 6437 this_eol = normal_eol;
6364 if (one_byte_eol < 0) 6438
6365 one_byte_eol =detect_eol (this, coding.source, src_bytes);
6366 this_eol = one_byte_eol;
6367 }
6368 if (this_eol == EOL_SEEN_LF) 6439 if (this_eol == EOL_SEEN_LF)
6369 XSETCAR (tail, AREF (eol_type, 0)); 6440 XSETCAR (tail, AREF (eol_type, 0));
6370 else if (this_eol == EOL_SEEN_CRLF) 6441 else if (this_eol == EOL_SEEN_CRLF)
6371 XSETCAR (tail, AREF (eol_type, 1)); 6442 XSETCAR (tail, AREF (eol_type, 1));
6372 else if (this_eol == EOL_SEEN_CR) 6443 else if (this_eol == EOL_SEEN_CR)
6373 XSETCAR (tail, AREF (eol_type, 2)); 6444 XSETCAR (tail, AREF (eol_type, 2));
6445 else
6446 XSETCAR (tail, CODING_ID_NAME (id));
6374 } 6447 }
6448 else
6449 XSETCAR (tail, CODING_ID_NAME (id));
6375 } 6450 }
6376 } 6451 }
6377 6452
6378 return (highest ? XCAR (val) : val); 6453 return (highest ? XCAR (val) : val);
6379 } 6454 }