Mercurial > emacs
comparison src/coding.c @ 89193:311d061195ef
(detect_coding_utf_8): Check incomplete byte sequence.
Don't update *mask when correctly detected.
(detect_coding_utf_16): Likewise.
(detect_coding_emacs_mule): Likewise.
(detect_coding_iso_2022): Likewise.
(detect_coding_sjis): Likewise.
(detect_coding_big5): Likewise.
(detect_coding_ccl): Likewise.
(decode_coding_sjis): Fix decoding of katakana-jisx0201.
(detect_eol): Delete the argument CODING, and add the argument
CATEGORY.
(detect_coding): Adjusted for the changes above.
(detect_coding_system): Likewise.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Thu, 10 Oct 2002 09:05:37 +0000 |
| parents | 88a9e962e183 |
| children | c232917f49f7 |
comparison
equal
deleted
inserted
replaced
| 89192:f6651a1271ae | 89193:311d061195ef |
|---|---|
| 1029 unsigned char *src = coding->source, *src_base = src; | 1029 unsigned char *src = coding->source, *src_base = src; |
| 1030 unsigned char *src_end = coding->source + coding->src_bytes; | 1030 unsigned char *src_end = coding->source + coding->src_bytes; |
| 1031 int multibytep = coding->src_multibyte; | 1031 int multibytep = coding->src_multibyte; |
| 1032 int consumed_chars = 0; | 1032 int consumed_chars = 0; |
| 1033 int found = 0; | 1033 int found = 0; |
| 1034 int incomplete; | |
| 1034 | 1035 |
| 1035 /* A coding system of this category is always ASCII compatible. */ | 1036 /* A coding system of this category is always ASCII compatible. */ |
| 1036 src += coding->head_ascii; | 1037 src += coding->head_ascii; |
| 1037 | 1038 |
| 1038 while (1) | 1039 while (1) |
| 1039 { | 1040 { |
| 1040 int c, c1, c2, c3, c4; | 1041 int c, c1, c2, c3, c4; |
| 1041 | 1042 |
| 1043 incomplete = 0; | |
| 1042 ONE_MORE_BYTE (c); | 1044 ONE_MORE_BYTE (c); |
| 1043 if (UTF_8_1_OCTET_P (c)) | 1045 if (UTF_8_1_OCTET_P (c)) |
| 1044 continue; | 1046 continue; |
| 1047 incomplete = 1; | |
| 1045 ONE_MORE_BYTE (c1); | 1048 ONE_MORE_BYTE (c1); |
| 1046 if (! UTF_8_EXTRA_OCTET_P (c1)) | 1049 if (! UTF_8_EXTRA_OCTET_P (c1)) |
| 1047 break; | 1050 break; |
| 1048 if (UTF_8_2_OCTET_LEADING_P (c)) | 1051 if (UTF_8_2_OCTET_LEADING_P (c)) |
| 1049 { | 1052 { |
| 1078 } | 1081 } |
| 1079 *mask &= ~CATEGORY_MASK_UTF_8; | 1082 *mask &= ~CATEGORY_MASK_UTF_8; |
| 1080 return 0; | 1083 return 0; |
| 1081 | 1084 |
| 1082 no_more_source: | 1085 no_more_source: |
| 1083 if (! found) | 1086 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1084 return 0; | 1087 { |
| 1085 *mask &= CATEGORY_MASK_UTF_8; | 1088 *mask &= ~CATEGORY_MASK_UTF_8; |
| 1086 return 1; | 1089 return 0; |
| 1090 } | |
| 1091 return found; | |
| 1087 } | 1092 } |
| 1088 | 1093 |
| 1089 | 1094 |
| 1090 static void | 1095 static void |
| 1091 decode_coding_utf_8 (coding) | 1096 decode_coding_utf_8 (coding) |
| 1287 unsigned char *src_end = coding->source + coding->src_bytes; | 1292 unsigned char *src_end = coding->source + coding->src_bytes; |
| 1288 int multibytep = coding->src_multibyte; | 1293 int multibytep = coding->src_multibyte; |
| 1289 int consumed_chars = 0; | 1294 int consumed_chars = 0; |
| 1290 int c1, c2; | 1295 int c1, c2; |
| 1291 | 1296 |
| 1297 *mask &= ~CATEGORY_MASK_UTF_16; | |
| 1298 | |
| 1292 ONE_MORE_BYTE (c1); | 1299 ONE_MORE_BYTE (c1); |
| 1293 ONE_MORE_BYTE (c2); | 1300 ONE_MORE_BYTE (c2); |
| 1294 | 1301 |
| 1295 if ((c1 == 0xFF) && (c2 == 0xFE)) | 1302 if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 1296 { | 1303 *mask |= CATEGORY_MASK_UTF_16_LE; |
| 1297 *mask &= CATEGORY_MASK_UTF_16_LE; | |
| 1298 return 1; | |
| 1299 } | |
| 1300 else if ((c1 == 0xFE) && (c2 == 0xFF)) | 1304 else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 1301 { | 1305 *mask |= CATEGORY_MASK_UTF_16_BE; |
| 1302 *mask &= CATEGORY_MASK_UTF_16_BE; | 1306 else |
| 1303 return 1; | 1307 *mask |= CATEGORY_MASK_UTF_16_BE_NOSIG | CATEGORY_MASK_UTF_16_LE_NOSIG; |
| 1304 } | 1308 return 1; |
| 1309 | |
| 1305 no_more_source: | 1310 no_more_source: |
| 1306 return 0; | 1311 return 0; |
| 1307 } | 1312 } |
| 1308 | 1313 |
| 1309 static void | 1314 static void |
| 1641 unsigned char *src_end = coding->source + coding->src_bytes; | 1646 unsigned char *src_end = coding->source + coding->src_bytes; |
| 1642 int multibytep = coding->src_multibyte; | 1647 int multibytep = coding->src_multibyte; |
| 1643 int consumed_chars = 0; | 1648 int consumed_chars = 0; |
| 1644 int c; | 1649 int c; |
| 1645 int found = 0; | 1650 int found = 0; |
| 1651 int incomplete; | |
| 1646 | 1652 |
| 1647 /* A coding system of this category is always ASCII compatible. */ | 1653 /* A coding system of this category is always ASCII compatible. */ |
| 1648 src += coding->head_ascii; | 1654 src += coding->head_ascii; |
| 1649 | 1655 |
| 1650 while (1) | 1656 while (1) |
| 1651 { | 1657 { |
| 1658 incomplete = 0; | |
| 1652 ONE_MORE_BYTE (c); | 1659 ONE_MORE_BYTE (c); |
| 1660 incomplete = 1; | |
| 1653 | 1661 |
| 1654 if (c == 0x80) | 1662 if (c == 0x80) |
| 1655 { | 1663 { |
| 1656 /* Perhaps the start of composite character. We simply skip | 1664 /* Perhaps the start of composite character. We simply skip |
| 1657 it because analyzing it is too heavy for detecting. But, | 1665 it because analyzing it is too heavy for detecting. But, |
| 1696 } | 1704 } |
| 1697 *mask &= ~CATEGORY_MASK_EMACS_MULE; | 1705 *mask &= ~CATEGORY_MASK_EMACS_MULE; |
| 1698 return 0; | 1706 return 0; |
| 1699 | 1707 |
| 1700 no_more_source: | 1708 no_more_source: |
| 1701 if (!found) | 1709 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 1702 return 0; | 1710 { |
| 1703 *mask &= CATEGORY_MASK_EMACS_MULE; | 1711 *mask &= ~CATEGORY_MASK_EMACS_MULE; |
| 1704 return 1; | 1712 return 0; |
| 1713 } | |
| 1714 return found; | |
| 1705 } | 1715 } |
| 1706 | 1716 |
| 1707 | 1717 |
| 1708 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ | 1718 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */ |
| 1709 | 1719 |
| 2463 case ISO_CODE_SS2: | 2473 case ISO_CODE_SS2: |
| 2464 case ISO_CODE_SS3: | 2474 case ISO_CODE_SS3: |
| 2465 { | 2475 { |
| 2466 int newmask = CATEGORY_MASK_ISO_8_ELSE; | 2476 int newmask = CATEGORY_MASK_ISO_8_ELSE; |
| 2467 | 2477 |
| 2478 mask_8bit_found = 1; | |
| 2468 if (inhibit_iso_escape_detection) | 2479 if (inhibit_iso_escape_detection) |
| 2469 break; | 2480 break; |
| 2470 if (c != ISO_CODE_CSI) | 2481 if (c != ISO_CODE_CSI) |
| 2471 { | 2482 { |
| 2472 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) | 2483 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1]) |
| 2556 *mask &= ~CATEGORY_MASK_ISO; | 2567 *mask &= ~CATEGORY_MASK_ISO; |
| 2557 return 0; | 2568 return 0; |
| 2558 } | 2569 } |
| 2559 if (!mask_found) | 2570 if (!mask_found) |
| 2560 return 0; | 2571 return 0; |
| 2561 *mask &= mask_iso & mask_found; | 2572 *mask &= ~CATEGORY_MASK_ISO; |
| 2573 *mask |= mask_iso & mask_found; | |
| 2562 if (! mask_8bit_found) | 2574 if (! mask_8bit_found) |
| 2563 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); | 2575 *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE); |
| 2564 return 1; | 2576 return 1; |
| 2565 } | 2577 } |
| 2566 | 2578 |
| 3656 unsigned char *src_end = coding->source + coding->src_bytes; | 3668 unsigned char *src_end = coding->source + coding->src_bytes; |
| 3657 int multibytep = coding->src_multibyte; | 3669 int multibytep = coding->src_multibyte; |
| 3658 int consumed_chars = 0; | 3670 int consumed_chars = 0; |
| 3659 int found = 0; | 3671 int found = 0; |
| 3660 int c; | 3672 int c; |
| 3673 int incomplete; | |
| 3661 | 3674 |
| 3662 /* A coding system of this category is always ASCII compatible. */ | 3675 /* A coding system of this category is always ASCII compatible. */ |
| 3663 src += coding->head_ascii; | 3676 src += coding->head_ascii; |
| 3664 | 3677 |
| 3665 while (1) | 3678 while (1) |
| 3666 { | 3679 { |
| 3680 incomplete = 0; | |
| 3667 ONE_MORE_BYTE (c); | 3681 ONE_MORE_BYTE (c); |
| 3682 incomplete = 1; | |
| 3668 if (c < 0x80) | 3683 if (c < 0x80) |
| 3669 continue; | 3684 continue; |
| 3670 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) | 3685 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF)) |
| 3671 { | 3686 { |
| 3672 ONE_MORE_BYTE (c); | 3687 ONE_MORE_BYTE (c); |
| 3681 } | 3696 } |
| 3682 *mask &= ~CATEGORY_MASK_SJIS; | 3697 *mask &= ~CATEGORY_MASK_SJIS; |
| 3683 return 0; | 3698 return 0; |
| 3684 | 3699 |
| 3685 no_more_source: | 3700 no_more_source: |
| 3686 if (!found) | 3701 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3687 return 0; | 3702 { |
| 3688 *mask &= CATEGORY_MASK_SJIS; | 3703 *mask &= ~CATEGORY_MASK_SJIS; |
| 3689 return 1; | 3704 return 0; |
| 3705 } | |
| 3706 return found; | |
| 3690 } | 3707 } |
| 3691 | 3708 |
| 3692 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3709 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 3693 Check if a text is encoded in BIG5. If it is, return | 3710 Check if a text is encoded in BIG5. If it is, return |
| 3694 CATEGORY_MASK_BIG5, else return 0. */ | 3711 CATEGORY_MASK_BIG5, else return 0. */ |
| 3702 unsigned char *src_end = coding->source + coding->src_bytes; | 3719 unsigned char *src_end = coding->source + coding->src_bytes; |
| 3703 int multibytep = coding->src_multibyte; | 3720 int multibytep = coding->src_multibyte; |
| 3704 int consumed_chars = 0; | 3721 int consumed_chars = 0; |
| 3705 int found = 0; | 3722 int found = 0; |
| 3706 int c; | 3723 int c; |
| 3724 int incomplete; | |
| 3707 | 3725 |
| 3708 /* A coding system of this category is always ASCII compatible. */ | 3726 /* A coding system of this category is always ASCII compatible. */ |
| 3709 src += coding->head_ascii; | 3727 src += coding->head_ascii; |
| 3710 | 3728 |
| 3711 while (1) | 3729 while (1) |
| 3712 { | 3730 { |
| 3731 incomplete = 0; | |
| 3713 ONE_MORE_BYTE (c); | 3732 ONE_MORE_BYTE (c); |
| 3733 incomplete = 1; | |
| 3714 if (c < 0x80) | 3734 if (c < 0x80) |
| 3715 continue; | 3735 continue; |
| 3716 if (c >= 0xA1) | 3736 if (c >= 0xA1) |
| 3717 { | 3737 { |
| 3718 ONE_MORE_BYTE (c); | 3738 ONE_MORE_BYTE (c); |
| 3725 } | 3745 } |
| 3726 *mask &= ~CATEGORY_MASK_BIG5; | 3746 *mask &= ~CATEGORY_MASK_BIG5; |
| 3727 return 0; | 3747 return 0; |
| 3728 | 3748 |
| 3729 no_more_source: | 3749 no_more_source: |
| 3730 if (!found) | 3750 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK) |
| 3731 return 0; | 3751 { |
| 3732 *mask &= CATEGORY_MASK_BIG5; | 3752 *mask &= ~CATEGORY_MASK_BIG5; |
| 3733 return 1; | 3753 return 0; |
| 3754 } | |
| 3755 return found; | |
| 3734 } | 3756 } |
| 3735 | 3757 |
| 3736 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3758 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| 3737 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ | 3759 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ |
| 3738 | 3760 |
| 3752 | 3774 |
| 3753 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 3775 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 3754 | 3776 |
| 3755 val = charset_list; | 3777 val = charset_list; |
| 3756 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | 3778 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3757 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); | 3779 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val); |
| 3758 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))); | 3780 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))); |
| 3759 | 3781 |
| 3760 while (1) | 3782 while (1) |
| 3761 { | 3783 { |
| 3762 int c, c1; | 3784 int c, c1; |
| 3763 | 3785 |
| 3800 c = (c << 8) | c1; | 3822 c = (c << 8) | c1; |
| 3801 SJIS_TO_JIS (c); | 3823 SJIS_TO_JIS (c); |
| 3802 charset = charset_kanji; | 3824 charset = charset_kanji; |
| 3803 } | 3825 } |
| 3804 else | 3826 else |
| 3805 /* SJIS -> JISX0201-Kana */ | 3827 { |
| 3806 charset = charset_kana; | 3828 /* SJIS -> JISX0201-Kana */ |
| 3829 c &= 0x7F; | |
| 3830 charset = charset_kana; | |
| 3831 } | |
| 3807 } | 3832 } |
| 3808 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); | 3833 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c); |
| 3809 } | 3834 } |
| 3810 *charbuf++ = c; | 3835 *charbuf++ = c; |
| 3811 continue; | 3836 continue; |
| 4095 } | 4120 } |
| 4096 *mask &= ~CATEGORY_MASK_CCL; | 4121 *mask &= ~CATEGORY_MASK_CCL; |
| 4097 return 0; | 4122 return 0; |
| 4098 | 4123 |
| 4099 no_more_source: | 4124 no_more_source: |
| 4100 if (!found) | 4125 return found; |
| 4101 return 0; | |
| 4102 *mask &= CATEGORY_MASK_CCL; | |
| 4103 return 1; | |
| 4104 } | 4126 } |
| 4105 | 4127 |
| 4106 static void | 4128 static void |
| 4107 decode_coding_ccl (coding) | 4129 decode_coding_ccl (coding) |
| 4108 struct coding_system *coding; | 4130 struct coding_system *coding; |
| 4366 } | 4388 } |
| 4367 *mask &= ~CATEGORY_MASK_CHARSET; | 4389 *mask &= ~CATEGORY_MASK_CHARSET; |
| 4368 return 0; | 4390 return 0; |
| 4369 | 4391 |
| 4370 no_more_source: | 4392 no_more_source: |
| 4371 *mask &= CATEGORY_MASK_CHARSET; | |
| 4372 return 1; | 4393 return 1; |
| 4373 } | 4394 } |
| 4374 | 4395 |
| 4375 static void | 4396 static void |
| 4376 decode_coding_charset (coding) | 4397 decode_coding_charset (coding) |
| 4892 EOL_SEEN_XXX. */ | 4913 EOL_SEEN_XXX. */ |
| 4893 | 4914 |
| 4894 #define MAX_EOL_CHECK_COUNT 3 | 4915 #define MAX_EOL_CHECK_COUNT 3 |
| 4895 | 4916 |
| 4896 static int | 4917 static int |
| 4897 detect_eol (coding, source, src_bytes) | 4918 detect_eol (source, src_bytes, category) |
| 4898 struct coding_system *coding; | |
| 4899 unsigned char *source; | 4919 unsigned char *source; |
| 4900 EMACS_INT src_bytes; | 4920 EMACS_INT src_bytes; |
| 4901 { | 4921 enum coding_category category; |
| 4902 Lisp_Object attrs, coding_type; | 4922 { |
| 4903 unsigned char *src = source, *src_end = src + src_bytes; | 4923 unsigned char *src = source, *src_end = src + src_bytes; |
| 4904 unsigned char c; | 4924 unsigned char c; |
| 4905 int total = 0; | 4925 int total = 0; |
| 4906 int eol_seen = EOL_SEEN_NONE; | 4926 int eol_seen = EOL_SEEN_NONE; |
| 4907 | 4927 |
| 4908 attrs = CODING_ID_ATTRS (coding->id); | 4928 if ((1 << category) & CATEGORY_MASK_UTF_16) |
| 4909 coding_type = CODING_ATTR_TYPE (attrs); | |
| 4910 | |
| 4911 if (EQ (coding_type, Qccl)) | |
| 4912 { | 4929 { |
| 4913 int msb, lsb; | 4930 int msb, lsb; |
| 4914 | 4931 |
| 4915 msb = coding->spec.utf_16.endian == utf_16_little_endian; | 4932 msb = category == (coding_category_utf_16_le |
| 4933 | coding_category_utf_16_le_nosig); | |
| 4916 lsb = 1 - msb; | 4934 lsb = 1 - msb; |
| 4917 | 4935 |
| 4918 while (src + 1 < src_end) | 4936 while (src + 1 < src_end) |
| 4919 { | 4937 { |
| 4920 c = src[lsb]; | 4938 c = src[lsb]; |
| 5037 for (i = 0; i < coding_category_raw_text; i++) | 5055 for (i = 0; i < coding_category_raw_text; i++) |
| 5038 { | 5056 { |
| 5039 enum coding_category category = coding_priorities[i]; | 5057 enum coding_category category = coding_priorities[i]; |
| 5040 struct coding_system *this = coding_categories + category; | 5058 struct coding_system *this = coding_categories + category; |
| 5041 | 5059 |
| 5042 if (category >= coding_category_raw_text | |
| 5043 || detected & (1 << category)) | |
| 5044 continue; | |
| 5045 | |
| 5046 if (this->id < 0) | 5060 if (this->id < 0) |
| 5047 { | 5061 { |
| 5048 /* No coding system of this category is defined. */ | 5062 /* No coding system of this category is defined. */ |
| 5049 mask &= ~(1 << category); | 5063 mask &= ~(1 << category); |
| 5050 } | 5064 } |
| 5065 else if (category >= coding_category_raw_text | |
| 5066 || detected & (1 << category)) | |
| 5067 continue; | |
| 5051 else | 5068 else |
| 5052 { | 5069 { |
| 5053 detected |= detected_mask[category]; | 5070 detected |= detected_mask[category]; |
| 5054 if ((*(this->detector)) (coding, &mask)) | 5071 if ((*(this->detector)) (coding, &mask) |
| 5072 && (mask & (1 << category))) | |
| 5055 break; | 5073 break; |
| 5056 } | 5074 } |
| 5057 } | 5075 } |
| 5058 if (! mask) | 5076 if (! mask) |
| 5059 setup_coding_system (Qraw_text, coding); | 5077 setup_coding_system (Qraw_text, coding); |
| 5079 detection is impossible for a CCL based coding system, in which | 5097 detection is impossible for a CCL based coding system, in which |
| 5080 case, we detect the EOL type after decoding. */ | 5098 case, we detect the EOL type after decoding. */ |
| 5081 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) | 5099 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)) |
| 5082 && ! EQ (coding_type, Qccl)) | 5100 && ! EQ (coding_type, Qccl)) |
| 5083 { | 5101 { |
| 5084 int eol_seen = detect_eol (coding, coding->source, coding->src_bytes); | 5102 int eol_seen = detect_eol (coding->source, coding->src_bytes, |
| 5103 XINT (CODING_ATTR_CATEGORY (attrs))); | |
| 5085 | 5104 |
| 5086 if (eol_seen != EOL_SEEN_NONE) | 5105 if (eol_seen != EOL_SEEN_NONE) |
| 5087 adjust_coding_eol_type (coding, eol_seen); | 5106 adjust_coding_eol_type (coding, eol_seen); |
| 5088 } | 5107 } |
| 5089 } | 5108 } |
| 6243 while (1) | 6262 while (1) |
| 6244 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); | 6263 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); |
| 6245 } | 6264 } |
| 6246 | 6265 |
| 6247 | 6266 |
| 6267 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If | |
| 6268 HIGHEST is nonzero, return the coding system of the highest | |
| 6269 priority among the detected coding systems. Otherwise return a | |
| 6270 list of detected coding systems sorted by their priorities. If | |
| 6271 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct | |
| 6272 multibyte form but contain only ASCII and eight-bit chars. | |
| 6273 Otherwise, the bytes are raw bytes. | |
| 6274 | |
| 6275 CODING-SYSTEM controls the detection as below: | |
| 6276 | |
| 6277 If it is nil, detect both text-format and eol-format. If the | |
| 6278 text-format part of CODING-SYSTEM is already specified | |
| 6279 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format | |
| 6280 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'), | |
| 6281 detect only text-format. */ | |
| 6282 | |
| 6248 Lisp_Object | 6283 Lisp_Object |
| 6249 detect_coding_system (src, src_bytes, highest, multibytep, coding_system) | 6284 detect_coding_system (src, src_bytes, highest, multibytep, coding_system) |
| 6250 unsigned char *src; | 6285 unsigned char *src; |
| 6251 int src_bytes, highest; | 6286 int src_bytes, highest; |
| 6252 int multibytep; | 6287 int multibytep; |
| 6257 int detected = 0; | 6292 int detected = 0; |
| 6258 int c, i; | 6293 int c, i; |
| 6259 Lisp_Object attrs, eol_type; | 6294 Lisp_Object attrs, eol_type; |
| 6260 Lisp_Object val; | 6295 Lisp_Object val; |
| 6261 struct coding_system coding; | 6296 struct coding_system coding; |
| 6297 int id; | |
| 6262 | 6298 |
| 6263 if (NILP (coding_system)) | 6299 if (NILP (coding_system)) |
| 6264 coding_system = Qundecided; | 6300 coding_system = Qundecided; |
| 6265 setup_coding_system (coding_system, &coding); | 6301 setup_coding_system (coding_system, &coding); |
| 6266 attrs = CODING_ID_ATTRS (coding.id); | 6302 attrs = CODING_ID_ATTRS (coding.id); |
| 6267 eol_type = CODING_ID_EOL_TYPE (coding.id); | 6303 eol_type = CODING_ID_EOL_TYPE (coding.id); |
| 6304 coding_system = CODING_ATTR_BASE_NAME (attrs); | |
| 6268 | 6305 |
| 6269 coding.source = src; | 6306 coding.source = src; |
| 6270 coding.src_bytes = src_bytes; | 6307 coding.src_bytes = src_bytes; |
| 6271 coding.src_multibyte = multibytep; | 6308 coding.src_multibyte = multibytep; |
| 6272 coding.consumed = 0; | 6309 coding.consumed = 0; |
| 6273 | 6310 coding.mode |= CODING_MODE_LAST_BLOCK; |
| 6274 if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided) | 6311 |
| 6275 { | 6312 /* At first, detect text-format if necessary. */ |
| 6276 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); | 6313 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided) |
| 6277 } | 6314 { |
| 6278 else | |
| 6279 { | |
| 6280 coding_system = Qnil; | |
| 6281 for (; src < src_end; src++) | 6315 for (; src < src_end; src++) |
| 6282 { | 6316 { |
| 6283 c = *src; | 6317 c = *src; |
| 6284 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC | 6318 if (c & 0x80 |
| 6285 || c == ISO_CODE_SI | 6319 || (c < 0x20 && (c == ISO_CODE_ESC |
| 6286 || c == ISO_CODE_SO))) | 6320 || c == ISO_CODE_SI |
| 6321 || c == ISO_CODE_SO | |
| 6322 /* Most UTF-16 text contains '\0'. */ | |
| 6323 || !c))) | |
| 6287 break; | 6324 break; |
| 6288 } | 6325 } |
| 6289 coding.head_ascii = src - coding.source; | 6326 coding.head_ascii = src - coding.source; |
| 6290 | 6327 |
| 6291 if (src < src_end) | 6328 if (src < src_end) |
| 6292 for (i = 0; i < coding_category_raw_text; i++) | 6329 for (i = 0; i < coding_category_raw_text; i++) |
| 6293 { | 6330 { |
| 6294 enum coding_category category = coding_priorities[i]; | 6331 enum coding_category category = coding_priorities[i]; |
| 6295 struct coding_system *this = coding_categories + category; | 6332 struct coding_system *this = coding_categories + category; |
| 6296 | 6333 |
| 6297 if (category >= coding_category_raw_text | |
| 6298 || detected & (1 << category)) | |
| 6299 continue; | |
| 6300 | |
| 6301 if (this->id < 0) | 6334 if (this->id < 0) |
| 6302 { | 6335 { |
| 6303 /* No coding system of this category is defined. */ | 6336 /* No coding system of this category is defined. */ |
| 6304 mask &= ~(1 << category); | 6337 mask &= ~(1 << category); |
| 6305 } | 6338 } |
| 6339 else if (category >= coding_category_raw_text | |
| 6340 || detected & (1 << category)) | |
| 6341 continue; | |
| 6306 else | 6342 else |
| 6307 { | 6343 { |
| 6308 detected |= detected_mask[category]; | 6344 detected |= detected_mask[category]; |
| 6309 if ((*(coding_categories[category].detector)) (&coding, &mask) | 6345 if ((*(coding_categories[category].detector)) (&coding, &mask) |
| 6310 && highest) | 6346 && highest |
| 6347 && (mask & (1 << category))) | |
| 6311 { | 6348 { |
| 6312 mask &= detected_mask[category]; | 6349 mask = 1 << category; |
| 6313 break; | 6350 break; |
| 6314 } | 6351 } |
| 6315 } | 6352 } |
| 6316 } | 6353 } |
| 6317 } | 6354 |
| 6318 | 6355 if (!mask) |
| 6319 if (!mask) | 6356 { |
| 6320 val = Fcons (make_number (coding_category_raw_text), Qnil); | 6357 id = coding_categories[coding_category_raw_text].id; |
| 6321 else if (mask == CATEGORY_MASK_ANY) | 6358 val = Fcons (make_number (id), Qnil); |
| 6322 val = Fcons (make_number (coding_category_undecided), Qnil); | 6359 } |
| 6323 else if (highest) | 6360 else if (mask == CATEGORY_MASK_ANY) |
| 6324 { | 6361 { |
| 6325 for (i = 0; i < coding_category_raw_text; i++) | 6362 id = coding_categories[coding_category_undecided].id; |
| 6326 if (mask & (1 << coding_priorities[i])) | 6363 val = Fcons (make_number (id), Qnil); |
| 6327 { | 6364 } |
| 6328 val = Fcons (make_number (coding_priorities[i]), Qnil); | 6365 else if (highest) |
| 6329 break; | 6366 { |
| 6330 } | 6367 for (i = 0; i < coding_category_raw_text; i++) |
| 6331 } | 6368 if (mask & (1 << coding_priorities[i])) |
| 6369 { | |
| 6370 id = coding_categories[coding_priorities[i]].id; | |
| 6371 val = Fcons (make_number (id), Qnil); | |
| 6372 break; | |
| 6373 } | |
| 6374 } | |
| 6375 else | |
| 6376 { | |
| 6377 val = Qnil; | |
| 6378 for (i = coding_category_raw_text - 1; i >= 0; i--) | |
| 6379 if (mask & (1 << coding_priorities[i])) | |
| 6380 { | |
| 6381 id = coding_categories[coding_priorities[i]].id; | |
| 6382 val = Fcons (make_number (id), val); | |
| 6383 } | |
| 6384 } | |
| 6385 } | |
| 6332 else | 6386 else |
| 6333 { | 6387 { |
| 6334 val = Qnil; | 6388 mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs)); |
| 6335 for (i = coding_category_raw_text - 1; i >= 0; i--) | 6389 val = Fcons (make_number (coding.id), Qnil); |
| 6336 if (mask & (1 << coding_priorities[i])) | 6390 } |
| 6337 val = Fcons (make_number (coding_priorities[i]), val); | 6391 |
| 6338 } | 6392 /* Then, detect eol-format if necessary. */ |
| 6339 | |
| 6340 { | 6393 { |
| 6341 int one_byte_eol = -1, two_byte_eol = -1; | 6394 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol; |
| 6342 Lisp_Object tail; | 6395 Lisp_Object tail; |
| 6396 | |
| 6397 if (VECTORP (eol_type)) | |
| 6398 { | |
| 6399 if (mask & ~CATEGORY_MASK_UTF_16) | |
| 6400 normal_eol = detect_eol (coding.source, src_bytes, | |
| 6401 coding_category_raw_text); | |
| 6402 if (mask & (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_BE_NOSIG)) | |
| 6403 utf_16_be_eol = detect_eol (coding.source, src_bytes, | |
| 6404 coding_category_utf_16_be); | |
| 6405 if (mask & (CATEGORY_MASK_UTF_16_LE | CATEGORY_MASK_UTF_16_LE_NOSIG)) | |
| 6406 utf_16_le_eol = detect_eol (coding.source, src_bytes, | |
| 6407 coding_category_utf_16_le); | |
| 6408 } | |
| 6409 else | |
| 6410 { | |
| 6411 if (EQ (eol_type, Qunix)) | |
| 6412 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF; | |
| 6413 else if (EQ (eol_type, Qdos)) | |
| 6414 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF; | |
| 6415 else | |
| 6416 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR; | |
| 6417 } | |
| 6343 | 6418 |
| 6344 for (tail = val; CONSP (tail); tail = XCDR (tail)) | 6419 for (tail = val; CONSP (tail); tail = XCDR (tail)) |
| 6345 { | 6420 { |
| 6346 struct coding_system *this | 6421 enum coding_category category; |
| 6347 = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail)) | |
| 6348 : &coding); | |
| 6349 int this_eol; | 6422 int this_eol; |
| 6350 | 6423 |
| 6351 attrs = CODING_ID_ATTRS (this->id); | 6424 id = XINT (XCAR (tail)); |
| 6352 eol_type = CODING_ID_EOL_TYPE (this->id); | 6425 attrs = CODING_ID_ATTRS (id); |
| 6353 XSETCAR (tail, CODING_ID_NAME (this->id)); | 6426 category = XINT (CODING_ATTR_CATEGORY (attrs)); |
| 6427 eol_type = CODING_ID_EOL_TYPE (id); | |
| 6354 if (VECTORP (eol_type)) | 6428 if (VECTORP (eol_type)) |
| 6355 { | 6429 { |
| 6356 if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16)) | 6430 if (category == coding_category_utf_16_be |
| 6357 { | 6431 || category == coding_category_utf_16_be_nosig) |
| 6358 if (two_byte_eol < 0) | 6432 this_eol = utf_16_be_eol; |
| 6359 two_byte_eol = detect_eol (this, coding.source, src_bytes); | 6433 else if (category == coding_category_utf_16_le |
| 6360 this_eol = two_byte_eol; | 6434 || category == coding_category_utf_16_le_nosig) |
| 6361 } | 6435 this_eol = utf_16_le_eol; |
| 6362 else | 6436 else |
| 6363 { | 6437 this_eol = normal_eol; |
| 6364 if (one_byte_eol < 0) | 6438 |
| 6365 one_byte_eol =detect_eol (this, coding.source, src_bytes); | |
| 6366 this_eol = one_byte_eol; | |
| 6367 } | |
| 6368 if (this_eol == EOL_SEEN_LF) | 6439 if (this_eol == EOL_SEEN_LF) |
| 6369 XSETCAR (tail, AREF (eol_type, 0)); | 6440 XSETCAR (tail, AREF (eol_type, 0)); |
| 6370 else if (this_eol == EOL_SEEN_CRLF) | 6441 else if (this_eol == EOL_SEEN_CRLF) |
| 6371 XSETCAR (tail, AREF (eol_type, 1)); | 6442 XSETCAR (tail, AREF (eol_type, 1)); |
| 6372 else if (this_eol == EOL_SEEN_CR) | 6443 else if (this_eol == EOL_SEEN_CR) |
| 6373 XSETCAR (tail, AREF (eol_type, 2)); | 6444 XSETCAR (tail, AREF (eol_type, 2)); |
| 6445 else | |
| 6446 XSETCAR (tail, CODING_ID_NAME (id)); | |
| 6374 } | 6447 } |
| 6448 else | |
| 6449 XSETCAR (tail, CODING_ID_NAME (id)); | |
| 6375 } | 6450 } |
| 6376 } | 6451 } |
| 6377 | 6452 |
| 6378 return (highest ? XCAR (val) : val); | 6453 return (highest ? XCAR (val) : val); |
| 6379 } | 6454 } |
