Mercurial > emacs
comparison src/coding.c @ 72395:94e4795b333d
(ONE_MORE_BYTE_CHECK_MULTIBYTE): New arg RET. If SRC
is exhausted, return with RET.
(detect_coding_emacs_mule, detect_coding_iso2022)
(detect_coding_sjis, detect_coding_big5, detect_coding_utf_8)
(detect_coding_utf_16, detect_coding_ccl): Adjusted for the above
change.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Tue, 15 Aug 2006 02:41:29 +0000 |
| parents | af796bc81ff0 |
| children | 6493d4697ad2 694bbb62a75d |
comparison
equal
deleted
inserted
replaced
| 72394:bec9a701aee6 | 72395:94e4795b333d |
|---|---|
| 217 c2 = *src++; \ | 217 c2 = *src++; \ |
| 218 } while (0) | 218 } while (0) |
| 219 | 219 |
| 220 | 220 |
| 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte | 221 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte |
| 222 form if MULTIBYTEP is nonzero. */ | 222 form if MULTIBYTEP is nonzero. In addition, if SRC is not less |
| 223 | 223 than SRC_END, return with RET. */ |
| 224 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ | 224 |
| | 225 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep, ret) \ |
| 225 do { \ | 226 do { \ |
| 226 if (src >= src_end) \ | 227 if (src >= src_end) \ |
| 227 { \ | 228 { \ |
| 228 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ | 229 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ |
| 229 goto label_end_of_loop; \ | 230 return ret; \ |
| 230 } \ | 231 } \ |
| 231 c1 = *src++; \ | 232 c1 = *src++; \ |
| 232 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ | 233 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ |
| 233 c1 = *src++ - 0x20; \ | 234 c1 = *src++ - 0x20; \ |
| 234 } while (0) | 235 } while (0) |
| 630 struct coding_system dummy_coding; | 631 struct coding_system dummy_coding; |
| 631 struct coding_system *coding = &dummy_coding; | 632 struct coding_system *coding = &dummy_coding; |
| 632 | 633 |
| 633 while (1) | 634 while (1) |
| 634 { | 635 { |
| 635 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 636 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, |
| 636 | 637 CODING_CATEGORY_MASK_EMACS_MULE); |
| 637 if (composing) | 638 if (composing) |
| 638 { | 639 { |
| 639 if (c < 0xA0) | 640 if (c < 0xA0) |
| 640 composing = 0; | 641 composing = 0; |
| 641 else if (c == 0xA0) | 642 else if (c == 0xA0) |
| 642 { | 643 { |
| 643 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 644 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
| 644 c &= 0x7F; | 645 c &= 0x7F; |
| 645 } | 646 } |
| 646 else | 647 else |
| 647 c -= 0x20; | 648 c -= 0x20; |
| 648 } | 649 } |
| 667 return 0; | 668 return 0; |
| 668 src = src_base + bytes; | 669 src = src_base + bytes; |
| 669 } | 670 } |
| 670 } | 671 } |
| 671 } | 672 } |
| 672 label_end_of_loop: | |
| 673 return CODING_CATEGORY_MASK_EMACS_MULE; | |
| 674 } | 673 } |
| 675 | 674 |
| 676 | 675 |
| 677 /* Record the starting position START and METHOD of one composition. */ | 676 /* Record the starting position START and METHOD of one composition. */ |
| 678 | 677 |
| 1423 struct coding_system dummy_coding; | 1422 struct coding_system dummy_coding; |
| 1424 struct coding_system *coding = &dummy_coding; | 1423 struct coding_system *coding = &dummy_coding; |
| 1425 Lisp_Object safe_chars; | 1424 Lisp_Object safe_chars; |
| 1426 | 1425 |
| 1427 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; | 1426 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; |
| 1428 while (mask && src < src_end) | 1427 while (mask) |
| 1429 { | 1428 { |
| 1430 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1429 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
| 1431 retry: | 1430 retry: |
| 1432 switch (c) | 1431 switch (c) |
| 1433 { | 1432 { |
| 1434 case ISO_CODE_ESC: | 1433 case ISO_CODE_ESC: |
| 1435 if (inhibit_iso_escape_detection) | 1434 if (inhibit_iso_escape_detection) |
| 1436 break; | 1435 break; |
| 1437 single_shifting = 0; | 1436 single_shifting = 0; |
| 1438 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1437 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
| 1439 if (c >= '(' && c <= '/') | 1438 if (c >= '(' && c <= '/') |
| 1440 { | 1439 { |
| 1441 /* Designation sequence for a charset of dimension 1. */ | 1440 /* Designation sequence for a charset of dimension 1. */ |
| 1442 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 1441 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, mask & mask_found); |
| 1443 if (c1 < ' ' || c1 >= 0x80 | 1442 if (c1 < ' ' || c1 >= 0x80 |
| 1444 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) | 1443 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) |
| 1445 /* Invalid designation sequence. Just ignore. */ | 1444 /* Invalid designation sequence. Just ignore. */ |
| 1446 break; | 1445 break; |
| 1447 reg[(c - '(') % 4] = charset; | 1446 reg[(c - '(') % 4] = charset; |
| 1448 } | 1447 } |
| 1449 else if (c == '$') | 1448 else if (c == '$') |
| 1450 { | 1449 { |
| 1451 /* Designation sequence for a charset of dimension 2. */ | 1450 /* Designation sequence for a charset of dimension 2. */ |
| 1452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1451 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, mask & mask_found); |
| 1453 if (c >= '@' && c <= 'B') | 1452 if (c >= '@' && c <= 'B') |
| 1454 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ | 1453 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ |
| 1455 reg[0] = charset = iso_charset_table[1][0][c]; | 1454 reg[0] = charset = iso_charset_table[1][0][c]; |
| 1456 else if (c >= '(' && c <= '/') | 1455 else if (c >= '(' && c <= '/') |
| 1457 { | 1456 { |
| 1458 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 1457 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, |
| | 1458 mask & mask_found); |
| 1459 if (c1 < ' ' || c1 >= 0x80 | 1459 if (c1 < ' ' || c1 >= 0x80 |
| 1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) | 1460 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) |
| 1461 /* Invalid designation sequence. Just ignore. */ | 1461 /* Invalid designation sequence. Just ignore. */ |
| 1462 break; | 1462 break; |
| 1463 reg[(c - '(') % 4] = charset; | 1463 reg[(c - '(') % 4] = charset; |
| 1628 int i = 1; | 1628 int i = 1; |
| 1629 | 1629 |
| 1630 c = -1; | 1630 c = -1; |
| 1631 while (src < src_end) | 1631 while (src < src_end) |
| 1632 { | 1632 { |
| 1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 1633 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, |
| | 1634 mask & mask_found); |
| 1634 if (c < 0xA0) | 1635 if (c < 0xA0) |
| 1635 break; | 1636 break; |
| 1636 i++; | 1637 i++; |
| 1637 } | 1638 } |
| 1638 | 1639 |
| 1646 } | 1647 } |
| 1647 } | 1648 } |
| 1648 break; | 1649 break; |
| 1649 } | 1650 } |
| 1650 } | 1651 } |
| 1651 label_end_of_loop: | |
| 1652 return (mask & mask_found); | 1652 return (mask & mask_found); |
| 1653 } | 1653 } |
| 1654 | 1654 |
| 1655 /* Decode a character of which charset is CHARSET, the 1st position | 1655 /* Decode a character of which charset is CHARSET, the 1st position |
| 1656 code is C1, the 2nd position code is C2, and return the decoded | 1656 code is C1, the 2nd position code is C2, and return the decoded |
| 2917 struct coding_system dummy_coding; | 2917 struct coding_system dummy_coding; |
| 2918 struct coding_system *coding = &dummy_coding; | 2918 struct coding_system *coding = &dummy_coding; |
| 2919 | 2919 |
| 2920 while (1) | 2920 while (1) |
| 2921 { | 2921 { |
| 2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2922 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_SJIS); |
| 2923 if (c < 0x80) | 2923 if (c < 0x80) |
| 2924 continue; | 2924 continue; |
| 2925 if (c == 0x80 || c == 0xA0 || c > 0xEF) | 2925 if (c == 0x80 || c == 0xA0 || c > 0xEF) |
| 2926 return 0; | 2926 return 0; |
| 2927 if (c <= 0x9F || c >= 0xE0) | 2927 if (c <= 0x9F || c >= 0xE0) |
| 2928 { | 2928 { |
| 2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2929 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
| 2930 if (c < 0x40 || c == 0x7F || c > 0xFC) | 2930 if (c < 0x40 || c == 0x7F || c > 0xFC) |
| 2931 return 0; | 2931 return 0; |
| 2932 } | 2932 } |
| 2933 } | 2933 } |
| 2934 label_end_of_loop: | |
| 2935 return CODING_CATEGORY_MASK_SJIS; | |
| 2936 } | 2934 } |
| 2937 | 2935 |
| 2938 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2936 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2939 Check if a text is encoded in BIG5. If it is, return | 2937 Check if a text is encoded in BIG5. If it is, return |
| 2940 CODING_CATEGORY_MASK_BIG5, else return 0. */ | 2938 CODING_CATEGORY_MASK_BIG5, else return 0. */ |
| 2949 struct coding_system dummy_coding; | 2947 struct coding_system dummy_coding; |
| 2950 struct coding_system *coding = &dummy_coding; | 2948 struct coding_system *coding = &dummy_coding; |
| 2951 | 2949 |
| 2952 while (1) | 2950 while (1) |
| 2953 { | 2951 { |
| 2954 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2952 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_BIG5); |
| 2955 if (c < 0x80) | 2953 if (c < 0x80) |
| 2956 continue; | 2954 continue; |
| 2957 if (c < 0xA1 || c > 0xFE) | 2955 if (c < 0xA1 || c > 0xFE) |
| 2958 return 0; | 2956 return 0; |
| 2959 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2957 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
| 2960 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) | 2958 if (c < 0x40 || (c > 0x7F && c < 0xA1) || c > 0xFE) |
| 2961 return 0; | 2959 return 0; |
| 2962 } | 2960 } |
| 2963 label_end_of_loop: | |
| 2964 return CODING_CATEGORY_MASK_BIG5; | |
| 2965 } | 2961 } |
| 2966 | 2962 |
| 2967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2963 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2968 Check if a text is encoded in UTF-8. If it is, return | 2964 Check if a text is encoded in UTF-8. If it is, return |
| 2969 CODING_CATEGORY_MASK_UTF_8, else return 0. */ | 2965 CODING_CATEGORY_MASK_UTF_8, else return 0. */ |
| 2987 struct coding_system dummy_coding; | 2983 struct coding_system dummy_coding; |
| 2988 struct coding_system *coding = &dummy_coding; | 2984 struct coding_system *coding = &dummy_coding; |
| 2989 | 2985 |
| 2990 while (1) | 2986 while (1) |
| 2991 { | 2987 { |
| 2992 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 2988 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_UTF_8); |
| 2993 if (UTF_8_1_OCTET_P (c)) | 2989 if (UTF_8_1_OCTET_P (c)) |
| 2994 continue; | 2990 continue; |
| 2995 else if (UTF_8_2_OCTET_LEADING_P (c)) | 2991 else if (UTF_8_2_OCTET_LEADING_P (c)) |
| 2996 seq_maybe_bytes = 1; | 2992 seq_maybe_bytes = 1; |
| 2997 else if (UTF_8_3_OCTET_LEADING_P (c)) | 2993 else if (UTF_8_3_OCTET_LEADING_P (c)) |
| 3005 else | 3001 else |
| 3006 return 0; | 3002 return 0; |
| 3007 | 3003 |
| 3008 do | 3004 do |
| 3009 { | 3005 { |
| 3010 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3006 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, 0); |
| 3011 if (!UTF_8_EXTRA_OCTET_P (c)) | 3007 if (!UTF_8_EXTRA_OCTET_P (c)) |
| 3012 return 0; | 3008 return 0; |
| 3013 seq_maybe_bytes--; | 3009 seq_maybe_bytes--; |
| 3014 } | 3010 } |
| 3015 while (seq_maybe_bytes > 0); | 3011 while (seq_maybe_bytes > 0); |
| 3016 } | 3012 } |
| 3017 | |
| 3018 label_end_of_loop: | |
| 3019 return CODING_CATEGORY_MASK_UTF_8; | |
| 3020 } | 3013 } |
| 3021 | 3014 |
| 3022 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 3015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 3023 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or | 3016 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or |
| 3024 Little Endian (otherwise). If it is, return | 3017 Little Endian (otherwise). If it is, return |
| 3043 unsigned char c1, c2; | 3036 unsigned char c1, c2; |
| 3044 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ | 3037 /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ |
| 3045 struct coding_system dummy_coding; | 3038 struct coding_system dummy_coding; |
| 3046 struct coding_system *coding = &dummy_coding; | 3039 struct coding_system *coding = &dummy_coding; |
| 3047 | 3040 |
| 3048 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); | 3041 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep, 0); |
| 3049 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); | 3042 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep, 0); |
| 3050 | 3043 |
| 3051 if ((c1 == 0xFF) && (c2 == 0xFE)) | 3044 if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 3052 return CODING_CATEGORY_MASK_UTF_16_LE; | 3045 return CODING_CATEGORY_MASK_UTF_16_LE; |
| 3053 else if ((c1 == 0xFE) && (c2 == 0xFF)) | 3046 else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 3054 return CODING_CATEGORY_MASK_UTF_16_BE; | 3047 return CODING_CATEGORY_MASK_UTF_16_BE; |
| 3055 | |
| 3056 label_end_of_loop: | |
| 3057 return 0; | 3048 return 0; |
| 3058 } | 3049 } |
| 3059 | 3050 |
| 3060 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 3051 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| 3061 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ | 3052 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ |
| 3320 return 0; | 3311 return 0; |
| 3321 | 3312 |
| 3322 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; | 3313 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; |
| 3323 while (1) | 3314 while (1) |
| 3324 { | 3315 { |
| 3325 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); | 3316 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep, CODING_CATEGORY_MASK_CCL); |
| 3326 if (! valid[c]) | 3317 if (! valid[c]) |
| 3327 return 0; | 3318 return 0; |
| 3328 } | 3319 } |
| 3329 label_end_of_loop: | |
| 3330 return CODING_CATEGORY_MASK_CCL; | |
| 3331 } | 3320 } |
| 3332 | 3321 |
| 3333 | 3322 |
| 3334 /*** 6. End-of-line handlers ***/ | 3323 /*** 6. End-of-line handlers ***/ |
| 3335 | 3324 |
