Mercurial > emacs
comparison src/coding.c @ 34531:37f85e931855
(ONE_MORE_BYTE_CHECK_MULTIBYTE): New macro.
(detect_coding_emacs_mule, detect_coding_iso2022)
(detect_coding_sjis, detect_coding_big5, detect_coding_utf_8)
(detect_coding_utf_16, detect_coding_ccl): Make them static. New
argument MULTIBYTEP. Callers changed.
(detect_coding_mask, detect_coding_system): New argument
MULTIBYTEP. Callers changed.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Wed, 13 Dec 2000 23:24:37 +0000 |
| parents | 78561a43cdd1 |
| children | aa667988f2b0 |
comparison
equal
deleted
inserted
replaced
| 34530:8739ed222334 | 34531:37f85e931855 |
|---|---|
| 110 | 110 |
| 111 These functions check if a text between SRC and SRC_END is encoded | 111 These functions check if a text between SRC and SRC_END is encoded |
| 112 in the coding system category XXX. Each returns an integer value in | 112 in the coding system category XXX. Each returns an integer value in |
| 113 which appropriate flag bits for the category XXX is set. The flag | 113 which appropriate flag bits for the category XXX is set. The flag |
| 114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the | 114 bits are defined in macros CODING_CATEGORY_MASK_XXX. Below is the |
| 115 template of these functions. */ | 115 template of these functions. If MULTIBYTEP is nonzero, 8-bit codes |
| | 116 of the range 0x80..0x9F are in multibyte form. */ |
| 116 #if 0 | 117 #if 0 |
| 117 int | 118 int |
| 118 detect_coding_emacs_mule (src, src_end) | 119 detect_coding_emacs_mule (src, src_end, multibytep) |
| 119 unsigned char *src, *src_end; | 120 unsigned char *src, *src_end; |
| | 121 int multibytep; |
| 120 { | 122 { |
| 121 ... | 123 ... |
| 122 } | 124 } |
| 123 #endif | 125 #endif |
| 124 | 126 |
| 207 } \ | 209 } \ |
| 208 c1 = *src++; \ | 210 c1 = *src++; \ |
| 209 c2 = *src++; \ | 211 c2 = *src++; \ |
| 210 } while (0) | 212 } while (0) |
| 211 | 213 |
| | 214 |
| | 215 /* Like ONE_MORE_BYTE, but 8-bit bytes of data at SRC are in multibyte |
| | 216 form if MULTIBYTEP is nonzero. */ |
| | 217 |
| | 218 #define ONE_MORE_BYTE_CHECK_MULTIBYTE(c1, multibytep) \ |
| | 219 do { \ |
| | 220 if (src >= src_end) \ |
| | 221 { \ |
| | 222 coding->result = CODING_FINISH_INSUFFICIENT_SRC; \ |
| | 223 goto label_end_of_loop; \ |
| | 224 } \ |
| | 225 c1 = *src++; \ |
| | 226 if (multibytep && c1 == LEADING_CODE_8_BIT_CONTROL) \ |
| | 227 c1 = *src++ - 0x20; \ |
| | 228 } while (0) |
| 212 | 229 |
| 213 /* Set C to the next character at the source text pointed by `src'. | 230 /* Set C to the next character at the source text pointed by `src'. |
| 214 If there are not enough characters in the source, jump to | 231 If there are not enough characters in the source, jump to |
| 215 `label_end_of_loop'. The caller should set variables `coding' | 232 `label_end_of_loop'. The caller should set variables `coding' |
| 216 `src', `src_end', and `translation_table' to appropriate pointers | 233 `src', `src_end', and `translation_table' to appropriate pointers |
| 534 | 551 |
| 535 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 552 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 536 Check if a text is encoded in Emacs' internal format. If it is, | 553 Check if a text is encoded in Emacs' internal format. If it is, |
| 537 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */ | 554 return CODING_CATEGORY_MASK_EMACS_MULE, else return 0. */ |
| 538 | 555 |
| 539 int | 556 static int |
| 540 detect_coding_emacs_mule (src, src_end) | 557 detect_coding_emacs_mule (src, src_end, multibytep) |
| 541 unsigned char *src, *src_end; | 558 unsigned char *src, *src_end; |
| | 559 int multibytep; |
| 542 { | 560 { |
| 543 unsigned char c; | 561 unsigned char c; |
| 544 int composing = 0; | 562 int composing = 0; |
| 545 /* Dummy for ONE_MORE_BYTE. */ | 563 /* Dummy for ONE_MORE_BYTE. */ |
| 546 struct coding_system dummy_coding; | 564 struct coding_system dummy_coding; |
| 547 struct coding_system *coding = &dummy_coding; | 565 struct coding_system *coding = &dummy_coding; |
| 548 | 566 |
| 549 while (1) | 567 while (1) |
| 550 { | 568 { |
| 551 ONE_MORE_BYTE (c); | 569 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 552 | 570 |
| 553 if (composing) | 571 if (composing) |
| 554 { | 572 { |
| 555 if (c < 0xA0) | 573 if (c < 0xA0) |
| 556 composing = 0; | 574 composing = 0; |
| 557 else if (c == 0xA0) | 575 else if (c == 0xA0) |
| 558 { | 576 { |
| 559 ONE_MORE_BYTE (c); | 577 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 560 c &= 0x7F; | 578 c &= 0x7F; |
| 561 } | 579 } |
| 562 else | 580 else |
| 563 c -= 0x20; | 581 c -= 0x20; |
| 564 } | 582 } |
| 879 CODING_CATEGORY_MASK_ISO_7_ELSE | 897 CODING_CATEGORY_MASK_ISO_7_ELSE |
| 880 CODING_CATEGORY_MASK_ISO_8_ELSE | 898 CODING_CATEGORY_MASK_ISO_8_ELSE |
| 881 are set. If a code which should never appear in ISO2022 is found, | 899 are set. If a code which should never appear in ISO2022 is found, |
| 882 returns 0. */ | 900 returns 0. */ |
| 883 | 901 |
| 884 int | 902 static int |
| 885 detect_coding_iso2022 (src, src_end) | 903 detect_coding_iso2022 (src, src_end, multibytep) |
| 886 unsigned char *src, *src_end; | 904 unsigned char *src, *src_end; |
| | 905 int multibytep; |
| 887 { | 906 { |
| 888 int mask = CODING_CATEGORY_MASK_ISO; | 907 int mask = CODING_CATEGORY_MASK_ISO; |
| 889 int mask_found = 0; | 908 int mask_found = 0; |
| 890 int reg[4], shift_out = 0, single_shifting = 0; | 909 int reg[4], shift_out = 0, single_shifting = 0; |
| 891 int c, c1, i, charset; | 910 int c, c1, i, charset; |
| 895 Lisp_Object safe_chars; | 914 Lisp_Object safe_chars; |
| 896 | 915 |
| 897 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; | 916 reg[0] = CHARSET_ASCII, reg[1] = reg[2] = reg[3] = -1; |
| 898 while (mask && src < src_end) | 917 while (mask && src < src_end) |
| 899 { | 918 { |
| 900 ONE_MORE_BYTE (c); | 919 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 901 switch (c) | 920 switch (c) |
| 902 { | 921 { |
| 903 case ISO_CODE_ESC: | 922 case ISO_CODE_ESC: |
| 904 if (inhibit_iso_escape_detection) | 923 if (inhibit_iso_escape_detection) |
| 905 break; | 924 break; |
| 906 single_shifting = 0; | 925 single_shifting = 0; |
| 907 ONE_MORE_BYTE (c); | 926 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 908 if (c >= '(' && c <= '/') | 927 if (c >= '(' && c <= '/') |
| 909 { | 928 { |
| 910 /* Designation sequence for a charset of dimension 1. */ | 929 /* Designation sequence for a charset of dimension 1. */ |
| 911 ONE_MORE_BYTE (c1); | 930 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); |
| 912 if (c1 < ' ' || c1 >= 0x80 | 931 if (c1 < ' ' || c1 >= 0x80 |
| 913 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) | 932 || (charset = iso_charset_table[0][c >= ','][c1]) < 0) |
| 914 /* Invalid designation sequence. Just ignore. */ | 933 /* Invalid designation sequence. Just ignore. */ |
| 915 break; | 934 break; |
| 916 reg[(c - '(') % 4] = charset; | 935 reg[(c - '(') % 4] = charset; |
| 917 } | 936 } |
| 918 else if (c == '$') | 937 else if (c == '$') |
| 919 { | 938 { |
| 920 /* Designation sequence for a charset of dimension 2. */ | 939 /* Designation sequence for a charset of dimension 2. */ |
| 921 ONE_MORE_BYTE (c); | 940 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 922 if (c >= '@' && c <= 'B') | 941 if (c >= '@' && c <= 'B') |
| 923 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ | 942 /* Designation for JISX0208.1978, GB2312, or JISX0208. */ |
| 924 reg[0] = charset = iso_charset_table[1][0][c]; | 943 reg[0] = charset = iso_charset_table[1][0][c]; |
| 925 else if (c >= '(' && c <= '/') | 944 else if (c >= '(' && c <= '/') |
| 926 { | 945 { |
| 927 ONE_MORE_BYTE (c1); | 946 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); |
| 928 if (c1 < ' ' || c1 >= 0x80 | 947 if (c1 < ' ' || c1 >= 0x80 |
| 929 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) | 948 || (charset = iso_charset_table[1][c >= ','][c1]) < 0) |
| 930 /* Invalid designation sequence. Just ignore. */ | 949 /* Invalid designation sequence. Just ignore. */ |
| 931 break; | 950 break; |
| 932 reg[(c - '(') % 4] = charset; | 951 reg[(c - '(') % 4] = charset; |
| 1072 && mask & CODING_CATEGORY_MASK_ISO_8_2) | 1091 && mask & CODING_CATEGORY_MASK_ISO_8_2) |
| 1073 { | 1092 { |
| 1074 int i = 1; | 1093 int i = 1; |
| 1075 while (src < src_end) | 1094 while (src < src_end) |
| 1076 { | 1095 { |
| 1077 ONE_MORE_BYTE (c); | 1096 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 1078 if (c < 0xA0) | 1097 if (c < 0xA0) |
| 1079 break; | 1098 break; |
| 1080 i++; | 1099 i++; |
| 1081 } | 1100 } |
| 1082 | 1101 |
| 2290 | 2309 |
| 2291 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2310 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2292 Check if a text is encoded in SJIS. If it is, return | 2311 Check if a text is encoded in SJIS. If it is, return |
| 2293 CODING_CATEGORY_MASK_SJIS, else return 0. */ | 2312 CODING_CATEGORY_MASK_SJIS, else return 0. */ |
| 2294 | 2313 |
| 2295 int | 2314 static int |
| 2296 detect_coding_sjis (src, src_end) | 2315 detect_coding_sjis (src, src_end, multibytep) |
| 2297 unsigned char *src, *src_end; | 2316 unsigned char *src, *src_end; |
| | 2317 int multibytep; |
| 2298 { | 2318 { |
| 2299 int c; | 2319 int c; |
| 2300 /* Dummy for ONE_MORE_BYTE. */ | 2320 /* Dummy for ONE_MORE_BYTE. */ |
| 2301 struct coding_system dummy_coding; | 2321 struct coding_system dummy_coding; |
| 2302 struct coding_system *coding = &dummy_coding; | 2322 struct coding_system *coding = &dummy_coding; |
| 2303 | 2323 |
| 2304 while (1) | 2324 while (1) |
| 2305 { | 2325 { |
| 2306 ONE_MORE_BYTE (c); | 2326 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2307 if (c >= 0x81) | 2327 if (c >= 0x81) |
| 2308 { | 2328 { |
| 2309 if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF)) | 2329 if (c <= 0x9F || (c >= 0xE0 && c <= 0xEF)) |
| 2310 { | 2330 { |
| 2311 ONE_MORE_BYTE (c); | 2331 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2312 if (c < 0x40 || c == 0x7F || c > 0xFC) | 2332 if (c < 0x40 || c == 0x7F || c > 0xFC) |
| 2313 return 0; | 2333 return 0; |
| 2314 } | 2334 } |
| 2315 else if (c > 0xDF) | 2335 else if (c > 0xDF) |
| 2316 return 0; | 2336 return 0; |
| 2322 | 2342 |
| 2323 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2343 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2324 Check if a text is encoded in BIG5. If it is, return | 2344 Check if a text is encoded in BIG5. If it is, return |
| 2325 CODING_CATEGORY_MASK_BIG5, else return 0. */ | 2345 CODING_CATEGORY_MASK_BIG5, else return 0. */ |
| 2326 | 2346 |
| 2327 int | 2347 static int |
| 2328 detect_coding_big5 (src, src_end) | 2348 detect_coding_big5 (src, src_end, multibytep) |
| 2329 unsigned char *src, *src_end; | 2349 unsigned char *src, *src_end; |
| | 2350 int multibytep; |
| 2330 { | 2351 { |
| 2331 int c; | 2352 int c; |
| 2332 /* Dummy for ONE_MORE_BYTE. */ | 2353 /* Dummy for ONE_MORE_BYTE. */ |
| 2333 struct coding_system dummy_coding; | 2354 struct coding_system dummy_coding; |
| 2334 struct coding_system *coding = &dummy_coding; | 2355 struct coding_system *coding = &dummy_coding; |
| 2335 | 2356 |
| 2336 while (1) | 2357 while (1) |
| 2337 { | 2358 { |
| 2338 ONE_MORE_BYTE (c); | 2359 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2339 if (c >= 0xA1) | 2360 if (c >= 0xA1) |
| 2340 { | 2361 { |
| 2341 ONE_MORE_BYTE (c); | 2362 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2342 if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) | 2363 if (c < 0x40 || (c >= 0x7F && c <= 0xA0)) |
| 2343 return 0; | 2364 return 0; |
| 2344 } | 2365 } |
| 2345 } | 2366 } |
| 2346 label_end_of_loop: | 2367 label_end_of_loop: |
| 2357 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) | 2378 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) |
| 2358 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) | 2379 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) |
| 2359 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | 2380 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) |
| 2360 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) | 2381 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) |
| 2361 | 2382 |
| 2362 int | 2383 static int |
| 2363 detect_coding_utf_8 (src, src_end) | 2384 detect_coding_utf_8 (src, src_end, multibytep) |
| 2364 unsigned char *src, *src_end; | 2385 unsigned char *src, *src_end; |
| | 2386 int multibytep; |
| 2365 { | 2387 { |
| 2366 unsigned char c; | 2388 unsigned char c; |
| 2367 int seq_maybe_bytes; | 2389 int seq_maybe_bytes; |
| 2368 /* Dummy for ONE_MORE_BYTE. */ | 2390 /* Dummy for ONE_MORE_BYTE. */ |
| 2369 struct coding_system dummy_coding; | 2391 struct coding_system dummy_coding; |
| 2370 struct coding_system *coding = &dummy_coding; | 2392 struct coding_system *coding = &dummy_coding; |
| 2371 | 2393 |
| 2372 while (1) | 2394 while (1) |
| 2373 { | 2395 { |
| 2374 ONE_MORE_BYTE (c); | 2396 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2375 if (UTF_8_1_OCTET_P (c)) | 2397 if (UTF_8_1_OCTET_P (c)) |
| 2376 continue; | 2398 continue; |
| 2377 else if (UTF_8_2_OCTET_LEADING_P (c)) | 2399 else if (UTF_8_2_OCTET_LEADING_P (c)) |
| 2378 seq_maybe_bytes = 1; | 2400 seq_maybe_bytes = 1; |
| 2379 else if (UTF_8_3_OCTET_LEADING_P (c)) | 2401 else if (UTF_8_3_OCTET_LEADING_P (c)) |
| 2387 else | 2409 else |
| 2388 return 0; | 2410 return 0; |
| 2389 | 2411 |
| 2390 do | 2412 do |
| 2391 { | 2413 { |
| 2392 ONE_MORE_BYTE (c); | 2414 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2393 if (!UTF_8_EXTRA_OCTET_P (c)) | 2415 if (!UTF_8_EXTRA_OCTET_P (c)) |
| 2394 return 0; | 2416 return 0; |
| 2395 seq_maybe_bytes--; | 2417 seq_maybe_bytes--; |
| 2396 } | 2418 } |
| 2397 while (seq_maybe_bytes > 0); | 2419 while (seq_maybe_bytes > 0); |
| 2415 (((val) & 0xD800) == 0xD800) | 2437 (((val) & 0xD800) == 0xD800) |
| 2416 | 2438 |
| 2417 #define UTF_16_LOW_SURROGATE_P(val) \ | 2439 #define UTF_16_LOW_SURROGATE_P(val) \ |
| 2418 (((val) & 0xDC00) == 0xDC00) | 2440 (((val) & 0xDC00) == 0xDC00) |
| 2419 | 2441 |
| 2420 int | 2442 static int |
| 2421 detect_coding_utf_16 (src, src_end) | 2443 detect_coding_utf_16 (src, src_end, multibytep) |
| 2422 unsigned char *src, *src_end; | 2444 unsigned char *src, *src_end; |
| | 2445 int multibytep; |
| 2423 { | 2446 { |
| 2424 unsigned char c1, c2; | 2447 unsigned char c1, c2; |
| 2425 /* Dummy for TWO_MORE_BYTES. */ | 2448 /* Dummy for TWO_MORE_BYTES. */ |
| 2426 struct coding_system dummy_coding; | 2449 struct coding_system dummy_coding; |
| 2427 struct coding_system *coding = &dummy_coding; | 2450 struct coding_system *coding = &dummy_coding; |
| 2428 | 2451 |
| 2429 TWO_MORE_BYTES (c1, c2); | 2452 ONE_MORE_BYTE_CHECK_MULTIBYTE (c1, multibytep); |
| | 2453 ONE_MORE_BYTE_CHECK_MULTIBYTE (c2, multibytep); |
| 2430 | 2454 |
| 2431 if ((c1 == 0xFF) && (c2 == 0xFE)) | 2455 if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 2432 return CODING_CATEGORY_MASK_UTF_16_LE; | 2456 return CODING_CATEGORY_MASK_UTF_16_LE; |
| 2433 else if ((c1 == 0xFE) && (c2 == 0xFF)) | 2457 else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 2434 return CODING_CATEGORY_MASK_UTF_16_BE; | 2458 return CODING_CATEGORY_MASK_UTF_16_BE; |
| 2675 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | 2699 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". |
| 2676 Check if a text is encoded in a coding system of which | 2700 Check if a text is encoded in a coding system of which |
| 2677 encoder/decoder are written in CCL program. If it is, return | 2701 encoder/decoder are written in CCL program. If it is, return |
| 2678 CODING_CATEGORY_MASK_CCL, else return 0. */ | 2702 CODING_CATEGORY_MASK_CCL, else return 0. */ |
| 2679 | 2703 |
| 2680 int | 2704 static int |
| 2681 detect_coding_ccl (src, src_end) | 2705 detect_coding_ccl (src, src_end, multibytep) |
| 2682 unsigned char *src, *src_end; | 2706 unsigned char *src, *src_end; |
| | 2707 int multibytep; |
| 2683 { | 2708 { |
| 2684 unsigned char *valid; | 2709 unsigned char *valid; |
| 2685 int c; | 2710 int c; |
| 2686 /* Dummy for ONE_MORE_BYTE. */ | 2711 /* Dummy for ONE_MORE_BYTE. */ |
| 2687 struct coding_system dummy_coding; | 2712 struct coding_system dummy_coding; |
| 2692 return 0; | 2717 return 0; |
| 2693 | 2718 |
| 2694 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; | 2719 valid = coding_system_table[CODING_CATEGORY_IDX_CCL]->spec.ccl.valid_codes; |
| 2695 while (1) | 2720 while (1) |
| 2696 { | 2721 { |
| 2697 ONE_MORE_BYTE (c); | 2722 ONE_MORE_BYTE_CHECK_MULTIBYTE (c, multibytep); |
| 2698 if (! valid[c]) | 2723 if (! valid[c]) |
| 2699 return 0; | 2724 return 0; |
| 2700 } | 2725 } |
| 2701 label_end_of_loop: | 2726 label_end_of_loop: |
| 2702 return CODING_CATEGORY_MASK_CCL; | 2727 return CODING_CATEGORY_MASK_CCL; |
| 3482 If it detects possible coding systems, return an integer in which | 3507 If it detects possible coding systems, return an integer in which |
| 3483 appropriate flag bits are set. Flag bits are defined by macros | 3508 appropriate flag bits are set. Flag bits are defined by macros |
| 3484 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, | 3509 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, |
| 3485 it should point the table `coding_priorities'. In that case, only | 3510 it should point the table `coding_priorities'. In that case, only |
| 3486 the flag bit for a coding system of the highest priority is set in | 3511 the flag bit for a coding system of the highest priority is set in |
| 3487 the returned value. | 3512 the returned value. If MULTIBYTEP is nonzero, 8-bit codes of the |
| | 3513 range 0x80..0x9F are in multibyte form. |
| 3488 | 3514 |
| 3489 How many ASCII characters are at the head is returned as *SKIP. */ | 3515 How many ASCII characters are at the head is returned as *SKIP. */ |
| 3490 | 3516 |
| 3491 static int | 3517 static int |
| 3492 detect_coding_mask (source, src_bytes, priorities, skip) | 3518 detect_coding_mask (source, src_bytes, priorities, skip, multibytep) |
| 3493 unsigned char *source; | 3519 unsigned char *source; |
| 3494 int src_bytes, *priorities, *skip; | 3520 int src_bytes, *priorities, *skip; |
| | 3521 int multibytep; |
| 3495 { | 3522 { |
| 3496 register unsigned char c; | 3523 register unsigned char c; |
| 3497 unsigned char *src = source, *src_end = source + src_bytes; | 3524 unsigned char *src = source, *src_end = source + src_bytes; |
| 3498 unsigned int mask, utf16_examined_p, iso2022_examined_p; | 3525 unsigned int mask, utf16_examined_p, iso2022_examined_p; |
| 3499 int i, idx; | 3526 int i, idx; |
| 3517 Now, try to find in which coding system the text is encoded. */ | 3544 Now, try to find in which coding system the text is encoded. */ |
| 3518 if (c < 0x80) | 3545 if (c < 0x80) |
| 3519 { | 3546 { |
| 3520 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ | 3547 /* i.e. (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) */ |
| 3521 /* C is an ISO2022 specific control code of C0. */ | 3548 /* C is an ISO2022 specific control code of C0. */ |
| 3522 mask = detect_coding_iso2022 (src, src_end); | 3549 mask = detect_coding_iso2022 (src, src_end, multibytep); |
| 3523 if (mask == 0) | 3550 if (mask == 0) |
| 3524 { | 3551 { |
| 3525 /* No valid ISO2022 code follows C. Try again. */ | 3552 /* No valid ISO2022 code follows C. Try again. */ |
| 3526 src++; | 3553 src++; |
| 3527 if (c == ISO_CODE_ESC) | 3554 if (c == ISO_CODE_ESC) |
| 3541 } | 3568 } |
| 3542 } | 3569 } |
| 3543 else | 3570 else |
| 3544 { | 3571 { |
| 3545 int try; | 3572 int try; |
| | 3573 |
| | 3574 if (multibytep && c == LEADING_CODE_8_BIT_CONTROL) |
| | 3575 c = *src++ - 0x20; |
| 3546 | 3576 |
| 3547 if (c < 0xA0) | 3577 if (c < 0xA0) |
| 3548 { | 3578 { |
| 3549 /* C is the first byte of SJIS character code, | 3579 /* C is the first byte of SJIS character code, |
| 3550 or a leading-code of Emacs' internal format (emacs-mule), | 3580 or a leading-code of Emacs' internal format (emacs-mule), |
| 3600 { | 3630 { |
| 3601 mask |= detect_coding_iso2022 (src, src_end); | 3631 mask |= detect_coding_iso2022 (src, src_end); |
| 3602 iso2022_examined_p = 1; | 3632 iso2022_examined_p = 1; |
| 3603 } | 3633 } |
| 3604 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 3634 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
| 3605 mask |= detect_coding_sjis (src, src_end); | 3635 mask |= detect_coding_sjis (src, src_end, multibytep); |
| 3606 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) | 3636 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) |
| 3607 mask |= detect_coding_utf_8 (src, src_end); | 3637 mask |= detect_coding_utf_8 (src, src_end, multibytep); |
| 3608 else if (!utf16_examined_p | 3638 else if (!utf16_examined_p |
| 3609 && (priorities[i] & try & | 3639 && (priorities[i] & try & |
| 3610 CODING_CATEGORY_MASK_UTF_16_BE_LE)) | 3640 CODING_CATEGORY_MASK_UTF_16_BE_LE)) |
| 3611 { | 3641 { |
| 3612 mask |= detect_coding_utf_16 (src, src_end); | 3642 mask |= detect_coding_utf_16 (src, src_end, multibytep); |
| 3613 utf16_examined_p = 1; | 3643 utf16_examined_p = 1; |
| 3614 } | 3644 } |
| 3615 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) | 3645 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) |
| 3616 mask |= detect_coding_big5 (src, src_end); | 3646 mask |= detect_coding_big5 (src, src_end, multibytep); |
| 3617 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) | 3647 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) |
| 3618 mask |= detect_coding_emacs_mule (src, src_end); | 3648 mask |= detect_coding_emacs_mule (src, src_end, multibytep); |
| 3619 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 3649 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
| 3620 mask |= detect_coding_ccl (src, src_end); | 3650 mask |= detect_coding_ccl (src, src_end, multibytep); |
| 3621 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 3651 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
| 3622 mask |= CODING_CATEGORY_MASK_RAW_TEXT; | 3652 mask |= CODING_CATEGORY_MASK_RAW_TEXT; |
| 3623 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 3653 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
| 3624 mask |= CODING_CATEGORY_MASK_BINARY; | 3654 mask |= CODING_CATEGORY_MASK_BINARY; |
| 3625 if (mask & priorities[i]) | 3655 if (mask & priorities[i]) |
| 3626 return priorities[i]; | 3656 return priorities[i]; |
| 3627 } | 3657 } |
| 3628 return CODING_CATEGORY_MASK_RAW_TEXT; | 3658 return CODING_CATEGORY_MASK_RAW_TEXT; |
| 3629 } | 3659 } |
| 3630 if (try & CODING_CATEGORY_MASK_ISO) | 3660 if (try & CODING_CATEGORY_MASK_ISO) |
| 3631 mask |= detect_coding_iso2022 (src, src_end); | 3661 mask |= detect_coding_iso2022 (src, src_end, multibytep); |
| 3632 if (try & CODING_CATEGORY_MASK_SJIS) | 3662 if (try & CODING_CATEGORY_MASK_SJIS) |
| 3633 mask |= detect_coding_sjis (src, src_end); | 3663 mask |= detect_coding_sjis (src, src_end, multibytep); |
| 3634 if (try & CODING_CATEGORY_MASK_BIG5) | 3664 if (try & CODING_CATEGORY_MASK_BIG5) |
| 3635 mask |= detect_coding_big5 (src, src_end); | 3665 mask |= detect_coding_big5 (src, src_end, multibytep); |
| 3636 if (try & CODING_CATEGORY_MASK_UTF_8) | 3666 if (try & CODING_CATEGORY_MASK_UTF_8) |
| 3637 mask |= detect_coding_utf_8 (src, src_end); | 3667 mask |= detect_coding_utf_8 (src, src_end, multibytep); |
| 3638 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE) | 3668 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE) |
| 3639 mask |= detect_coding_utf_16 (src, src_end); | 3669 mask |= detect_coding_utf_16 (src, src_end, multibytep); |
| 3640 if (try & CODING_CATEGORY_MASK_EMACS_MULE) | 3670 if (try & CODING_CATEGORY_MASK_EMACS_MULE) |
| 3641 mask |= detect_coding_emacs_mule (src, src_end); | 3671 mask |= detect_coding_emacs_mule (src, src_end, multibytep); |
| 3642 if (try & CODING_CATEGORY_MASK_CCL) | 3672 if (try & CODING_CATEGORY_MASK_CCL) |
| 3643 mask |= detect_coding_ccl (src, src_end); | 3673 mask |= detect_coding_ccl (src, src_end, multibytep); |
| 3644 } | 3674 } |
| 3645 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); | 3675 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); |
| 3646 } | 3676 } |
| 3647 | 3677 |
| 3648 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. | 3678 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. |
| 3657 unsigned int idx; | 3687 unsigned int idx; |
| 3658 int skip, mask, i; | 3688 int skip, mask, i; |
| 3659 Lisp_Object val; | 3689 Lisp_Object val; |
| 3660 | 3690 |
| 3661 val = Vcoding_category_list; | 3691 val = Vcoding_category_list; |
| 3662 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip); | 3692 mask = detect_coding_mask (src, src_bytes, coding_priorities, &skip, 0); |
| 3663 coding->heading_ascii = skip; | 3693 coding->heading_ascii = skip; |
| 3664 | 3694 |
| 3665 if (!mask) return; | 3695 if (!mask) return; |
| 3666 | 3696 |
| 3667 /* We found a single coding system of the highest priority in MASK. */ | 3697 /* We found a single coding system of the highest priority in MASK. */ |
| 5605 while (1) | 5635 while (1) |
| 5606 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); | 5636 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); |
| 5607 } | 5637 } |
| 5608 | 5638 |
| 5609 Lisp_Object | 5639 Lisp_Object |
| 5610 detect_coding_system (src, src_bytes, highest) | 5640 detect_coding_system (src, src_bytes, highest, multibytep) |
| 5611 unsigned char *src; | 5641 unsigned char *src; |
| 5612 int src_bytes, highest; | 5642 int src_bytes, highest; |
| | 5643 int multibytep; |
| 5613 { | 5644 { |
| 5614 int coding_mask, eol_type; | 5645 int coding_mask, eol_type; |
| 5615 Lisp_Object val, tmp; | 5646 Lisp_Object val, tmp; |
| 5616 int dummy; | 5647 int dummy; |
| 5617 | 5648 |
| 5618 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy); | 5649 coding_mask = detect_coding_mask (src, src_bytes, NULL, &dummy, multibytep); |
| 5619 eol_type = detect_eol_type (src, src_bytes, &dummy); | 5650 eol_type = detect_eol_type (src, src_bytes, &dummy); |
| 5620 if (eol_type == CODING_EOL_INCONSISTENT) | 5651 if (eol_type == CODING_EOL_INCONSISTENT) |
| 5621 eol_type = CODING_EOL_UNDECIDED; | 5652 eol_type = CODING_EOL_UNDECIDED; |
| 5622 | 5653 |
| 5623 if (!coding_mask) | 5654 if (!coding_mask) |
| 5696 if (from < GPT && to >= GPT) | 5727 if (from < GPT && to >= GPT) |
| 5697 move_gap_both (to, to_byte); | 5728 move_gap_both (to, to_byte); |
| 5698 | 5729 |
| 5699 return detect_coding_system (BYTE_POS_ADDR (from_byte), | 5730 return detect_coding_system (BYTE_POS_ADDR (from_byte), |
| 5700 to_byte - from_byte, | 5731 to_byte - from_byte, |
| 5701 !NILP (highest)); | 5732 !NILP (highest), |
| | 5733 !NILP (current_buffer |
| | 5734 ->enable_multibyte_characters)); |
| 5702 } | 5735 } |
| 5703 | 5736 |
| 5704 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, | 5737 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, |
| 5705 1, 2, 0, | 5738 1, 2, 0, |
| 5706 "Detect coding system of the text in STRING.\n\ | 5739 "Detect coding system of the text in STRING.\n\ |
| 5717 { | 5750 { |
| 5718 CHECK_STRING (string, 0); | 5751 CHECK_STRING (string, 0); |
| 5719 | 5752 |
| 5720 return detect_coding_system (XSTRING (string)->data, | 5753 return detect_coding_system (XSTRING (string)->data, |
| 5721 STRING_BYTES (XSTRING (string)), | 5754 STRING_BYTES (XSTRING (string)), |
| 5722 !NILP (highest)); | 5755 !NILP (highest), |
| | 5756 STRING_MULTIBYTE (string)); |
| 5723 } | 5757 } |
| 5724 | 5758 |
| 5725 /* Return an intersection of lists L1 and L2. */ | 5759 /* Return an intersection of lists L1 and L2. */ |
| 5726 | 5760 |
| 5727 static Lisp_Object | 5761 static Lisp_Object |
