Mercurial > emacs
comparison src/coding.c @ 28022:6c41f3276340
Add comments on coding-category-utf-8,
coding-category-utf-16-be, and coding-category-utf-16-le.
(coding_category_name): Include "coding-category-utf-8",
"coding-category-utf-16-be", and "coding-category-utf-16-le".
(UTF_8_1_OCTET_P) (UTF_8_EXTRA_OCTET_P) (UTF_8_2_OCTET_LEADING_P)
(UTF_8_3_OCTET_LEADING_P) (UTF_8_4_OCTET_LEADING_P)
(UTF_8_5_OCTET_LEADING_P) (UTF_8_6_OCTET_LEADING_P): New macros.
(detect_coding_utf_8): New function.
(UTF_16_INVALID_P) (UTF_16_HIGH_SURROGATE_P)
(UTF_16_LOW_SURROGATE_P): New macros.
(detect_coding_utf_16): New function.
(detect_coding_mask): Fix bug of returning wrong mask bits in the
case that detect_coding_XXX returns a mask not set in
priorities[i].
(detect_eol_type_in_2_octet_form): New function.
(detect_eol): If coding->category_idx is for UTF-16, call
detect_eol_type_in_2_octet_form instead of detect_eol_type.
(detect_coding_system): Don't include `nil' coding-system in the
result.
(Fupdate_coding_systems_internal): Update all coding-categories.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Tue, 07 Mar 2000 06:17:54 +0000 |
| parents | c2e0998057f9 |
| children | 01292435daaf |
comparison
equal
deleted
inserted
replaced
| 28021:e34a172ee77e | 28022:6c41f3276340 |
|---|---|
| 360 "coding-category-iso-8-2", | 360 "coding-category-iso-8-2", |
| 361 "coding-category-iso-7-else", | 361 "coding-category-iso-7-else", |
| 362 "coding-category-iso-8-else", | 362 "coding-category-iso-8-else", |
| 363 "coding-category-ccl", | 363 "coding-category-ccl", |
| 364 "coding-category-big5", | 364 "coding-category-big5", |
| 365 "coding-category-utf-8", | |
| 366 "coding-category-utf-16-be", | |
| 367 "coding-category-utf-16-le", | |
| 365 "coding-category-raw-text", | 368 "coding-category-raw-text", |
| 366 "coding-category-binary" | 369 "coding-category-binary" |
| 367 }; | 370 }; |
| 368 | 371 |
| 369 /* Table of pointers to coding systems corresponding to each coding | 372 /* Table of pointers to coding systems corresponding to each coding |
| 2346 } | 2349 } |
| 2347 } | 2350 } |
| 2348 return CODING_CATEGORY_MASK_BIG5; | 2351 return CODING_CATEGORY_MASK_BIG5; |
| 2349 } | 2352 } |
| 2350 | 2353 |
| 2354 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 2355 Check if a text is encoded in UTF-8. If it is, return | |
| 2356 CODING_CATEGORY_MASK_UTF_8, else return 0. */ | |
| 2357 | |
| 2358 #define UTF_8_1_OCTET_P(c) ((c) < 0x80) | |
| 2359 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80) | |
| 2360 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0) | |
| 2361 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0) | |
| 2362 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0) | |
| 2363 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8) | |
| 2364 #define UTF_8_6_OCTET_LEADING_P(c) (((c) & 0xFE) == 0xFC) | |
| 2365 | |
| 2366 int | |
| 2367 detect_coding_utf_8 (src, src_end) | |
| 2368 unsigned char *src, *src_end; | |
| 2369 { | |
| 2370 unsigned char c; | |
| 2371 int seq_maybe_bytes; | |
| 2372 | |
| 2373 while (src < src_end) | |
| 2374 { | |
| 2375 c = *src++; | |
| 2376 if (UTF_8_1_OCTET_P (c)) | |
| 2377 continue; | |
| 2378 else if (UTF_8_2_OCTET_LEADING_P (c)) | |
| 2379 seq_maybe_bytes = 1; | |
| 2380 else if (UTF_8_3_OCTET_LEADING_P (c)) | |
| 2381 seq_maybe_bytes = 2; | |
| 2382 else if (UTF_8_4_OCTET_LEADING_P (c)) | |
| 2383 seq_maybe_bytes = 3; | |
| 2384 else if (UTF_8_5_OCTET_LEADING_P (c)) | |
| 2385 seq_maybe_bytes = 4; | |
| 2386 else if (UTF_8_6_OCTET_LEADING_P (c)) | |
| 2387 seq_maybe_bytes = 5; | |
| 2388 else | |
| 2389 return 0; | |
| 2390 | |
| 2391 do | |
| 2392 { | |
| 2393 if (src >= src_end) | |
| 2394 return CODING_CATEGORY_MASK_UTF_8; | |
| 2395 | |
| 2396 c = *src++; | |
| 2397 if (!UTF_8_EXTRA_OCTET_P (c)) | |
| 2398 return 0; | |
| 2399 seq_maybe_bytes--; | |
| 2400 } | |
| 2401 while (seq_maybe_bytes > 0); | |
| 2402 } | |
| 2403 | |
| 2404 return CODING_CATEGORY_MASK_UTF_8; | |
| 2405 } | |
| 2406 | |
| 2407 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". | |
| 2408 Check if a text is encoded in UTF-16 Big Endian (endian == 1) or | |
| 2409 Little Endian (otherwise). If it is, return | |
| 2410 CODING_CATEGORY_MASK_UTF_16_BE or CODING_CATEGORY_MASK_UTF_16_LE, | |
| 2411 else return 0. */ | |
| 2412 | |
| 2413 #define UTF_16_INVALID_P(val) \ | |
| 2414 (((val) == 0xFFFE) \ | |
| 2415 || ((val) == 0xFFFF)) | |
| 2416 | |
| 2417 #define UTF_16_HIGH_SURROGATE_P(val) \ | |
| 2418 (((val) & 0xD800) == 0xD800) | |
| 2419 | |
| 2420 #define UTF_16_LOW_SURROGATE_P(val) \ | |
| 2421 (((val) & 0xDC00) == 0xDC00) | |
| 2422 | |
| 2423 int | |
| 2424 detect_coding_utf_16 (src, src_end) | |
| 2425 unsigned char *src, *src_end; | |
| 2426 { | |
| 2427 if ((src + 1) >= src_end) return 0; | |
| 2428 | |
| 2429 if ((src[0] == 0xFF) && (src[1] == 0xFE)) | |
| 2430 return CODING_CATEGORY_MASK_UTF_16_LE; | |
| 2431 else if ((src[0] == 0xFE) && (src[1] == 0xFF)) | |
| 2432 return CODING_CATEGORY_MASK_UTF_16_BE; | |
| 2433 | |
| 2434 return 0; | |
| 2435 } | |
| 2436 | |
| 2351 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". | 2437 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". |
| 2352 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ | 2438 If SJIS_P is 1, decode SJIS text, else decode BIG5 text. */ |
| 2353 | 2439 |
| 2354 int | 2440 int |
| 2355 decode_coding_sjis_big5 (coding, source, destination, | 2441 decode_coding_sjis_big5 (coding, source, destination, |
| 3451 | 3537 |
| 3452 The category for a coding system which has the same code range | 3538 The category for a coding system which has the same code range |
| 3453 as BIG5. Assigned the coding-system (Lisp symbol) | 3539 as BIG5. Assigned the coding-system (Lisp symbol) |
| 3454 `cn-big5' by default. | 3540 `cn-big5' by default. |
| 3455 | 3541 |
| 3542 o coding-category-utf-8 | |
| 3543 | |
| 3544 The category for a coding system which has the same code range | |
| 3545 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp | |
| 3546 symbol) `utf-8' by default. | |
| 3547 | |
| 3548 o coding-category-utf-16-be | |
| 3549 | |
| 3550 The category for a coding system in which a text has a | |
| 3551 Unicode signature (cf. Unicode Standard) in the order of BIG | |
| 3552 endian at the head. Assigned the coding-system (Lisp symbol) | |
| 3553 `utf-16-be' by default. | |
| 3554 | |
| 3555 o coding-category-utf-16-le | |
| 3556 | |
| 3557 The category for a coding system in which a text has a | |
| 3558 Unicode signature (cf. Unicode Standard) in the order of | |
| 3559 LITTLE endian at the head. Assigned the coding-system (Lisp | |
| 3560 symbol) `utf-16-le' by default. | |
| 3561 | |
| 3456 o coding-category-ccl | 3562 o coding-category-ccl |
| 3457 | 3563 |
| 3458 The category for a coding system of which encoder/decoder is | 3564 The category for a coding system of which encoder/decoder is |
| 3459 written in CCL programs. The default value is nil, i.e., no | 3565 written in CCL programs. The default value is nil, i.e., no |
| 3460 coding system is assigned. | 3566 coding system is assigned. |
| 3479 int ascii_skip_code[256]; | 3585 int ascii_skip_code[256]; |
| 3480 | 3586 |
| 3481 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. | 3587 /* Detect how a text of length SRC_BYTES pointed by SOURCE is encoded. |
| 3482 If it detects possible coding systems, return an integer in which | 3588 If it detects possible coding systems, return an integer in which |
| 3483 appropriate flag bits are set. Flag bits are defined by macros | 3589 appropriate flag bits are set. Flag bits are defined by macros |
| 3484 CODING_CATEGORY_MASK_XXX in `coding.h'. | 3590 CODING_CATEGORY_MASK_XXX in `coding.h'. If PRIORITIES is non-NULL, |
| 3591 it should point to the table `coding_priorities'. In that case, only | |
| 3592 the flag bit for a coding system of the highest priority is set in | |
| 3593 the returned value. | |
| 3485 | 3594 |
| 3486 How many ASCII characters are at the head is returned as *SKIP. */ | 3595 How many ASCII characters are at the head is returned as *SKIP. */ |
| 3487 | 3596 |
| 3488 static int | 3597 static int |
| 3489 detect_coding_mask (source, src_bytes, priorities, skip) | 3598 detect_coding_mask (source, src_bytes, priorities, skip) |
| 3490 unsigned char *source; | 3599 unsigned char *source; |
| 3491 int src_bytes, *priorities, *skip; | 3600 int src_bytes, *priorities, *skip; |
| 3492 { | 3601 { |
| 3493 register unsigned char c; | 3602 register unsigned char c; |
| 3494 unsigned char *src = source, *src_end = source + src_bytes; | 3603 unsigned char *src = source, *src_end = source + src_bytes; |
| 3495 unsigned int mask; | 3604 unsigned int mask, utf16_examined_p, iso2022_examined_p; |
| 3496 int i; | 3605 int i, idx; |
| 3497 | 3606 |
| 3498 /* At first, skip all ASCII characters and control characters except | 3607 /* At first, skip all ASCII characters and control characters except |
| 3499 for three ISO2022 specific control characters. */ | 3608 for three ISO2022 specific control characters. */ |
| 3500 ascii_skip_code[ISO_CODE_SO] = 0; | 3609 ascii_skip_code[ISO_CODE_SO] = 0; |
| 3501 ascii_skip_code[ISO_CODE_SI] = 0; | 3610 ascii_skip_code[ISO_CODE_SI] = 0; |
| 3526 else | 3635 else |
| 3527 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1; | 3636 ascii_skip_code[ISO_CODE_SO] = ascii_skip_code[ISO_CODE_SI] = 1; |
| 3528 goto label_loop_detect_coding; | 3637 goto label_loop_detect_coding; |
| 3529 } | 3638 } |
| 3530 if (priorities) | 3639 if (priorities) |
| 3531 goto label_return_highest_only; | 3640 { |
| 3641 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | |
| 3642 { | |
| 3643 if (mask & priorities[i]) | |
| 3644 return priorities[i]; | |
| 3645 } | |
| 3646 return CODING_CATEGORY_MASK_RAW_TEXT; | |
| 3647 } | |
| 3532 } | 3648 } |
| 3533 else | 3649 else |
| 3534 { | 3650 { |
| 3535 int try; | 3651 int try; |
| 3536 | 3652 |
| 3537 if (c < 0xA0) | 3653 if (c < 0xA0) |
| 3538 { | 3654 { |
| 3539 /* C is the first byte of SJIS character code, | 3655 /* C is the first byte of SJIS character code, |
| 3540 or a leading-code of Emacs' internal format (emacs-mule). */ | 3656 or a leading-code of Emacs' internal format (emacs-mule), |
| 3541 try = CODING_CATEGORY_MASK_SJIS | CODING_CATEGORY_MASK_EMACS_MULE; | 3657 or the first byte of UTF-16. */ |
| 3658 try = (CODING_CATEGORY_MASK_SJIS | |
| 3659 | CODING_CATEGORY_MASK_EMACS_MULE | |
| 3660 | CODING_CATEGORY_MASK_UTF_16_BE | |
| 3661 | CODING_CATEGORY_MASK_UTF_16_LE); | |
| 3542 | 3662 |
| 3543 /* Or, if C is a special latin extra code, | 3663 /* Or, if C is a special latin extra code, |
| 3544 or is an ISO2022 specific control code of C1 (SS2 or SS3), | 3664 or is an ISO2022 specific control code of C1 (SS2 or SS3), |
| 3545 or is an ISO2022 control-sequence-introducer (CSI), | 3665 or is an ISO2022 control-sequence-introducer (CSI), |
| 3546 we should also consider the possibility of ISO2022 codings. */ | 3666 we should also consider the possibility of ISO2022 codings. */ |
| 3557 | CODING_CATEGORY_MASK_ISO_8BIT); | 3677 | CODING_CATEGORY_MASK_ISO_8BIT); |
| 3558 } | 3678 } |
| 3559 else | 3679 else |
| 3560 /* C is a character of ISO2022 in graphic plane right, | 3680 /* C is a character of ISO2022 in graphic plane right, |
| 3561 or a SJIS's 1-byte character code (i.e. JISX0201), | 3681 or a SJIS's 1-byte character code (i.e. JISX0201), |
| 3562 or the first byte of BIG5's 2-byte code. */ | 3682 or the first byte of BIG5's 2-byte code, |
| 3683 or the first byte of UTF-8/16. */ | |
| 3563 try = (CODING_CATEGORY_MASK_ISO_8_ELSE | 3684 try = (CODING_CATEGORY_MASK_ISO_8_ELSE |
| 3564 | CODING_CATEGORY_MASK_ISO_8BIT | 3685 | CODING_CATEGORY_MASK_ISO_8BIT |
| 3565 | CODING_CATEGORY_MASK_SJIS | 3686 | CODING_CATEGORY_MASK_SJIS |
| 3566 | CODING_CATEGORY_MASK_BIG5); | 3687 | CODING_CATEGORY_MASK_BIG5 |
| 3688 | CODING_CATEGORY_MASK_UTF_8 | |
| 3689 | CODING_CATEGORY_MASK_UTF_16_BE | |
| 3690 | CODING_CATEGORY_MASK_UTF_16_LE); | |
| 3567 | 3691 |
| 3568 /* Or, we may have to consider the possibility of CCL. */ | 3692 /* Or, we may have to consider the possibility of CCL. */ |
| 3569 if (coding_system_table[CODING_CATEGORY_IDX_CCL] | 3693 if (coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 3570 && (coding_system_table[CODING_CATEGORY_IDX_CCL] | 3694 && (coding_system_table[CODING_CATEGORY_IDX_CCL] |
| 3571 ->spec.ccl.valid_codes)[c]) | 3695 ->spec.ccl.valid_codes)[c]) |
| 3572 try |= CODING_CATEGORY_MASK_CCL; | 3696 try |= CODING_CATEGORY_MASK_CCL; |
| 3573 | 3697 |
| 3574 mask = 0; | 3698 mask = 0; |
| 3699 utf16_examined_p = iso2022_examined_p = 0; | |
| 3575 if (priorities) | 3700 if (priorities) |
| 3576 { | 3701 { |
| 3577 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | 3702 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) |
| 3578 { | 3703 { |
| 3579 if (priorities[i] & try & CODING_CATEGORY_MASK_ISO) | 3704 if (!iso2022_examined_p |
| 3580 mask = detect_coding_iso2022 (src, src_end); | 3705 && (priorities[i] & try & CODING_CATEGORY_MASK_ISO)) |
| 3706 { | |
| 3707 mask |= detect_coding_iso2022 (src, src_end); | |
| 3708 iso2022_examined_p = 1; | |
| 3709 } | |
| 3581 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) | 3710 else if (priorities[i] & try & CODING_CATEGORY_MASK_SJIS) |
| 3582 mask = detect_coding_sjis (src, src_end); | 3711 mask |= detect_coding_sjis (src, src_end); |
| 3712 else if (priorities[i] & try & CODING_CATEGORY_MASK_UTF_8) | |
| 3713 mask |= detect_coding_utf_8 (src, src_end); | |
| 3714 else if (!utf16_examined_p | |
| 3715 && (priorities[i] & try & | |
| 3716 CODING_CATEGORY_MASK_UTF_16_BE_LE)) | |
| 3717 { | |
| 3718 mask |= detect_coding_utf_16 (src, src_end); | |
| 3719 utf16_examined_p = 1; | |
| 3720 } | |
| 3583 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) | 3721 else if (priorities[i] & try & CODING_CATEGORY_MASK_BIG5) |
| 3584 mask = detect_coding_big5 (src, src_end); | 3722 mask |= detect_coding_big5 (src, src_end); |
| 3585 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) | 3723 else if (priorities[i] & try & CODING_CATEGORY_MASK_EMACS_MULE) |
| 3586 mask = detect_coding_emacs_mule (src, src_end); | 3724 mask |= detect_coding_emacs_mule (src, src_end); |
| 3587 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) | 3725 else if (priorities[i] & try & CODING_CATEGORY_MASK_CCL) |
| 3588 mask = detect_coding_ccl (src, src_end); | 3726 mask |= detect_coding_ccl (src, src_end); |
| 3589 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) | 3727 else if (priorities[i] & CODING_CATEGORY_MASK_RAW_TEXT) |
| 3590 mask = CODING_CATEGORY_MASK_RAW_TEXT; | 3728 mask |= CODING_CATEGORY_MASK_RAW_TEXT; |
| 3591 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) | 3729 else if (priorities[i] & CODING_CATEGORY_MASK_BINARY) |
| 3592 mask = CODING_CATEGORY_MASK_BINARY; | 3730 mask |= CODING_CATEGORY_MASK_BINARY; |
| 3593 if (mask) | 3731 if (mask & priorities[i]) |
| 3594 goto label_return_highest_only; | 3732 return priorities[i]; |
| 3595 } | 3733 } |
| 3596 return CODING_CATEGORY_MASK_RAW_TEXT; | 3734 return CODING_CATEGORY_MASK_RAW_TEXT; |
| 3597 } | 3735 } |
| 3598 if (try & CODING_CATEGORY_MASK_ISO) | 3736 if (try & CODING_CATEGORY_MASK_ISO) |
| 3599 mask |= detect_coding_iso2022 (src, src_end); | 3737 mask |= detect_coding_iso2022 (src, src_end); |
| 3600 if (try & CODING_CATEGORY_MASK_SJIS) | 3738 if (try & CODING_CATEGORY_MASK_SJIS) |
| 3601 mask |= detect_coding_sjis (src, src_end); | 3739 mask |= detect_coding_sjis (src, src_end); |
| 3602 if (try & CODING_CATEGORY_MASK_BIG5) | 3740 if (try & CODING_CATEGORY_MASK_BIG5) |
| 3603 mask |= detect_coding_big5 (src, src_end); | 3741 mask |= detect_coding_big5 (src, src_end); |
| 3742 if (try & CODING_CATEGORY_MASK_UTF_8) | |
| 3743 mask |= detect_coding_utf_8 (src, src_end); | |
| 3744 if (try & CODING_CATEGORY_MASK_UTF_16_BE_LE) | |
| 3745 mask |= detect_coding_utf_16 (src, src_end); | |
| 3604 if (try & CODING_CATEGORY_MASK_EMACS_MULE) | 3746 if (try & CODING_CATEGORY_MASK_EMACS_MULE) |
| 3605 mask |= detect_coding_emacs_mule (src, src_end); | 3747 mask |= detect_coding_emacs_mule (src, src_end); |
| 3606 if (try & CODING_CATEGORY_MASK_CCL) | 3748 if (try & CODING_CATEGORY_MASK_CCL) |
| 3607 mask |= detect_coding_ccl (src, src_end); | 3749 mask |= detect_coding_ccl (src, src_end); |
| 3608 } | 3750 } |
| 3609 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); | 3751 return (mask | CODING_CATEGORY_MASK_RAW_TEXT | CODING_CATEGORY_MASK_BINARY); |
| 3610 | |
| 3611 label_return_highest_only: | |
| 3612 for (i = 0; i < CODING_CATEGORY_IDX_MAX; i++) | |
| 3613 { | |
| 3614 if (mask & priorities[i]) | |
| 3615 return priorities[i]; | |
| 3616 } | |
| 3617 return CODING_CATEGORY_MASK_RAW_TEXT; | |
| 3618 } | 3752 } |
| 3619 | 3753 |
| 3620 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. | 3754 /* Detect how a text of length SRC_BYTES pointed by SRC is encoded. |
| 3621 The information of the detected coding system is set in CODING. */ | 3755 The information of the detected coding system is set in CODING. */ |
| 3622 | 3756 |
| 3708 if (*skip == 0) | 3842 if (*skip == 0) |
| 3709 *skip = src_end - source; | 3843 *skip = src_end - source; |
| 3710 return eol_type; | 3844 return eol_type; |
| 3711 } | 3845 } |
| 3712 | 3846 |
| 3847 /* Like detect_eol_type, but detect EOL type in 2-octet | |
| 3848 big-endian/little-endian format for coding systems utf-16-be and | |
| 3849 utf-16-le. */ | |
| 3850 | |
| 3851 static int | |
| 3852 detect_eol_type_in_2_octet_form (source, src_bytes, skip, big_endian_p) | |
| 3853 unsigned char *source; | |
| 3854 int src_bytes, *skip; | |
| 3855 { | |
| 3856 unsigned char *src = source, *src_end = src + src_bytes; | |
| 3857 unsigned int c1, c2; | |
| 3858 int total = 0; /* How many end-of-lines are found so far. */ | |
| 3859 int eol_type = CODING_EOL_UNDECIDED; | |
| 3860 int this_eol_type; | |
| 3861 int msb, lsb; | |
| 3862 | |
| 3863 if (big_endian_p) | |
| 3864 msb = 0, lsb = 1; | |
| 3865 else | |
| 3866 msb = 1, lsb = 0; | |
| 3867 | |
| 3868 *skip = 0; | |
| 3869 | |
| 3870 while ((src + 1) < src_end && total < MAX_EOL_CHECK_COUNT) | |
| 3871 { | |
| 3872 c1 = (src[msb] << 8) | (src[lsb]); | |
| 3873 src += 2; | |
| 3874 | |
| 3875 if (c1 == '\n' || c1 == '\r') | |
| 3876 { | |
| 3877 if (*skip == 0) | |
| 3878 *skip = src - 2 - source; | |
| 3879 total++; | |
| 3880 if (c1 == '\n') | |
| 3881 { | |
| 3882 this_eol_type = CODING_EOL_LF; | |
| 3883 } | |
| 3884 else | |
| 3885 { | |
| 3886 if ((src + 1) >= src_end) | |
| 3887 { | |
| 3888 this_eol_type = CODING_EOL_CR; | |
| 3889 } | |
| 3890 else | |
| 3891 { | |
| 3892 c2 = (src[msb] << 8) | (src[lsb]); | |
| 3893 if (c2 == '\n') | |
| 3894 this_eol_type = CODING_EOL_CRLF, src += 2; | |
| 3895 else | |
| 3896 this_eol_type = CODING_EOL_CR; | |
| 3897 } | |
| 3898 } | |
| 3899 | |
| 3900 if (eol_type == CODING_EOL_UNDECIDED) | |
| 3901 /* This is the first end-of-line. */ | |
| 3902 eol_type = this_eol_type; | |
| 3903 else if (eol_type != this_eol_type) | |
| 3904 { | |
| 3905 /* The found type is different from what found before. */ | |
| 3906 eol_type = CODING_EOL_INCONSISTENT; | |
| 3907 break; | |
| 3908 } | |
| 3909 } | |
| 3910 } | |
| 3911 | |
| 3912 if (*skip == 0) | |
| 3913 *skip = src_end - source; | |
| 3914 return eol_type; | |
| 3915 } | |
| 3916 | |
| 3713 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC | 3917 /* Detect how end-of-line of a text of length SRC_BYTES pointed by SRC |
| 3714 is encoded. If it detects an appropriate format of end-of-line, it | 3918 is encoded. If it detects an appropriate format of end-of-line, it |
| 3715 sets the information in *CODING. */ | 3919 sets the information in *CODING. */ |
| 3716 | 3920 |
| 3717 void | 3921 void |
| 3720 unsigned char *src; | 3924 unsigned char *src; |
| 3721 int src_bytes; | 3925 int src_bytes; |
| 3722 { | 3926 { |
| 3723 Lisp_Object val; | 3927 Lisp_Object val; |
| 3724 int skip; | 3928 int skip; |
| 3725 int eol_type = detect_eol_type (src, src_bytes, &skip); | 3929 int eol_type; |
| 3930 | |
| 3931 switch (coding->category_idx) | |
| 3932 { | |
| 3933 case CODING_CATEGORY_IDX_UTF_16_BE: | |
| 3934 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 1); | |
| 3935 break; | |
| 3936 case CODING_CATEGORY_IDX_UTF_16_LE: | |
| 3937 eol_type = detect_eol_type_in_2_octet_form (src, src_bytes, &skip, 0); | |
| 3938 break; | |
| 3939 default: | |
| 3940 eol_type = detect_eol_type (src, src_bytes, &skip); | |
| 3941 break; | |
| 3942 } | |
| 3726 | 3943 |
| 3727 if (coding->heading_ascii > skip) | 3944 if (coding->heading_ascii > skip) |
| 3728 coding->heading_ascii = skip; | 3945 coding->heading_ascii = skip; |
| 3729 else | 3946 else |
| 3730 skip = coding->heading_ascii; | 3947 skip = coding->heading_ascii; |
| 5214 return (highest ? val : Fcons (val, Qnil)); | 5431 return (highest ? val : Fcons (val, Qnil)); |
| 5215 } | 5432 } |
| 5216 | 5433 |
| 5217 /* At first, gather possible coding systems in VAL. */ | 5434 /* At first, gather possible coding systems in VAL. */ |
| 5218 val = Qnil; | 5435 val = Qnil; |
| 5219 for (tmp = Vcoding_category_list; !NILP (tmp); tmp = XCDR (tmp)) | 5436 for (tmp = Vcoding_category_list; CONSP (tmp); tmp = XCDR (tmp)) |
| 5220 { | 5437 { |
| 5221 int idx | 5438 Lisp_Object category_val, category_index; |
| 5222 = XFASTINT (Fget (XCAR (tmp), Qcoding_category_index)); | 5439 |
| 5223 if (coding_mask & (1 << idx)) | 5440 category_index = Fget (XCAR (tmp), Qcoding_category_index); |
| 5224 { | 5441 category_val = Fsymbol_value (XCAR (tmp)); |
| 5225 val = Fcons (Fsymbol_value (XCAR (tmp)), val); | 5442 if (!NILP (category_val) |
| 5443 && NATNUMP (category_index) | |
| 5444 && (coding_mask & (1 << XFASTINT (category_index)))) | |
| 5445 { | |
| 5446 val = Fcons (category_val, val); | |
| 5226 if (highest) | 5447 if (highest) |
| 5227 break; | 5448 break; |
| 5228 } | 5449 } |
| 5229 } | 5450 } |
| 5230 if (!highest) | 5451 if (!highest) |
| 5231 val = Fnreverse (val); | 5452 val = Fnreverse (val); |
| 5232 | 5453 |
| 5233 /* Then, replace the elements with subsidiary coding systems. */ | 5454 /* Then, replace the elements with subsidiary coding systems. */ |
| 5234 for (tmp = val; !NILP (tmp); tmp = XCDR (tmp)) | 5455 for (tmp = val; CONSP (tmp); tmp = XCDR (tmp)) |
| 5235 { | 5456 { |
| 5236 if (eol_type != CODING_EOL_UNDECIDED | 5457 if (eol_type != CODING_EOL_UNDECIDED |
| 5237 && eol_type != CODING_EOL_INCONSISTENT) | 5458 && eol_type != CODING_EOL_INCONSISTENT) |
| 5238 { | 5459 { |
| 5239 Lisp_Object eol; | 5460 Lisp_Object eol; |
| 5710 } | 5931 } |
| 5711 | 5932 |
| 5712 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, | 5933 DEFUN ("update-coding-systems-internal", Fupdate_coding_systems_internal, |
| 5713 Supdate_coding_systems_internal, 0, 0, 0, | 5934 Supdate_coding_systems_internal, 0, 0, 0, |
| 5714 "Update internal database for ISO2022 and CCL based coding systems.\n\ | 5935 "Update internal database for ISO2022 and CCL based coding systems.\n\ |
| 5715 When values of the following coding categories are changed, you must\n\ | 5936 When values of any coding categories are changed, you must\n\ |
| 5716 call this function:\n\ | 5937 call this function") |
| 5717 coding-category-iso-7, coding-category-iso-7-tight,\n\ | |
| 5718 coding-category-iso-8-1, coding-category-iso-8-2,\n\ | |
| 5719 coding-category-iso-7-else, coding-category-iso-8-else,\n\ | |
| 5720 coding-category-ccl") | |
| 5721 () | 5938 () |
| 5722 { | 5939 { |
| 5723 int i; | 5940 int i; |
| 5724 | 5941 |
| 5725 for (i = CODING_CATEGORY_IDX_ISO_7; i <= CODING_CATEGORY_IDX_CCL; i++) | 5942 for (i = CODING_CATEGORY_IDX_EMACS_MULE; i < CODING_CATEGORY_IDX_MAX; i++) |
| 5726 { | 5943 { |
| 5727 Lisp_Object val; | 5944 Lisp_Object val; |
| 5728 | 5945 |
| 5729 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value; | 5946 val = XSYMBOL (XVECTOR (Vcoding_category_table)->contents[i])->value; |
| 5730 if (!NILP (val)) | 5947 if (!NILP (val)) |
| 5765 coding_priorities[i++] = (1 << idx); | 5982 coding_priorities[i++] = (1 << idx); |
| 5766 val = XCDR (val); | 5983 val = XCDR (val); |
| 5767 } | 5984 } |
| 5768 /* If coding-category-list is valid and contains all coding | 5985 /* If coding-category-list is valid and contains all coding |
| 5769 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, | 5986 categories, `i' should be CODING_CATEGORY_IDX_MAX now. If not, |
| 5770 the following code saves Emacs from craching. */ | 5987 the following code saves Emacs from crashing. */ |
| 5771 while (i < CODING_CATEGORY_IDX_MAX) | 5988 while (i < CODING_CATEGORY_IDX_MAX) |
| 5772 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; | 5989 coding_priorities[i++] = CODING_CATEGORY_MASK_RAW_TEXT; |
| 5773 | 5990 |
| 5774 return Qnil; | 5991 return Qnil; |
| 5775 } | 5992 } |
