Mercurial > emacs
comparison src/coding.c @ 95533:831c8ee4d884
(detect_coding): Fix handling of coding->head_ascii.
Be sure to call setup_coding_system when a proper coding system is
found.
(detect_coding_system): Fix handling of coding->head_ascii.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Wed, 04 Jun 2008 07:52:46 +0000 |
| parents | c99f0a16c077 |
| children | 862c7386145c |
comparison legend:
- equal
- deleted
- inserted
- replaced
| 95532:b657ba21e4d3 | 95533:831c8ee4d884 |
|---|---|
| 5780 coding->consumed = coding->consumed_char = 0; | 5780 coding->consumed = coding->consumed_char = 0; |
| 5781 coding->produced = coding->produced_char = 0; | 5781 coding->produced = coding->produced_char = 0; |
| 5782 coding_set_source (coding); | 5782 coding_set_source (coding); |
| 5783 | 5783 |
| 5784 src_end = coding->source + coding->src_bytes; | 5784 src_end = coding->source + coding->src_bytes; |
| 5785 coding->head_ascii = 0; | |
| 5785 | 5786 |
| 5786 /* If we have not yet decided the text encoding type, detect it | 5787 /* If we have not yet decided the text encoding type, detect it |
| 5787 now. */ | 5788 now. */ |
| 5788 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 5789 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
| 5789 { | 5790 { |
| 5790 int c, i; | 5791 int c, i; |
| 5791 struct coding_detection_info detect_info; | 5792 struct coding_detection_info detect_info; |
| 5792 int null_byte_found = 0, eight_bit_found = 0; | 5793 int null_byte_found = 0, eight_bit_found = 0; |
| 5793 | 5794 |
| 5794 detect_info.checked = detect_info.found = detect_info.rejected = 0; | 5795 detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 5795 coding->head_ascii = -1; | |
| 5796 for (src = coding->source; src < src_end; src++) | 5796 for (src = coding->source; src < src_end; src++) |
| 5797 { | 5797 { |
| 5798 c = *src; | 5798 c = *src; |
| 5799 if (c & 0x80) | 5799 if (c & 0x80) |
| 5800 { | 5800 { |
| 5801 eight_bit_found = 1; | 5801 eight_bit_found = 1; |
| 5802 if (coding->head_ascii < 0) | |
| 5803 coding->head_ascii = src - coding->source; | |
| 5804 if (null_byte_found) | 5802 if (null_byte_found) |
| 5805 break; | 5803 break; |
| 5806 } | 5804 } |
| 5807 else if (c < 0x20) | 5805 else if (c < 0x20) |
| 5808 { | 5806 { |
| 5809 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | 5807 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) |
| 5810 && ! inhibit_iso_escape_detection | 5808 && ! inhibit_iso_escape_detection |
| 5811 && ! detect_info.checked) | 5809 && ! detect_info.checked) |
| 5812 { | 5810 { |
| 5813 if (coding->head_ascii < 0) | |
| 5814 coding->head_ascii = src - coding->source; | |
| 5815 if (detect_coding_iso_2022 (coding, &detect_info)) | 5811 if (detect_coding_iso_2022 (coding, &detect_info)) |
| 5816 { | 5812 { |
| 5817 /* We have scanned the whole data. */ | 5813 /* We have scanned the whole data. */ |
| 5818 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 5814 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) |
| 5819 /* We didn't find an 8-bit code. We may have | 5815 { |
| 5820 found a null-byte, but it's very rare that | 5816 /* We didn't find an 8-bit code. We may |
| 5821 a binary file confirm to ISO-2022. */ | 5817 have found a null-byte, but it's very |
| 5822 src = src_end; | 5818 rare that a binary file confirm to |
| 5819 ISO-2022. */ | |
| 5820 src = src_end; | |
| 5821 coding->head_ascii = src - coding->source; | |
| 5822 } | |
| 5823 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE; | |
| 5823 break; | 5824 break; |
| 5824 } | 5825 } |
| 5825 } | 5826 } |
| 5826 else if (! c) | 5827 else if (! c) |
| 5827 { | 5828 { |
| 5828 null_byte_found = 1; | 5829 null_byte_found = 1; |
| 5829 if (eight_bit_found) | 5830 if (eight_bit_found) |
| 5830 break; | 5831 break; |
| 5831 } | 5832 } |
| 5833 coding->head_ascii++; | |
| 5832 } | 5834 } |
| 5833 } | 5835 else |
| 5834 if (coding->head_ascii < 0) | 5836 coding->head_ascii++; |
| 5835 coding->head_ascii = src - coding->source; | 5837 } |
| 5836 | 5838 |
| 5837 if (null_byte_found || eight_bit_found | 5839 if (null_byte_found || eight_bit_found |
| 5838 || coding->head_ascii < coding->src_bytes | 5840 || coding->head_ascii < coding->src_bytes |
| 5839 || detect_info.found) | 5841 || detect_info.found) |
| 5840 { | 5842 { |
| 5884 category = coding_category_utf_16_be; | 5886 category = coding_category_utf_16_be; |
| 5885 } | 5887 } |
| 5886 break; | 5888 break; |
| 5887 } | 5889 } |
| 5888 } | 5890 } |
| 5889 | |
| 5890 if (i < coding_category_raw_text) | |
| 5891 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5892 else if (null_byte_found) | |
| 5893 setup_coding_system (Qno_conversion, coding); | |
| 5894 else if ((detect_info.rejected & CATEGORY_MASK_ANY) | |
| 5895 == CATEGORY_MASK_ANY) | |
| 5896 setup_coding_system (Qraw_text, coding); | |
| 5897 else if (detect_info.rejected) | |
| 5898 for (i = 0; i < coding_category_raw_text; i++) | |
| 5899 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | |
| 5900 { | |
| 5901 this = coding_categories + coding_priorities[i]; | |
| 5902 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5903 break; | |
| 5904 } | |
| 5905 } | 5891 } |
| 5892 | |
| 5893 if (i < coding_category_raw_text) | |
| 5894 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5895 else if (null_byte_found) | |
| 5896 setup_coding_system (Qno_conversion, coding); | |
| 5897 else if ((detect_info.rejected & CATEGORY_MASK_ANY) | |
| 5898 == CATEGORY_MASK_ANY) | |
| 5899 setup_coding_system (Qraw_text, coding); | |
| 5900 else if (detect_info.rejected) | |
| 5901 for (i = 0; i < coding_category_raw_text; i++) | |
| 5902 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | |
| 5903 { | |
| 5904 this = coding_categories + coding_priorities[i]; | |
| 5905 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5906 break; | |
| 5907 } | |
| 5906 } | 5908 } |
| 5907 } | 5909 } |
| 5908 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 5910 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| 5909 == coding_category_utf_8_auto) | 5911 == coding_category_utf_8_auto) |
| 5910 { | 5912 { |
| 7653 coding.src_chars = src_chars; | 7655 coding.src_chars = src_chars; |
| 7654 coding.src_bytes = src_bytes; | 7656 coding.src_bytes = src_bytes; |
| 7655 coding.src_multibyte = multibytep; | 7657 coding.src_multibyte = multibytep; |
| 7656 coding.consumed = 0; | 7658 coding.consumed = 0; |
| 7657 coding.mode |= CODING_MODE_LAST_BLOCK; | 7659 coding.mode |= CODING_MODE_LAST_BLOCK; |
| 7660 coding.head_ascii = 0; | |
| 7658 | 7661 |
| 7659 detect_info.checked = detect_info.found = detect_info.rejected = 0; | 7662 detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 7660 | 7663 |
| 7661 /* At first, detect text-format if necessary. */ | 7664 /* At first, detect text-format if necessary. */ |
| 7662 base_category = XINT (CODING_ATTR_CATEGORY (attrs)); | 7665 base_category = XINT (CODING_ATTR_CATEGORY (attrs)); |
| 7664 { | 7667 { |
| 7665 enum coding_category category; | 7668 enum coding_category category; |
| 7666 struct coding_system *this; | 7669 struct coding_system *this; |
| 7667 int c, i; | 7670 int c, i; |
| 7668 | 7671 |
| 7669 coding.head_ascii = -1; | |
| 7670 /* Skip all ASCII bytes except for a few ISO2022 controls. */ | 7672 /* Skip all ASCII bytes except for a few ISO2022 controls. */ |
| 7671 for (; src < src_end; src++) | 7673 for (; src < src_end; src++) |
| 7672 { | 7674 { |
| 7673 c = *src; | 7675 c = *src; |
| 7674 if (c & 0x80) | 7676 if (c & 0x80) |
| 7675 { | 7677 { |
| 7676 eight_bit_found = 1; | 7678 eight_bit_found = 1; |
| 7677 if (coding.head_ascii < 0) | |
| 7678 coding.head_ascii = src - coding.source; | |
| 7679 if (null_byte_found) | 7679 if (null_byte_found) |
| 7680 break; | 7680 break; |
| 7681 } | 7681 } |
| 7682 if (c < 0x20) | 7682 else if (c < 0x20) |
| 7683 { | 7683 { |
| 7684 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | 7684 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) |
| 7685 && ! inhibit_iso_escape_detection | 7685 && ! inhibit_iso_escape_detection |
| 7686 && ! detect_info.checked) | 7686 && ! detect_info.checked) |
| 7687 { | 7687 { |
| 7688 if (coding.head_ascii < 0) | |
| 7689 coding.head_ascii = src - coding.source; | |
| 7690 if (detect_coding_iso_2022 (&coding, &detect_info)) | 7688 if (detect_coding_iso_2022 (&coding, &detect_info)) |
| 7691 { | 7689 { |
| 7692 /* We have scanned the whole data. */ | 7690 /* We have scanned the whole data. */ |
| 7693 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 7691 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) |
| 7694 /* We didn't find an 8-bit code. We may have | 7692 { |
| 7695 found a null-byte, but it's very rare that | 7693 /* We didn't find an 8-bit code. We may |
| 7696 a binary file confirm to ISO-2022. */ | 7694 have found a null-byte, but it's very |
| 7697 src = src_end; | 7695 rare that a binary file confirm to |
| 7696 ISO-2022. */ | |
| 7697 src = src_end; | |
| 7698 coding.head_ascii = src - coding.source; | |
| 7699 } | |
| 7700 detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE; | |
| 7698 break; | 7701 break; |
| 7699 } | 7702 } |
| 7700 } | 7703 } |
| 7701 else if (! c) | 7704 else if (! c) |
| 7702 { | 7705 { |
| 7703 null_byte_found = 1; | 7706 null_byte_found = 1; |
| 7704 if (eight_bit_found) | 7707 if (eight_bit_found) |
| 7705 break; | 7708 break; |
| 7706 } | 7709 } |
| 7710 coding.head_ascii++; | |
| 7707 } | 7711 } |
| 7708 } | 7712 else |
| 7709 if (coding.head_ascii < 0) | 7713 coding.head_ascii++; |
| 7710 coding.head_ascii = src - coding.source; | 7714 } |
| 7711 | 7715 |
| 7712 if (null_byte_found || eight_bit_found | 7716 if (null_byte_found || eight_bit_found |
| 7713 || coding.head_ascii < coding.src_bytes | 7717 || coding.head_ascii < coding.src_bytes |
| 7714 || detect_info.found) | 7718 || detect_info.found) |
| 7715 { | 7719 { |
