Mercurial > emacs
comparison src/coding.c @ 93595:ac4d127a841a
(CATEGORY_MASK_ANY): Add CATEGORY_MASK_UTF_16_AUTO.
(CATEGORY_MASK_UTF_16): Likewise.
(detect_coding_utf_16): Add heuristics to reject utf-16 for a
binary file.
(detect_coding): Add null-byte detection for a binary file.
(detect_coding_system): Likewise.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Thu, 03 Apr 2008 12:30:02 +0000 |
| parents | 06e93ffa2e9f |
| children | 62d97ebb13a9 |
comparison
equal
deleted
inserted
replaced
| 93594:46b8fe649bbc | 93595:ac4d127a841a |
|---|---|
| 623 | CATEGORY_MASK_ISO_8_1 \ | 623 | CATEGORY_MASK_ISO_8_1 \ |
| 624 | CATEGORY_MASK_ISO_8_2 \ | 624 | CATEGORY_MASK_ISO_8_2 \ |
| 625 | CATEGORY_MASK_ISO_7_ELSE \ | 625 | CATEGORY_MASK_ISO_7_ELSE \ |
| 626 | CATEGORY_MASK_ISO_8_ELSE \ | 626 | CATEGORY_MASK_ISO_8_ELSE \ |
| 627 | CATEGORY_MASK_UTF_8 \ | 627 | CATEGORY_MASK_UTF_8 \ |
| 628 | CATEGORY_MASK_UTF_16_AUTO \ | |
| 628 | CATEGORY_MASK_UTF_16_BE \ | 629 | CATEGORY_MASK_UTF_16_BE \ |
| 629 | CATEGORY_MASK_UTF_16_LE \ | 630 | CATEGORY_MASK_UTF_16_LE \ |
| 630 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 631 | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
| 631 | CATEGORY_MASK_UTF_16_LE_NOSIG \ | 632 | CATEGORY_MASK_UTF_16_LE_NOSIG \ |
| 632 | CATEGORY_MASK_CHARSET \ | 633 | CATEGORY_MASK_CHARSET \ |
| 655 ( CATEGORY_MASK_ISO_7BIT \ | 656 ( CATEGORY_MASK_ISO_7BIT \ |
| 656 | CATEGORY_MASK_ISO_8BIT \ | 657 | CATEGORY_MASK_ISO_8BIT \ |
| 657 | CATEGORY_MASK_ISO_ELSE) | 658 | CATEGORY_MASK_ISO_ELSE) |
| 658 | 659 |
| 659 #define CATEGORY_MASK_UTF_16 \ | 660 #define CATEGORY_MASK_UTF_16 \ |
| 660 (CATEGORY_MASK_UTF_16_BE \ | 661 (CATEGORY_MASK_UTF_16_AUTO \ |
| 662 | CATEGORY_MASK_UTF_16_BE \ | |
| 661 | CATEGORY_MASK_UTF_16_LE \ | 663 | CATEGORY_MASK_UTF_16_LE \ |
| 662 | CATEGORY_MASK_UTF_16_BE_NOSIG \ | 664 | CATEGORY_MASK_UTF_16_BE_NOSIG \ |
| 663 | CATEGORY_MASK_UTF_16_LE_NOSIG) | 665 | CATEGORY_MASK_UTF_16_LE_NOSIG) |
| 664 | 666 |
| 665 | 667 |
| 1511 | CATEGORY_MASK_UTF_16_AUTO); | 1513 | CATEGORY_MASK_UTF_16_AUTO); |
| 1512 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE | 1514 detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE |
| 1513 | CATEGORY_MASK_UTF_16_BE_NOSIG | 1515 | CATEGORY_MASK_UTF_16_BE_NOSIG |
| 1514 | CATEGORY_MASK_UTF_16_LE_NOSIG); | 1516 | CATEGORY_MASK_UTF_16_LE_NOSIG); |
| 1515 } | 1517 } |
| 1516 else if (c1 >= 0 && c2 >= 0) | 1518 else |
| 1517 { | 1519 { |
| 1520 /* We check the dispersion of Eth and Oth bytes where E is even and | |
| 1521 O is odd. If both are high, we assume binary data.*/ | |
| 1522 unsigned char e[256], o[256]; | |
| 1523 unsigned e_num = 1, o_num = 1; | |
| 1524 | |
| 1525 memset (e, 0, 256); | |
| 1526 memset (o, 0, 256); | |
| 1527 e[c1] = 1; | |
| 1528 o[c2] = 1; | |
| 1529 | |
| 1518 detect_info->rejected | 1530 detect_info->rejected |
| 1519 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); | 1531 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); |
| 1520 } | 1532 |
| 1533 while (1) | |
| 1534 { | |
| 1535 ONE_MORE_BYTE (c1); | |
| 1536 ONE_MORE_BYTE (c2); | |
| 1537 if (! e[c1]) | |
| 1538 { | |
| 1539 e[c1] = 1; | |
| 1540 e_num++; | |
| 1541 if (e_num >= 128) | |
| 1542 break; | |
| 1543 } | |
| 1544 if (! o[c2]) | |
| 1545 { | |
| 1546 o[c1] = 1; | |
| 1547 o_num++; | |
| 1548 if (o_num >= 128) | |
| 1549 break; | |
| 1550 } | |
| 1551 } | |
| 1552 detect_info->rejected |= CATEGORY_MASK_UTF_16; | |
| 1553 return 0; | |
| 1554 } | |
| 1555 | |
| 1521 no_more_source: | 1556 no_more_source: |
| 1522 return 1; | 1557 return 1; |
| 1523 } | 1558 } |
| 1524 | 1559 |
| 1525 static void | 1560 static void |
| 5675 now. */ | 5710 now. */ |
| 5676 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 5711 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
| 5677 { | 5712 { |
| 5678 int c, i; | 5713 int c, i; |
| 5679 struct coding_detection_info detect_info; | 5714 struct coding_detection_info detect_info; |
| 5715 int null_byte_found = 0, eight_bit_found = 0; | |
| 5680 | 5716 |
| 5681 detect_info.checked = detect_info.found = detect_info.rejected = 0; | 5717 detect_info.checked = detect_info.found = detect_info.rejected = 0; |
| 5682 for (i = 0, src = coding->source; src < src_end; i++, src++) | 5718 coding->head_ascii = -1; |
| 5719 for (src = coding->source; src < src_end; src++) | |
| 5683 { | 5720 { |
| 5684 c = *src; | 5721 c = *src; |
| 5685 if (c & 0x80) | 5722 if (c & 0x80) |
| 5686 break; | |
| 5687 if (c < 0x20 | |
| 5688 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 5689 && ! inhibit_iso_escape_detection | |
| 5690 && ! detect_info.checked) | |
| 5691 { | 5723 { |
| 5692 coding->head_ascii = src - (coding->source + coding->consumed); | 5724 eight_bit_found = 1; |
| 5693 if (detect_coding_iso_2022 (coding, &detect_info)) | 5725 if (coding->head_ascii < 0) |
| 5726 coding->head_ascii = src - coding->source; | |
| 5727 if (null_byte_found) | |
| 5728 break; | |
| 5729 } | |
| 5730 else if (c < 0x20) | |
| 5731 { | |
| 5732 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 5733 && ! inhibit_iso_escape_detection | |
| 5734 && ! detect_info.checked) | |
| 5694 { | 5735 { |
| 5695 /* We have scanned the whole data. */ | 5736 if (coding->head_ascii < 0) |
| 5696 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 5737 coding->head_ascii = src - coding->source; |
| 5697 /* We didn't find an 8-bit code. */ | 5738 if (detect_coding_iso_2022 (coding, &detect_info)) |
| 5698 src = src_end; | 5739 { |
| 5699 break; | 5740 /* We have scanned the whole data. */ |
| 5741 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | |
| 5742 /* We didn't find an 8-bit code. We may have | |
| 5743 found a null-byte, but it's very rare that | |
| 5744 a binary file conforms to ISO-2022. */ | |
| 5745 src = src_end; | |
| 5746 break; | |
| 5747 } | |
| 5748 } | |
| 5749 else if (! c) | |
| 5750 { | |
| 5751 null_byte_found = 1; | |
| 5752 if (eight_bit_found) | |
| 5753 break; | |
| 5700 } | 5754 } |
| 5701 } | 5755 } |
| 5702 } | 5756 } |
| 5703 coding->head_ascii = src - (coding->source + coding->consumed); | 5757 if (coding->head_ascii < 0) |
| 5704 | 5758 coding->head_ascii = src - coding->source; |
| 5705 if (coding->head_ascii < coding->src_bytes | 5759 |
| 5760 if (null_byte_found || eight_bit_found | |
| 5761 || coding->head_ascii < coding->src_bytes | |
| 5706 || detect_info.found) | 5762 || detect_info.found) |
| 5707 { | 5763 { |
| 5708 enum coding_category category; | 5764 enum coding_category category; |
| 5709 struct coding_system *this; | 5765 struct coding_system *this; |
| 5710 | 5766 |
| 5716 this = coding_categories + category; | 5772 this = coding_categories + category; |
| 5717 if (detect_info.found & (1 << category)) | 5773 if (detect_info.found & (1 << category)) |
| 5718 break; | 5774 break; |
| 5719 } | 5775 } |
| 5720 else | 5776 else |
| 5721 for (i = 0; i < coding_category_raw_text; i++) | 5777 { |
| 5722 { | 5778 if (null_byte_found) |
| 5723 category = coding_priorities[i]; | 5779 { |
| 5724 this = coding_categories + category; | 5780 detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
| 5725 if (this->id < 0) | 5781 detect_info.rejected |= ~CATEGORY_MASK_UTF_16; |
| 5726 { | 5782 } |
| 5727 /* No coding system of this category is defined. */ | 5783 for (i = 0; i < coding_category_raw_text; i++) |
| 5728 detect_info.rejected |= (1 << category); | 5784 { |
| 5729 } | 5785 category = coding_priorities[i]; |
| 5730 else if (category >= coding_category_raw_text) | 5786 this = coding_categories + category; |
| 5731 continue; | 5787 if (this->id < 0) |
| 5732 else if (detect_info.checked & (1 << category)) | 5788 { |
| 5733 { | 5789 /* No coding system of this category is defined. */ |
| 5734 if (detect_info.found & (1 << category)) | 5790 detect_info.rejected |= (1 << category); |
| 5791 } | |
| 5792 else if (category >= coding_category_raw_text) | |
| 5793 continue; | |
| 5794 else if (detect_info.checked & (1 << category)) | |
| 5795 { | |
| 5796 if (detect_info.found & (1 << category)) | |
| 5797 break; | |
| 5798 } | |
| 5799 else if ((*(this->detector)) (coding, &detect_info) | |
| 5800 && detect_info.found & (1 << category)) | |
| 5801 { | |
| 5802 if (category == coding_category_utf_16_auto) | |
| 5803 { | |
| 5804 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5805 category = coding_category_utf_16_le; | |
| 5806 else | |
| 5807 category = coding_category_utf_16_be; | |
| 5808 } | |
| 5735 break; | 5809 break; |
| 5736 } | 5810 } |
| 5737 else if ((*(this->detector)) (coding, &detect_info) | 5811 } |
| 5738 && detect_info.found & (1 << category)) | |
| 5739 { | |
| 5740 if (category == coding_category_utf_16_auto) | |
| 5741 { | |
| 5742 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5743 category = coding_category_utf_16_le; | |
| 5744 else | |
| 5745 category = coding_category_utf_16_be; | |
| 5746 } | |
| 5747 break; | |
| 5748 } | |
| 5749 } | |
| 5750 | 5812 |
| 5751 if (i < coding_category_raw_text) | 5813 if (i < coding_category_raw_text) |
| 5752 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5814 setup_coding_system (CODING_ID_NAME (this->id), coding); |
| 5753 else if (detect_info.rejected == CATEGORY_MASK_ANY) | 5815 else if (null_byte_found) |
| 5754 setup_coding_system (Qraw_text, coding); | 5816 setup_coding_system (Qno_conversion, coding); |
| 5755 else if (detect_info.rejected) | 5817 else if ((detect_info.rejected & CATEGORY_MASK_ANY) |
| 5756 for (i = 0; i < coding_category_raw_text; i++) | 5818 == CATEGORY_MASK_ANY) |
| 5757 if (! (detect_info.rejected & (1 << coding_priorities[i]))) | 5819 setup_coding_system (Qraw_text, coding); |
| 5758 { | 5820 else if (detect_info.rejected) |
| 5759 this = coding_categories + coding_priorities[i]; | 5821 for (i = 0; i < coding_category_raw_text; i++) |
| 5760 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5822 if (! (detect_info.rejected & (1 << coding_priorities[i]))) |
| 5761 break; | 5823 { |
| 5762 } | 5824 this = coding_categories + coding_priorities[i]; |
| 5825 setup_coding_system (CODING_ID_NAME (this->id), coding); | |
| 5826 break; | |
| 5827 } | |
| 5828 } | |
| 5763 } | 5829 } |
| 5764 } | 5830 } |
| 5765 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) | 5831 else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id))) |
| 5766 == coding_category_utf_16_auto) | 5832 == coding_category_utf_16_auto) |
| 5767 { | 5833 { |
| 7470 Lisp_Object val; | 7536 Lisp_Object val; |
| 7471 struct coding_system coding; | 7537 struct coding_system coding; |
| 7472 int id; | 7538 int id; |
| 7473 struct coding_detection_info detect_info; | 7539 struct coding_detection_info detect_info; |
| 7474 enum coding_category base_category; | 7540 enum coding_category base_category; |
| 7541 int null_byte_found = 0, eight_bit_found = 0; | |
| 7475 | 7542 |
| 7476 if (NILP (coding_system)) | 7543 if (NILP (coding_system)) |
| 7477 coding_system = Qundecided; | 7544 coding_system = Qundecided; |
| 7478 setup_coding_system (coding_system, &coding); | 7545 setup_coding_system (coding_system, &coding); |
| 7479 attrs = CODING_ID_ATTRS (coding.id); | 7546 attrs = CODING_ID_ATTRS (coding.id); |
| 7495 { | 7562 { |
| 7496 enum coding_category category; | 7563 enum coding_category category; |
| 7497 struct coding_system *this; | 7564 struct coding_system *this; |
| 7498 int c, i; | 7565 int c, i; |
| 7499 | 7566 |
| 7567 coding.head_ascii = -1; | |
| 7500 /* Skip all ASCII bytes except for a few ISO2022 controls. */ | 7568 /* Skip all ASCII bytes except for a few ISO2022 controls. */ |
| 7501 for (i = 0; src < src_end; i++, src++) | 7569 for (; src < src_end; src++) |
| 7502 { | 7570 { |
| 7503 c = *src; | 7571 c = *src; |
| 7504 if (c & 0x80) | 7572 if (c & 0x80) |
| 7505 break; | |
| 7506 if (c < 0x20 | |
| 7507 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 7508 && ! inhibit_iso_escape_detection) | |
| 7509 { | 7573 { |
| 7510 coding.head_ascii = src - coding.source; | 7574 eight_bit_found = 1; |
| 7511 if (detect_coding_iso_2022 (&coding, &detect_info)) | 7575 if (coding.head_ascii < 0) |
| 7576 coding.head_ascii = src - coding.source; | |
| 7577 if (null_byte_found) | |
| 7578 break; | |
| 7579 } | |
| 7580 if (c < 0x20) | |
| 7581 { | |
| 7582 if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 7583 && ! inhibit_iso_escape_detection | |
| 7584 && ! detect_info.checked) | |
| 7512 { | 7585 { |
| 7513 /* We have scanned the whole data. */ | 7586 if (coding.head_ascii < 0) |
| 7514 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | 7587 coding.head_ascii = src - coding.source; |
| 7515 /* We didn't find an 8-bit code. */ | 7588 if (detect_coding_iso_2022 (&coding, &detect_info)) |
| 7516 src = src_end; | 7589 { |
| 7517 break; | 7590 /* We have scanned the whole data. */ |
| 7591 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | |
| 7592 /* We didn't find an 8-bit code. We may have | |
| 7593 found a null-byte, but it's very rare that | |
| 7594 a binary file conforms to ISO-2022. */ | |
| 7595 src = src_end; | |
| 7596 break; | |
| 7597 } | |
| 7598 } | |
| 7599 else if (! c) | |
| 7600 { | |
| 7601 null_byte_found = 1; | |
| 7602 if (eight_bit_found) | |
| 7603 break; | |
| 7518 } | 7604 } |
| 7519 } | 7605 } |
| 7520 } | 7606 } |
| 7521 coding.head_ascii = src - coding.source; | 7607 if (coding.head_ascii < 0) |
| 7522 | 7608 coding.head_ascii = src - coding.source; |
| 7523 if (src < src_end | 7609 |
| 7610 if (null_byte_found || eight_bit_found | |
| 7611 || coding.head_ascii < coding.src_bytes | |
| 7524 || detect_info.found) | 7612 || detect_info.found) |
| 7525 { | 7613 { |
| 7526 if (src == src_end) | 7614 if (coding.head_ascii == coding.src_bytes) |
| 7527 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ | 7615 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ |
| 7528 for (i = 0; i < coding_category_raw_text; i++) | 7616 for (i = 0; i < coding_category_raw_text; i++) |
| 7529 { | 7617 { |
| 7530 category = coding_priorities[i]; | 7618 category = coding_priorities[i]; |
| 7531 this = coding_categories + category; | 7619 this = coding_categories + category; |
| 7532 if (detect_info.found & (1 << category)) | 7620 if (detect_info.found & (1 << category)) |
| 7533 break; | 7621 break; |
| 7534 } | 7622 } |
| 7535 else | 7623 else |
| 7536 for (i = 0; i < coding_category_raw_text; i++) | 7624 { |
| 7537 { | 7625 if (null_byte_found) |
| 7538 category = coding_priorities[i]; | 7626 { |
| 7539 this = coding_categories + category; | 7627 detect_info.checked |= ~CATEGORY_MASK_UTF_16; |
| 7540 | 7628 detect_info.rejected |= ~CATEGORY_MASK_UTF_16; |
| 7541 if (this->id < 0) | 7629 } |
| 7542 { | 7630 for (i = 0; i < coding_category_raw_text; i++) |
| 7543 /* No coding system of this category is defined. */ | 7631 { |
| 7544 detect_info.rejected |= (1 << category); | 7632 category = coding_priorities[i]; |
| 7545 } | 7633 this = coding_categories + category; |
| 7546 else if (category >= coding_category_raw_text) | 7634 |
| 7547 continue; | 7635 if (this->id < 0) |
| 7548 else if (detect_info.checked & (1 << category)) | 7636 { |
| 7549 { | 7637 /* No coding system of this category is defined. */ |
| 7550 if (highest | 7638 detect_info.rejected |= (1 << category); |
| 7551 && (detect_info.found & (1 << category))) | 7639 } |
| 7640 else if (category >= coding_category_raw_text) | |
| 7641 continue; | |
| 7642 else if (detect_info.checked & (1 << category)) | |
| 7643 { | |
| 7644 if (highest | |
| 7645 && (detect_info.found & (1 << category))) | |
| 7646 break; | |
| 7647 } | |
| 7648 else if ((*(this->detector)) (&coding, &detect_info) | |
| 7649 && highest | |
| 7650 && (detect_info.found & (1 << category))) | |
| 7651 { | |
| 7652 if (category == coding_category_utf_16_auto) | |
| 7653 { | |
| 7654 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 7655 category = coding_category_utf_16_le; | |
| 7656 else | |
| 7657 category = coding_category_utf_16_be; | |
| 7658 } | |
| 7552 break; | 7659 break; |
| 7553 } | 7660 } |
| 7554 else | 7661 } |
| 7555 { | 7662 } |
| 7556 if ((*(this->detector)) (&coding, &detect_info) | 7663 } |
| 7557 && highest | 7664 |
| 7558 && (detect_info.found & (1 << category))) | 7665 if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY) |
| 7559 { | |
| 7560 if (category == coding_category_utf_16_auto) | |
| 7561 { | |
| 7562 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 7563 category = coding_category_utf_16_le; | |
| 7564 else | |
| 7565 category = coding_category_utf_16_be; | |
| 7566 } | |
| 7567 break; | |
| 7568 } | |
| 7569 } | |
| 7570 } | |
| 7571 } | |
| 7572 | |
| 7573 if (detect_info.rejected == CATEGORY_MASK_ANY) | |
| 7574 { | 7666 { |
| 7575 detect_info.found = CATEGORY_MASK_RAW_TEXT; | 7667 detect_info.found = CATEGORY_MASK_RAW_TEXT; |
| 7576 id = coding_categories[coding_category_raw_text].id; | 7668 id = coding_categories[coding_category_raw_text].id; |
| 7577 val = Fcons (make_number (id), Qnil); | 7669 val = Fcons (make_number (id), Qnil); |
| 7578 } | 7670 } |
| 7657 Lisp_Object tail; | 7749 Lisp_Object tail; |
| 7658 | 7750 |
| 7659 if (VECTORP (eol_type)) | 7751 if (VECTORP (eol_type)) |
| 7660 { | 7752 { |
| 7661 if (detect_info.found & ~CATEGORY_MASK_UTF_16) | 7753 if (detect_info.found & ~CATEGORY_MASK_UTF_16) |
| 7662 normal_eol = detect_eol (coding.source, src_bytes, | 7754 { |
| 7663 coding_category_raw_text); | 7755 if (null_byte_found) |
| 7756 normal_eol = EOL_SEEN_LF; | |
| 7757 else | |
| 7758 normal_eol = detect_eol (coding.source, src_bytes, | |
| 7759 coding_category_raw_text); | |
| 7760 } | |
| 7664 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE | 7761 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE |
| 7665 | CATEGORY_MASK_UTF_16_BE_NOSIG)) | 7762 | CATEGORY_MASK_UTF_16_BE_NOSIG)) |
| 7666 utf_16_be_eol = detect_eol (coding.source, src_bytes, | 7763 utf_16_be_eol = detect_eol (coding.source, src_bytes, |
| 7667 coding_category_utf_16_be); | 7764 coding_category_utf_16_be); |
| 7668 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE | 7765 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE |
