Mercurial > emacs
comparison src/coding.c @ 89974:2fef8edebd5c
(detect_coding_utf_16): Don't set detect_info->found if BOM is not found.
(detect_coding): Optimization for ISO-2022 when no 8-bit data is found.
(detect_coding_system): Likewise.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Mon, 13 Sep 2004 00:41:15 +0000 |
| parents | 48af0ea7d387 |
| children | 0217b5bc6404 |
Comparison legend: equal, deleted, inserted, replaced
| 89973:c42bb1e5b6f3 | 89974:2fef8edebd5c |
|---|---|
| 1416 | CATEGORY_MASK_UTF_16_BE_NOSIG | 1416 | CATEGORY_MASK_UTF_16_BE_NOSIG |
| 1417 | CATEGORY_MASK_UTF_16_LE_NOSIG); | 1417 | CATEGORY_MASK_UTF_16_LE_NOSIG); |
| 1418 } | 1418 } |
| 1419 else if (c1 >= 0 && c2 >= 0) | 1419 else if (c1 >= 0 && c2 >= 0) |
| 1420 { | 1420 { |
| 1421 unsigned char b1[256], b2[256]; | |
| 1422 int b1_variants = 1, b2_variants = 1; | |
| 1423 int n; | |
| 1424 | |
| 1425 bzero (b1, 256), bzero (b2, 256); | |
| 1426 b1[c1]++, b2[c2]++; | |
| 1427 for (n = 0; n < 256 && src < src_end; n++) | |
| 1428 { | |
| 1429 src_base = src; | |
| 1430 ONE_MORE_BYTE (c1); | |
| 1431 ONE_MORE_BYTE (c2); | |
| 1432 if (c1 < 0 || c2 < 0) | |
| 1433 break; | |
| 1434 if (! b1[c1++]) b1_variants++; | |
| 1435 if (! b2[c2++]) b2_variants++; | |
| 1436 } | |
| 1437 if (b1_variants < b2_variants) | |
| 1438 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG; | |
| 1439 else | |
| 1440 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG; | |
| 1441 detect_info->rejected | 1421 detect_info->rejected |
| 1442 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); | 1422 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); |
| 1443 } | 1423 } |
| 1444 no_more_source: | 1424 no_more_source: |
| 1445 return 1; | 1425 return 1; |
| 5419 /* If we have not yet decided the text encoding type, detect it | 5399 /* If we have not yet decided the text encoding type, detect it |
| 5420 now. */ | 5400 now. */ |
| 5421 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) | 5401 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) |
| 5422 { | 5402 { |
| 5423 int c, i; | 5403 int c, i; |
| 5424 | 5404 struct coding_detection_info detect_info; |
| 5405 | |
| 5406 detect_info.checked = detect_info.found = detect_info.rejected = 0; | |
| 5425 for (i = 0, src = coding->source; src < src_end; i++, src++) | 5407 for (i = 0, src = coding->source; src < src_end; i++, src++) |
| 5426 { | 5408 { |
| 5427 c = *src; | 5409 c = *src; |
| 5428 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC | 5410 if (c & 0x80) |
| 5429 || c == ISO_CODE_SI | |
| 5430 || c == ISO_CODE_SO))) | |
| 5431 break; | 5411 break; |
| 5432 } | 5412 if (c < 0x20 |
| 5433 coding->head_ascii = src - (coding->source + coding->consumed); | 5413 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) |
| 5434 | 5414 && ! inhibit_iso_escape_detection |
| 5435 if (coding->head_ascii < coding->src_bytes) | 5415 && ! detect_info.checked) |
| 5436 { | |
| 5437 struct coding_detection_info detect_info; | |
| 5438 enum coding_category category; | |
| 5439 struct coding_system *this; | |
| 5440 | |
| 5441 detect_info.checked = detect_info.found = detect_info.rejected = 0; | |
| 5442 for (i = 0; i < coding_category_raw_text; i++) | |
| 5443 { | 5416 { |
| 5444 category = coding_priorities[i]; | 5417 coding->head_ascii = src - (coding->source + coding->consumed); |
| 5445 this = coding_categories + category; | 5418 if (detect_coding_iso_2022 (coding, &detect_info)) |
| 5446 if (this->id < 0) | |
| 5447 { | 5419 { |
| 5448 /* No coding system of this category is defined. */ | 5420 /* We have scanned the whole data. */ |
| 5449 detect_info.rejected |= (1 << category); | 5421 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) |
| 5450 } | 5422 /* We didn't find an 8-bit code. */ |
| 5451 else if (category >= coding_category_raw_text) | 5423 src = src_end; |
| 5452 continue; | |
| 5453 else if (detect_info.checked & (1 << category)) | |
| 5454 { | |
| 5455 if (detect_info.found & (1 << category)) | |
| 5456 break; | |
| 5457 } | |
| 5458 else if ((*(this->detector)) (coding, &detect_info) | |
| 5459 && detect_info.found & (1 << category)) | |
| 5460 { | |
| 5461 if (category == coding_category_utf_16_auto) | |
| 5462 { | |
| 5463 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5464 category = coding_category_utf_16_le; | |
| 5465 else | |
| 5466 category = coding_category_utf_16_be; | |
| 5467 } | |
| 5468 break; | 5424 break; |
| 5469 } | 5425 } |
| 5470 } | 5426 } |
| 5427 } | |
| 5428 coding->head_ascii = src - (coding->source + coding->consumed); | |
| 5429 | |
| 5430 if (coding->head_ascii == coding->src_bytes | |
| 5431 || detect_info.found) | |
| 5432 { | |
| 5433 enum coding_category category; | |
| 5434 struct coding_system *this; | |
| 5435 | |
| 5436 if (coding->head_ascii == coding->src_bytes) | |
| 5437 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ | |
| 5438 for (i = 0; i < coding_category_raw_text; i++) | |
| 5439 { | |
| 5440 category = coding_priorities[i]; | |
| 5441 this = coding_categories + category; | |
| 5442 if (detect_info.found & (1 << category)) | |
| 5443 break; | |
| 5444 } | |
| 5445 else | |
| 5446 for (i = 0; i < coding_category_raw_text; i++) | |
| 5447 { | |
| 5448 category = coding_priorities[i]; | |
| 5449 this = coding_categories + category; | |
| 5450 if (this->id < 0) | |
| 5451 { | |
| 5452 /* No coding system of this category is defined. */ | |
| 5453 detect_info.rejected |= (1 << category); | |
| 5454 } | |
| 5455 else if (category >= coding_category_raw_text) | |
| 5456 continue; | |
| 5457 else if (detect_info.checked & (1 << category)) | |
| 5458 { | |
| 5459 if (detect_info.found & (1 << category)) | |
| 5460 break; | |
| 5461 } | |
| 5462 else if ((*(this->detector)) (coding, &detect_info) | |
| 5463 && detect_info.found & (1 << category)) | |
| 5464 { | |
| 5465 if (category == coding_category_utf_16_auto) | |
| 5466 { | |
| 5467 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5468 category = coding_category_utf_16_le; | |
| 5469 else | |
| 5470 category = coding_category_utf_16_be; | |
| 5471 } | |
| 5472 break; | |
| 5473 } | |
| 5474 } | |
| 5475 | |
| 5471 if (i < coding_category_raw_text) | 5476 if (i < coding_category_raw_text) |
| 5472 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5477 setup_coding_system (CODING_ID_NAME (this->id), coding); |
| 5473 else if (detect_info.rejected == CATEGORY_MASK_ANY) | 5478 else if (detect_info.rejected == CATEGORY_MASK_ANY) |
| 5474 setup_coding_system (Qraw_text, coding); | 5479 setup_coding_system (Qraw_text, coding); |
| 5475 else if (detect_info.rejected) | 5480 else if (detect_info.rejected) |
| 7118 | 7123 |
| 7119 /* Skip all ASCII bytes except for a few ISO2022 controls. */ | 7124 /* Skip all ASCII bytes except for a few ISO2022 controls. */ |
| 7120 for (i = 0; src < src_end; i++, src++) | 7125 for (i = 0; src < src_end; i++, src++) |
| 7121 { | 7126 { |
| 7122 c = *src; | 7127 c = *src; |
| 7123 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC | 7128 if (c & 0x80) |
| 7124 || c == ISO_CODE_SI | |
| 7125 || c == ISO_CODE_SO))) | |
| 7126 break; | 7129 break; |
| 7130 if (c < 0x20 | |
| 7131 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO) | |
| 7132 && inhibit_iso_escape_detection) | |
| 7133 { | |
| 7134 coding.head_ascii = src - coding.source; | |
| 7135 if (detect_coding_iso_2022 (&coding, &detect_info)) | |
| 7136 { | |
| 7137 /* We have scanned the whole data. */ | |
| 7138 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE)) | |
| 7139 /* We didn't find an 8-bit code. */ | |
| 7140 src = src_end; | |
| 7141 break; | |
| 7142 } | |
| 7143 } | |
| 7127 } | 7144 } |
| 7128 coding.head_ascii = src - coding.source; | 7145 coding.head_ascii = src - coding.source; |
| 7129 | 7146 |
| 7130 if (src < src_end) | 7147 if (src < src_end |
| 7131 for (i = 0; i < coding_category_raw_text; i++) | 7148 || detect_info.found) |
| 7132 { | 7149 { |
| 7133 category = coding_priorities[i]; | 7150 if (src == src_end) |
| 7134 this = coding_categories + category; | 7151 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */ |
| 7135 | 7152 for (i = 0; i < coding_category_raw_text; i++) |
| 7136 if (this->id < 0) | |
| 7137 { | 7153 { |
| 7138 /* No coding system of this category is defined. */ | 7154 category = coding_priorities[i]; |
| 7139 detect_info.rejected |= (1 << category); | 7155 if (detect_info.found & (1 << category)) |
| 7140 } | |
| 7141 else if (category >= coding_category_raw_text) | |
| 7142 continue; | |
| 7143 else if (detect_info.checked & (1 << category)) | |
| 7144 { | |
| 7145 if (highest | |
| 7146 && (detect_info.found & (1 << category))) | |
| 7147 break; | 7156 break; |
| 7148 } | 7157 } |
| 7149 else | 7158 else |
| 7159 for (i = 0; i < coding_category_raw_text; i++) | |
| 7150 { | 7160 { |
| 7151 if ((*(this->detector)) (&coding, &detect_info) | 7161 category = coding_priorities[i]; |
| 7152 && highest | 7162 this = coding_categories + category; |
| 7153 && (detect_info.found & (1 << category))) | 7163 |
| 7164 if (this->id < 0) | |
| 7154 { | 7165 { |
| 7155 if (category == coding_category_utf_16_auto) | 7166 /* No coding system of this category is defined. */ |
| 7167 detect_info.rejected |= (1 << category); | |
| 7168 } | |
| 7169 else if (category >= coding_category_raw_text) | |
| 7170 continue; | |
| 7171 else if (detect_info.checked & (1 << category)) | |
| 7172 { | |
| 7173 if (highest | |
| 7174 && (detect_info.found & (1 << category))) | |
| 7175 break; | |
| 7176 } | |
| 7177 else | |
| 7178 { | |
| 7179 if ((*(this->detector)) (&coding, &detect_info) | |
| 7180 && highest | |
| 7181 && (detect_info.found & (1 << category))) | |
| 7156 { | 7182 { |
| 7157 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | 7183 if (category == coding_category_utf_16_auto) |
| 7158 category = coding_category_utf_16_le; | 7184 { |
| 7159 else | 7185 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) |
| 7160 category = coding_category_utf_16_be; | 7186 category = coding_category_utf_16_le; |
| 7187 else | |
| 7188 category = coding_category_utf_16_be; | |
| 7189 } | |
| 7190 break; | |
| 7161 } | 7191 } |
| 7162 break; | |
| 7163 } | 7192 } |
| 7164 } | 7193 } |
| 7165 } | 7194 } |
| 7166 | 7195 |
| 7167 if (detect_info.rejected == CATEGORY_MASK_ANY) | 7196 if (detect_info.rejected == CATEGORY_MASK_ANY) |
| 7168 { | 7197 { |
| 7169 detect_info.found = CATEGORY_MASK_RAW_TEXT; | 7198 detect_info.found = CATEGORY_MASK_RAW_TEXT; |
| 7170 id = coding_categories[coding_category_raw_text].id; | 7199 id = coding_categories[coding_category_raw_text].id; |
