comparison src/coding.c @ 89974:2fef8edebd5c

(detect_coding_utf_16): Don't set detect_info->found if BOM is not found. (detect_coding): Optimization for ISO-2022 when no 8-bit data is found. (detect_coding_system): Likewise.
author Kenichi Handa <handa@m17n.org>
date Mon, 13 Sep 2004 00:41:15 +0000
parents 48af0ea7d387
children 0217b5bc6404
comparison
equal deleted inserted replaced
89973:c42bb1e5b6f3 89974:2fef8edebd5c
1416 | CATEGORY_MASK_UTF_16_BE_NOSIG 1416 | CATEGORY_MASK_UTF_16_BE_NOSIG
1417 | CATEGORY_MASK_UTF_16_LE_NOSIG); 1417 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1418 } 1418 }
1419 else if (c1 >= 0 && c2 >= 0) 1419 else if (c1 >= 0 && c2 >= 0)
1420 { 1420 {
1421 unsigned char b1[256], b2[256];
1422 int b1_variants = 1, b2_variants = 1;
1423 int n;
1424
1425 bzero (b1, 256), bzero (b2, 256);
1426 b1[c1]++, b2[c2]++;
1427 for (n = 0; n < 256 && src < src_end; n++)
1428 {
1429 src_base = src;
1430 ONE_MORE_BYTE (c1);
1431 ONE_MORE_BYTE (c2);
1432 if (c1 < 0 || c2 < 0)
1433 break;
1434 if (! b1[c1++]) b1_variants++;
1435 if (! b2[c2++]) b2_variants++;
1436 }
1437 if (b1_variants < b2_variants)
1438 detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1439 else
1440 detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1441 detect_info->rejected 1421 detect_info->rejected
1442 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE); 1422 |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1443 } 1423 }
1444 no_more_source: 1424 no_more_source:
1445 return 1; 1425 return 1;
5419 /* If we have not yet decided the text encoding type, detect it 5399 /* If we have not yet decided the text encoding type, detect it
5420 now. */ 5400 now. */
5421 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided)) 5401 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5422 { 5402 {
5423 int c, i; 5403 int c, i;
5424 5404 struct coding_detection_info detect_info;
5405
5406 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5425 for (i = 0, src = coding->source; src < src_end; i++, src++) 5407 for (i = 0, src = coding->source; src < src_end; i++, src++)
5426 { 5408 {
5427 c = *src; 5409 c = *src;
5428 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC 5410 if (c & 0x80)
5429 || c == ISO_CODE_SI
5430 || c == ISO_CODE_SO)))
5431 break; 5411 break;
5432 } 5412 if (c < 0x20
5433 coding->head_ascii = src - (coding->source + coding->consumed); 5413 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5434 5414 && ! inhibit_iso_escape_detection
5435 if (coding->head_ascii < coding->src_bytes) 5415 && ! detect_info.checked)
5436 {
5437 struct coding_detection_info detect_info;
5438 enum coding_category category;
5439 struct coding_system *this;
5440
5441 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5442 for (i = 0; i < coding_category_raw_text; i++)
5443 { 5416 {
5444 category = coding_priorities[i]; 5417 coding->head_ascii = src - (coding->source + coding->consumed);
5445 this = coding_categories + category; 5418 if (detect_coding_iso_2022 (coding, &detect_info))
5446 if (this->id < 0)
5447 { 5419 {
5448 /* No coding system of this category is defined. */ 5420 /* We have scanned the whole data. */
5449 detect_info.rejected |= (1 << category); 5421 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5450 } 5422 /* We didn't find an 8-bit code. */
5451 else if (category >= coding_category_raw_text) 5423 src = src_end;
5452 continue;
5453 else if (detect_info.checked & (1 << category))
5454 {
5455 if (detect_info.found & (1 << category))
5456 break;
5457 }
5458 else if ((*(this->detector)) (coding, &detect_info)
5459 && detect_info.found & (1 << category))
5460 {
5461 if (category == coding_category_utf_16_auto)
5462 {
5463 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5464 category = coding_category_utf_16_le;
5465 else
5466 category = coding_category_utf_16_be;
5467 }
5468 break; 5424 break;
5469 } 5425 }
5470 } 5426 }
5427 }
5428 coding->head_ascii = src - (coding->source + coding->consumed);
5429
5430 if (coding->head_ascii == coding->src_bytes
5431 || detect_info.found)
5432 {
5433 enum coding_category category;
5434 struct coding_system *this;
5435
5436 if (coding->head_ascii == coding->src_bytes)
5437 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
5438 for (i = 0; i < coding_category_raw_text; i++)
5439 {
5440 category = coding_priorities[i];
5441 this = coding_categories + category;
5442 if (detect_info.found & (1 << category))
5443 break;
5444 }
5445 else
5446 for (i = 0; i < coding_category_raw_text; i++)
5447 {
5448 category = coding_priorities[i];
5449 this = coding_categories + category;
5450 if (this->id < 0)
5451 {
5452 /* No coding system of this category is defined. */
5453 detect_info.rejected |= (1 << category);
5454 }
5455 else if (category >= coding_category_raw_text)
5456 continue;
5457 else if (detect_info.checked & (1 << category))
5458 {
5459 if (detect_info.found & (1 << category))
5460 break;
5461 }
5462 else if ((*(this->detector)) (coding, &detect_info)
5463 && detect_info.found & (1 << category))
5464 {
5465 if (category == coding_category_utf_16_auto)
5466 {
5467 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5468 category = coding_category_utf_16_le;
5469 else
5470 category = coding_category_utf_16_be;
5471 }
5472 break;
5473 }
5474 }
5475
5471 if (i < coding_category_raw_text) 5476 if (i < coding_category_raw_text)
5472 setup_coding_system (CODING_ID_NAME (this->id), coding); 5477 setup_coding_system (CODING_ID_NAME (this->id), coding);
5473 else if (detect_info.rejected == CATEGORY_MASK_ANY) 5478 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5474 setup_coding_system (Qraw_text, coding); 5479 setup_coding_system (Qraw_text, coding);
5475 else if (detect_info.rejected) 5480 else if (detect_info.rejected)
7118 7123
7119 /* Skip all ASCII bytes except for a few ISO2022 controls. */ 7124 /* Skip all ASCII bytes except for a few ISO2022 controls. */
7120 for (i = 0; src < src_end; i++, src++) 7125 for (i = 0; src < src_end; i++, src++)
7121 { 7126 {
7122 c = *src; 7127 c = *src;
7123 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC 7128 if (c & 0x80)
7124 || c == ISO_CODE_SI
7125 || c == ISO_CODE_SO)))
7126 break; 7129 break;
7130 if (c < 0x20
7131 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7132 && inhibit_iso_escape_detection)
7133 {
7134 coding.head_ascii = src - coding.source;
7135 if (detect_coding_iso_2022 (&coding, &detect_info))
7136 {
7137 /* We have scanned the whole data. */
7138 if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7139 /* We didn't find an 8-bit code. */
7140 src = src_end;
7141 break;
7142 }
7143 }
7127 } 7144 }
7128 coding.head_ascii = src - coding.source; 7145 coding.head_ascii = src - coding.source;
7129 7146
7130 if (src < src_end) 7147 if (src < src_end
7131 for (i = 0; i < coding_category_raw_text; i++) 7148 || detect_info.found)
7132 { 7149 {
7133 category = coding_priorities[i]; 7150 if (src == src_end)
7134 this = coding_categories + category; 7151 /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings. */
7135 7152 for (i = 0; i < coding_category_raw_text; i++)
7136 if (this->id < 0)
7137 { 7153 {
7138 /* No coding system of this category is defined. */ 7154 category = coding_priorities[i];
7139 detect_info.rejected |= (1 << category); 7155 if (detect_info.found & (1 << category))
7140 }
7141 else if (category >= coding_category_raw_text)
7142 continue;
7143 else if (detect_info.checked & (1 << category))
7144 {
7145 if (highest
7146 && (detect_info.found & (1 << category)))
7147 break; 7156 break;
7148 } 7157 }
7149 else 7158 else
7159 for (i = 0; i < coding_category_raw_text; i++)
7150 { 7160 {
7151 if ((*(this->detector)) (&coding, &detect_info) 7161 category = coding_priorities[i];
7152 && highest 7162 this = coding_categories + category;
7153 && (detect_info.found & (1 << category))) 7163
7164 if (this->id < 0)
7154 { 7165 {
7155 if (category == coding_category_utf_16_auto) 7166 /* No coding system of this category is defined. */
7167 detect_info.rejected |= (1 << category);
7168 }
7169 else if (category >= coding_category_raw_text)
7170 continue;
7171 else if (detect_info.checked & (1 << category))
7172 {
7173 if (highest
7174 && (detect_info.found & (1 << category)))
7175 break;
7176 }
7177 else
7178 {
7179 if ((*(this->detector)) (&coding, &detect_info)
7180 && highest
7181 && (detect_info.found & (1 << category)))
7156 { 7182 {
7157 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) 7183 if (category == coding_category_utf_16_auto)
7158 category = coding_category_utf_16_le; 7184 {
7159 else 7185 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7160 category = coding_category_utf_16_be; 7186 category = coding_category_utf_16_le;
7187 else
7188 category = coding_category_utf_16_be;
7189 }
7190 break;
7161 } 7191 }
7162 break;
7163 } 7192 }
7164 } 7193 }
7165 } 7194 }
7166 7195
7167 if (detect_info.rejected == CATEGORY_MASK_ANY) 7196 if (detect_info.rejected == CATEGORY_MASK_ANY)
7168 { 7197 {
7169 detect_info.found = CATEGORY_MASK_RAW_TEXT; 7198 detect_info.found = CATEGORY_MASK_RAW_TEXT;
7170 id = coding_categories[coding_category_raw_text].id; 7199 id = coding_categories[coding_category_raw_text].id;