Mercurial > emacs
comparison src/coding.c @ 89420:c3e67ce6ee0f
(Qsignature, Qendian): Delete these variables.
(syms_of_coding): Don't initialize them.
(CATEGORY_MASK_UTF_16_AUTO): New macro.
(detect_coding_utf_16): Add CATEGORY_MASK_UTF_16_AUTO in
detect_info->found.
(decode_coding_utf_16): Don't detect BOM here.
(encode_coding_utf_16): Produce BOM if CODING_UTF_16_BOM (coding)
is NOT utf_16_without_bom.
(setup_coding_system): For a coding system of type utf-16, check
whether the attribute :endian is Qbig (instead of checking whether
it is nil), and set CODING_REQUIRE_DETECTION_MASK if BOM detection
is required.
(detect_coding): If the coding type is utf-16 and BOM detection is
required, detect the BOM.
(Fdefine_coding_system_internal): For a coding system of type
utf-16, check whether the attribute :endian is Qbig (instead of
checking whether it is nil).
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Tue, 06 May 2003 12:28:11 +0000 |
| parents | a9c2b3712863 |
| children | 3c978149859b |
comparison
equal
deleted
inserted
replaced
| 89419:18e57407a82b | 89420:c3e67ce6ee0f |
|---|---|
| 306 Lisp_Object Qbuffer_file_coding_system; | 306 Lisp_Object Qbuffer_file_coding_system; |
| 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; | 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; |
| 308 Lisp_Object Qdefault_char; | 308 Lisp_Object Qdefault_char; |
| 309 Lisp_Object Qno_conversion, Qundecided; | 309 Lisp_Object Qno_conversion, Qundecided; |
| 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; | 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; |
| 311 Lisp_Object Qsignature, Qendian, Qbig, Qlittle; | 311 Lisp_Object Qbig, Qlittle; |
| 312 Lisp_Object Qcoding_system_history; | 312 Lisp_Object Qcoding_system_history; |
| 313 Lisp_Object Qvalid_codes; | 313 Lisp_Object Qvalid_codes; |
| 314 | 314 |
| 315 extern Lisp_Object Qinsert_file_contents, Qwrite_region; | 315 extern Lisp_Object Qinsert_file_contents, Qwrite_region; |
| 316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; | 316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; |
| 624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1) | 624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1) |
| 625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) | 625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) |
| 626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) | 626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) |
| 627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) | 627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) |
| 628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) | 628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) |
| 629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto) | |
| 629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) | 630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) |
| 630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) | 631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) |
| 631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig) | 632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig) |
| 632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig) | 633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig) |
| 633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset) | 634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset) |
| 1355 ONE_MORE_BYTE (c1); | 1356 ONE_MORE_BYTE (c1); |
| 1356 ONE_MORE_BYTE (c2); | 1357 ONE_MORE_BYTE (c2); |
| 1357 | 1358 |
| 1358 if ((c1 == 0xFF) && (c2 == 0xFE)) | 1359 if ((c1 == 0xFF) && (c2 == 0xFE)) |
| 1359 { | 1360 { |
| 1360 detect_info->found |= CATEGORY_MASK_UTF_16_LE; | 1361 detect_info->found |= (CATEGORY_MASK_UTF_16_LE |
| 1362 | CATEGORY_MASK_UTF_16_AUTO); | |
| 1361 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE; | 1363 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE; |
| 1362 } | 1364 } |
| 1363 else if ((c1 == 0xFE) && (c2 == 0xFF)) | 1365 else if ((c1 == 0xFE) && (c2 == 0xFF)) |
| 1364 { | 1366 { |
| 1365 detect_info->found |= CATEGORY_MASK_UTF_16_BE; | 1367 detect_info->found |= (CATEGORY_MASK_UTF_16_BE |
| 1368 | CATEGORY_MASK_UTF_16_AUTO); | |
| 1366 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE; | 1369 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE; |
| 1367 } | 1370 } |
| 1368 no_more_source: | 1371 no_more_source: |
| 1369 return 1; | 1372 return 1; |
| 1370 } | 1373 } |
| 1385 int surrogate = CODING_UTF_16_SURROGATE (coding); | 1388 int surrogate = CODING_UTF_16_SURROGATE (coding); |
| 1386 Lisp_Object attr, eol_type, charset_list; | 1389 Lisp_Object attr, eol_type, charset_list; |
| 1387 | 1390 |
| 1388 CODING_GET_INFO (coding, attr, eol_type, charset_list); | 1391 CODING_GET_INFO (coding, attr, eol_type, charset_list); |
| 1389 | 1392 |
| 1390 if (bom != utf_16_without_bom) | 1393 if (bom == utf_16_with_bom) |
| 1391 { | 1394 { |
| 1392 int c, c1, c2; | 1395 int c, c1, c2; |
| 1393 | 1396 |
| 1394 src_base = src; | 1397 src_base = src; |
| 1395 ONE_MORE_BYTE (c1); | 1398 ONE_MORE_BYTE (c1); |
| 1396 ONE_MORE_BYTE (c2); | 1399 ONE_MORE_BYTE (c2); |
| 1397 c = (c1 << 8) | c2; | 1400 c = (c1 << 8) | c2; |
| 1398 if (bom == utf_16_with_bom) | 1401 |
| 1399 { | 1402 if (endian == utf_16_big_endian |
| 1400 if (endian == utf_16_big_endian | 1403 ? c != 0xFEFF : c != 0xFFFE) |
| 1401 ? c != 0xFEFF : c != 0xFFFE) | 1404 { |
| 1402 { | 1405 /* The first two bytes are not BOM. Treat them as bytes |
| 1403 /* We are sure that there's enouph room at CHARBUF. */ | 1406 for a normal character. */ |
| 1404 *charbuf++ = c1; | 1407 src = src_base; |
| 1405 *charbuf++ = c2; | 1408 coding->errors++; |
| 1406 coding->errors++; | 1409 } |
| 1407 } | 1410 CODING_UTF_16_BOM (coding) = utf_16_without_bom; |
| 1408 } | 1411 } |
| 1409 else | 1412 else if (bom == utf_16_detect_bom) |
| 1410 { | 1413 { |
| 1411 if (c == 0xFEFF) | 1414 /* We have already tried to detect BOM and failed in |
| 1412 CODING_UTF_16_ENDIAN (coding) | 1415 detect_coding. */ |
| 1413 = endian = utf_16_big_endian; | 1416 CODING_UTF_16_BOM (coding) = utf_16_without_bom; |
| 1414 else if (c == 0xFFFE) | |
| 1415 CODING_UTF_16_ENDIAN (coding) | |
| 1416 = endian = utf_16_little_endian; | |
| 1417 else | |
| 1418 { | |
| 1419 CODING_UTF_16_ENDIAN (coding) | |
| 1420 = endian = utf_16_big_endian; | |
| 1421 src = src_base; | |
| 1422 } | |
| 1423 } | |
| 1424 CODING_UTF_16_BOM (coding) = utf_16_with_bom; | |
| 1425 } | 1417 } |
| 1426 | 1418 |
| 1427 while (1) | 1419 while (1) |
| 1428 { | 1420 { |
| 1429 int c, c1, c2; | 1421 int c, c1, c2; |
| 1492 Lisp_Object attrs, eol_type, charset_list; | 1484 Lisp_Object attrs, eol_type, charset_list; |
| 1493 int c; | 1485 int c; |
| 1494 | 1486 |
| 1495 CODING_GET_INFO (coding, attrs, eol_type, charset_list); | 1487 CODING_GET_INFO (coding, attrs, eol_type, charset_list); |
| 1496 | 1488 |
| 1497 if (bom == utf_16_with_bom) | 1489 if (bom != utf_16_without_bom) |
| 1498 { | 1490 { |
| 1499 ASSURE_DESTINATION (safe_room); | 1491 ASSURE_DESTINATION (safe_room); |
| 1500 if (big_endian) | 1492 if (big_endian) |
| 1501 EMIT_TWO_BYTES (0xFE, 0xFF); | 1493 EMIT_TWO_BYTES (0xFE, 0xFF); |
| 1502 else | 1494 else |
| 4857 val = AREF (attrs, coding_attr_utf_16_bom); | 4849 val = AREF (attrs, coding_attr_utf_16_bom); |
| 4858 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom | 4850 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom |
| 4859 : EQ (val, Qt) ? utf_16_with_bom | 4851 : EQ (val, Qt) ? utf_16_with_bom |
| 4860 : utf_16_without_bom); | 4852 : utf_16_without_bom); |
| 4861 val = AREF (attrs, coding_attr_utf_16_endian); | 4853 val = AREF (attrs, coding_attr_utf_16_endian); |
| 4862 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian | 4854 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian |
| 4863 : utf_16_little_endian); | 4855 : utf_16_little_endian); |
| 4864 CODING_UTF_16_SURROGATE (coding) = 0; | 4856 CODING_UTF_16_SURROGATE (coding) = 0; |
| 4865 coding->detector = detect_coding_utf_16; | 4857 coding->detector = detect_coding_utf_16; |
| 4866 coding->decoder = decode_coding_utf_16; | 4858 coding->decoder = decode_coding_utf_16; |
| 4867 coding->encoder = encode_coding_utf_16; | 4859 coding->encoder = encode_coding_utf_16; |
| 4868 coding->common_flags | 4860 coding->common_flags |
| 4869 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | 4861 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); |
| 4862 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom) | |
| 4863 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK; | |
| 4870 } | 4864 } |
| 4871 else if (EQ (coding_type, Qccl)) | 4865 else if (EQ (coding_type, Qccl)) |
| 4872 { | 4866 { |
| 4873 coding->detector = detect_coding_ccl; | 4867 coding->detector = detect_coding_ccl; |
| 4874 coding->decoder = decode_coding_ccl; | 4868 coding->decoder = decode_coding_ccl; |
| 5283 setup_coding_system (CODING_ID_NAME (this->id), coding); | 5277 setup_coding_system (CODING_ID_NAME (this->id), coding); |
| 5284 break; | 5278 break; |
| 5285 } | 5279 } |
| 5286 } | 5280 } |
| 5287 } | 5281 } |
| 5282 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16)) | |
| 5283 { | |
| 5284 Lisp_Object coding_systems; | |
| 5285 struct coding_detection_info detect_info; | |
| 5286 | |
| 5287 coding_systems | |
| 5288 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom); | |
| 5289 detect_info.found = detect_info.rejected = 0; | |
| 5290 if (CONSP (coding_systems) | |
| 5291 && detect_coding_utf_16 (coding, &detect_info) | |
| 5292 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE | |
| 5293 | CATEGORY_MASK_UTF_16_BE))) | |
| 5294 { | |
| 5295 if (detect_info.found & CATEGORY_MASK_UTF_16_LE) | |
| 5296 setup_coding_system (XCAR (coding_systems), coding); | |
| 5297 else | |
| 5298 setup_coding_system (XCDR (coding_systems), coding); | |
| 5299 } | |
| 5300 } | |
| 5288 | 5301 |
| 5289 attrs = CODING_ID_ATTRS (coding->id); | 5302 attrs = CODING_ID_ATTRS (coding->id); |
| 5290 coding_type = CODING_ATTR_TYPE (attrs); | 5303 coding_type = CODING_ATTR_TYPE (attrs); |
| 5291 | 5304 |
| 5292 /* If we have not yet decided the EOL type, detect it now. But, the | 5305 /* If we have not yet decided the EOL type, detect it now. But, the |
| 7955 CHECK_CODING_SYSTEM (XCDR (bom)); | 7968 CHECK_CODING_SYSTEM (XCDR (bom)); |
| 7956 } | 7969 } |
| 7957 ASET (attrs, coding_attr_utf_16_bom, bom); | 7970 ASET (attrs, coding_attr_utf_16_bom, bom); |
| 7958 | 7971 |
| 7959 endian = args[coding_arg_utf16_endian]; | 7972 endian = args[coding_arg_utf16_endian]; |
| 7973 CHECK_SYMBOL (endian); | |
| 7974 if (NILP (endian)) | |
| 7975 endian = Qbig; | |
| 7976 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle)) | |
| 7977 error ("Invalid endian: %s", XSYMBOL (endian)->name->data); | |
| 7960 ASET (attrs, coding_attr_utf_16_endian, endian); | 7978 ASET (attrs, coding_attr_utf_16_endian, endian); |
| 7961 | 7979 |
| 7962 category = (CONSP (bom) | 7980 category = (CONSP (bom) |
| 7963 ? coding_category_utf_16_auto | 7981 ? coding_category_utf_16_auto |
| 7964 : NILP (bom) | 7982 : NILP (bom) |
| 7965 ? (NILP (endian) | 7983 ? (EQ (endian, Qbig) |
| 7966 ? coding_category_utf_16_be_nosig | 7984 ? coding_category_utf_16_be_nosig |
| 7967 : coding_category_utf_16_le_nosig) | 7985 : coding_category_utf_16_le_nosig) |
| 7968 : (NILP (endian) | 7986 : (EQ (endian, Qbig) |
| 7969 ? coding_category_utf_16_be | 7987 ? coding_category_utf_16_be |
| 7970 : coding_category_utf_16_le)); | 7988 : coding_category_utf_16_le)); |
| 7971 } | 7989 } |
| 7972 else if (EQ (coding_type, Qiso_2022)) | 7990 else if (EQ (coding_type, Qiso_2022)) |
| 7973 { | 7991 { |
| 8405 DEFSYM (Qiso_2022, "iso-2022"); | 8423 DEFSYM (Qiso_2022, "iso-2022"); |
| 8406 | 8424 |
| 8407 DEFSYM (Qutf_8, "utf-8"); | 8425 DEFSYM (Qutf_8, "utf-8"); |
| 8408 | 8426 |
| 8409 DEFSYM (Qutf_16, "utf-16"); | 8427 DEFSYM (Qutf_16, "utf-16"); |
| 8410 DEFSYM (Qsignature, "signature"); | |
| 8411 DEFSYM (Qendian, "endian"); | |
| 8412 DEFSYM (Qbig, "big"); | 8428 DEFSYM (Qbig, "big"); |
| 8413 DEFSYM (Qlittle, "little"); | 8429 DEFSYM (Qlittle, "little"); |
| 8414 | 8430 |
| 8415 DEFSYM (Qshift_jis, "shift-jis"); | 8431 DEFSYM (Qshift_jis, "shift-jis"); |
| 8416 DEFSYM (Qbig5, "big5"); | 8432 DEFSYM (Qbig5, "big5"); |
