comparison src/coding.c @ 89420:c3e67ce6ee0f

(Qsignature, Qendian): Delete these variables. (syms_of_coding): Don't initialize them. (CATEGORY_MASK_UTF_16_AUTO): New macro. (detect_coding_utf_16): Add CATEGORY_MASK_UTF_16_AUTO to detect_info->found. (decode_coding_utf_16): Don't detect BOM here. (encode_coding_utf_16): Produce BOM if CODING_UTF_16_BOM (coding) is NOT utf_16_without_bom. (setup_coding_system): For a coding system of type utf-16, check whether the attribute :endian is Qbig (instead of checking whether it is nil), and set CODING_REQUIRE_DETECTION_MASK if BOM detection is required. (detect_coding): If coding type is utf-16 and BOM detection is required, detect it. (Fdefine_coding_system_internal): For a coding system of type utf-16, check whether the attribute :endian is Qbig (instead of checking whether it is nil).
author Kenichi Handa <handa@m17n.org>
date Tue, 06 May 2003 12:28:11 +0000
parents a9c2b3712863
children 3c978149859b
comparison
equal deleted inserted replaced
89419:18e57407a82b 89420:c3e67ce6ee0f
306 Lisp_Object Qbuffer_file_coding_system; 306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion; 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char; 308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided; 309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5; 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qsignature, Qendian, Qbig, Qlittle; 311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history; 312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes; 313 Lisp_Object Qvalid_codes;
314 314
315 extern Lisp_Object Qinsert_file_contents, Qwrite_region; 315 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument; 316 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1) 624 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2) 625 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else) 626 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else) 627 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8) 628 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
629 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
629 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be) 630 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
630 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le) 631 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
631 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig) 632 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
632 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig) 633 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
633 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset) 634 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
1355 ONE_MORE_BYTE (c1); 1356 ONE_MORE_BYTE (c1);
1356 ONE_MORE_BYTE (c2); 1357 ONE_MORE_BYTE (c2);
1357 1358
1358 if ((c1 == 0xFF) && (c2 == 0xFE)) 1359 if ((c1 == 0xFF) && (c2 == 0xFE))
1359 { 1360 {
1360 detect_info->found |= CATEGORY_MASK_UTF_16_LE; 1361 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1362 | CATEGORY_MASK_UTF_16_AUTO);
1361 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE; 1363 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1362 } 1364 }
1363 else if ((c1 == 0xFE) && (c2 == 0xFF)) 1365 else if ((c1 == 0xFE) && (c2 == 0xFF))
1364 { 1366 {
1365 detect_info->found |= CATEGORY_MASK_UTF_16_BE; 1367 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1368 | CATEGORY_MASK_UTF_16_AUTO);
1366 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE; 1369 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1367 } 1370 }
1368 no_more_source: 1371 no_more_source:
1369 return 1; 1372 return 1;
1370 } 1373 }
1385 int surrogate = CODING_UTF_16_SURROGATE (coding); 1388 int surrogate = CODING_UTF_16_SURROGATE (coding);
1386 Lisp_Object attr, eol_type, charset_list; 1389 Lisp_Object attr, eol_type, charset_list;
1387 1390
1388 CODING_GET_INFO (coding, attr, eol_type, charset_list); 1391 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1389 1392
1390 if (bom != utf_16_without_bom) 1393 if (bom == utf_16_with_bom)
1391 { 1394 {
1392 int c, c1, c2; 1395 int c, c1, c2;
1393 1396
1394 src_base = src; 1397 src_base = src;
1395 ONE_MORE_BYTE (c1); 1398 ONE_MORE_BYTE (c1);
1396 ONE_MORE_BYTE (c2); 1399 ONE_MORE_BYTE (c2);
1397 c = (c1 << 8) | c2; 1400 c = (c1 << 8) | c2;
1398 if (bom == utf_16_with_bom) 1401
1399 { 1402 if (endian == utf_16_big_endian
1400 if (endian == utf_16_big_endian 1403 ? c != 0xFEFF : c != 0xFFFE)
1401 ? c != 0xFEFF : c != 0xFFFE) 1404 {
1402 { 1405 /* The first two bytes are not BOM. Treat them as bytes
1403 /* We are sure that there's enouph room at CHARBUF. */ 1406 for a normal character. */
1404 *charbuf++ = c1; 1407 src = src_base;
1405 *charbuf++ = c2; 1408 coding->errors++;
1406 coding->errors++; 1409 }
1407 } 1410 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1408 } 1411 }
1409 else 1412 else if (bom == utf_16_detect_bom)
1410 { 1413 {
1411 if (c == 0xFEFF) 1414 /* We have already tried to detect BOM and failed in
1412 CODING_UTF_16_ENDIAN (coding) 1415 detect_coding. */
1413 = endian = utf_16_big_endian; 1416 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1414 else if (c == 0xFFFE)
1415 CODING_UTF_16_ENDIAN (coding)
1416 = endian = utf_16_little_endian;
1417 else
1418 {
1419 CODING_UTF_16_ENDIAN (coding)
1420 = endian = utf_16_big_endian;
1421 src = src_base;
1422 }
1423 }
1424 CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1425 } 1417 }
1426 1418
1427 while (1) 1419 while (1)
1428 { 1420 {
1429 int c, c1, c2; 1421 int c, c1, c2;
1492 Lisp_Object attrs, eol_type, charset_list; 1484 Lisp_Object attrs, eol_type, charset_list;
1493 int c; 1485 int c;
1494 1486
1495 CODING_GET_INFO (coding, attrs, eol_type, charset_list); 1487 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1496 1488
1497 if (bom == utf_16_with_bom) 1489 if (bom != utf_16_without_bom)
1498 { 1490 {
1499 ASSURE_DESTINATION (safe_room); 1491 ASSURE_DESTINATION (safe_room);
1500 if (big_endian) 1492 if (big_endian)
1501 EMIT_TWO_BYTES (0xFE, 0xFF); 1493 EMIT_TWO_BYTES (0xFE, 0xFF);
1502 else 1494 else
4857 val = AREF (attrs, coding_attr_utf_16_bom); 4849 val = AREF (attrs, coding_attr_utf_16_bom);
4858 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom 4850 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4859 : EQ (val, Qt) ? utf_16_with_bom 4851 : EQ (val, Qt) ? utf_16_with_bom
4860 : utf_16_without_bom); 4852 : utf_16_without_bom);
4861 val = AREF (attrs, coding_attr_utf_16_endian); 4853 val = AREF (attrs, coding_attr_utf_16_endian);
4862 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian 4854 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4863 : utf_16_little_endian); 4855 : utf_16_little_endian);
4864 CODING_UTF_16_SURROGATE (coding) = 0; 4856 CODING_UTF_16_SURROGATE (coding) = 0;
4865 coding->detector = detect_coding_utf_16; 4857 coding->detector = detect_coding_utf_16;
4866 coding->decoder = decode_coding_utf_16; 4858 coding->decoder = decode_coding_utf_16;
4867 coding->encoder = encode_coding_utf_16; 4859 coding->encoder = encode_coding_utf_16;
4868 coding->common_flags 4860 coding->common_flags
4869 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); 4861 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4862 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4863 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4870 } 4864 }
4871 else if (EQ (coding_type, Qccl)) 4865 else if (EQ (coding_type, Qccl))
4872 { 4866 {
4873 coding->detector = detect_coding_ccl; 4867 coding->detector = detect_coding_ccl;
4874 coding->decoder = decode_coding_ccl; 4868 coding->decoder = decode_coding_ccl;
5283 setup_coding_system (CODING_ID_NAME (this->id), coding); 5277 setup_coding_system (CODING_ID_NAME (this->id), coding);
5284 break; 5278 break;
5285 } 5279 }
5286 } 5280 }
5287 } 5281 }
5282 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5283 {
5284 Lisp_Object coding_systems;
5285 struct coding_detection_info detect_info;
5286
5287 coding_systems
5288 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5289 detect_info.found = detect_info.rejected = 0;
5290 if (CONSP (coding_systems)
5291 && detect_coding_utf_16 (coding, &detect_info)
5292 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5293 | CATEGORY_MASK_UTF_16_BE)))
5294 {
5295 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5296 setup_coding_system (XCAR (coding_systems), coding);
5297 else
5298 setup_coding_system (XCDR (coding_systems), coding);
5299 }
5300 }
5288 5301
5289 attrs = CODING_ID_ATTRS (coding->id); 5302 attrs = CODING_ID_ATTRS (coding->id);
5290 coding_type = CODING_ATTR_TYPE (attrs); 5303 coding_type = CODING_ATTR_TYPE (attrs);
5291 5304
5292 /* If we have not yet decided the EOL type, detect it now. But, the 5305 /* If we have not yet decided the EOL type, detect it now. But, the
7955 CHECK_CODING_SYSTEM (XCDR (bom)); 7968 CHECK_CODING_SYSTEM (XCDR (bom));
7956 } 7969 }
7957 ASET (attrs, coding_attr_utf_16_bom, bom); 7970 ASET (attrs, coding_attr_utf_16_bom, bom);
7958 7971
7959 endian = args[coding_arg_utf16_endian]; 7972 endian = args[coding_arg_utf16_endian];
7973 CHECK_SYMBOL (endian);
7974 if (NILP (endian))
7975 endian = Qbig;
7976 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
7977 error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
7960 ASET (attrs, coding_attr_utf_16_endian, endian); 7978 ASET (attrs, coding_attr_utf_16_endian, endian);
7961 7979
7962 category = (CONSP (bom) 7980 category = (CONSP (bom)
7963 ? coding_category_utf_16_auto 7981 ? coding_category_utf_16_auto
7964 : NILP (bom) 7982 : NILP (bom)
7965 ? (NILP (endian) 7983 ? (EQ (endian, Qbig)
7966 ? coding_category_utf_16_be_nosig 7984 ? coding_category_utf_16_be_nosig
7967 : coding_category_utf_16_le_nosig) 7985 : coding_category_utf_16_le_nosig)
7968 : (NILP (endian) 7986 : (EQ (endian, Qbig)
7969 ? coding_category_utf_16_be 7987 ? coding_category_utf_16_be
7970 : coding_category_utf_16_le)); 7988 : coding_category_utf_16_le));
7971 } 7989 }
7972 else if (EQ (coding_type, Qiso_2022)) 7990 else if (EQ (coding_type, Qiso_2022))
7973 { 7991 {
8405 DEFSYM (Qiso_2022, "iso-2022"); 8423 DEFSYM (Qiso_2022, "iso-2022");
8406 8424
8407 DEFSYM (Qutf_8, "utf-8"); 8425 DEFSYM (Qutf_8, "utf-8");
8408 8426
8409 DEFSYM (Qutf_16, "utf-16"); 8427 DEFSYM (Qutf_16, "utf-16");
8410 DEFSYM (Qsignature, "signature");
8411 DEFSYM (Qendian, "endian");
8412 DEFSYM (Qbig, "big"); 8428 DEFSYM (Qbig, "big");
8413 DEFSYM (Qlittle, "little"); 8429 DEFSYM (Qlittle, "little");
8414 8430
8415 DEFSYM (Qshift_jis, "shift-jis"); 8431 DEFSYM (Qshift_jis, "shift-jis");
8416 DEFSYM (Qbig5, "big5"); 8432 DEFSYM (Qbig5, "big5");