Mercurial > emacs
comparison src/coding.c @ 88438:3a34b722dd71
(encode_coding_utf_8): Initialize produced_chars to 0.
(decode_coding_utf_16): Fix converting high and low bytes to
code-point.
(encode_coding_utf_16): Substitute coding->default_char for
non-Unicode characters.
(decode_coding): Don't call record_insert here.
(setup_coding_system): Initialize `surrogate' of
coding->spec.utf_16 to 0.
(EMIT_ONE_BYTE): Fix for multibyte case.
| author | Kenichi Handa <handa@m17n.org> |
|---|---|
| date | Fri, 08 Mar 2002 00:19:39 +0000 |
| parents | 6418a272b97e |
| children | 6b86cf30a0b9 |
comparison
equal
deleted
inserted
replaced
| 88437:ffec838ec027 | 88438:3a34b722dd71 |
|---|---|
| 44 /*** 0. General comments *** | 44 /*** 0. General comments *** |
| 45 | 45 |
| 46 | 46 |
| 47 CODING SYSTEM | 47 CODING SYSTEM |
| 48 | 48 |
| 49 Coding system is an encoding mechanism of one or more character | 49 Coding system is an object for an encoding mechanism that contains |
| 50 sets. Here's a list of coding system types supported by Emacs. | 50 information about how to convert byte sequence to character |
| 51 When we say "decode", it means converting a text encoded by some | 51 sequences and vice versa. When we say "decode", it means converting |
| 52 coding system into Emacs' internal format (emacs-utf-8), and when we | 52 a byte sequence of a specific coding system into a character |
| 53 say "encode", it means converting a text of emacs-utf-8 to some | 53 sequence that is represented by Emacs' internal coding system |
| 54 other coding system. | 54 `emacs-utf-8', and when we say "encode", it means converting a |
| 55 | 55 character sequence of emacs-utf-8 to a byte sequence of a specific |
| 56 Emacs represents a coding system by a Lisp symbol. Each symbol is a | 56 coding system. |
| 57 key to the hash table Vcharset_hash_table. This hash table | 57 |
| 58 associates the symbol to the corresponding detailed specifications. | 58 In Emacs Lisp, a coding system is represented by a Lisp symbol. At |
| 59 | 59 the C level, a coding system is represented by a vector of attributes |
| 60 Before using a coding system for decoding and encoding, we setup a | 60 stored in the hash table Vcharset_hash_table. The conversion from a |
| 61 structure of type `struct coding_system'. This structure keeps | 61 coding system symbol to attributes vector is done by looking up |
| 62 various information about a specific code conversion (e.g. the | 62 Vcharset_hash_table by the symbol. |
| 63 location of source and destination data). | 63 |
| 64 | 64 Coding systems are classified into the following types depending on |
| 65 Coding systems are classified into the following types by how to | 65 the mechanism of encoding. Here's a brief description about type. |
| 66 represent a character in a byte sequence. Here's a brief description | |
| 67 about type. | |
| 68 | |
| 69 o Emacs' internal format (emacs-utf-8) | |
| 70 | |
| 71 The extended UTF-8 which allows eight-bit raw bytes mixed with | |
| 72 character codes. Emacs holds characters in buffers and strings by | |
| 73 this format. | |
| 74 | 66 |
| 75 o UTF-8 | 67 o UTF-8 |
| 76 | 68 |
| 77 o UTF-16 | 69 o UTF-16 |
| 78 | 70 |
| 134 `carriage-return'. | 126 `carriage-return'. |
| 135 | 127 |
| 136 Since text characters encoding and end-of-line encoding are | 128 Since text characters encoding and end-of-line encoding are |
| 137 independent, any coding system described above can take any format | 129 independent, any coding system described above can take any format |
| 138 of end-of-line (except for no-conversion). | 130 of end-of-line (except for no-conversion). |
| 131 | |
| 132 STRUCT CODING_SYSTEM | |
| 133 | |
| 134 Before using a coding system for code conversion (i.e. decoding and | |
| 135 encoding), we setup a structure of type `struct coding_system'. | |
| 136 This structure keeps various information about a specific code | |
| 137 conversion (e.g. the location of source and destination data). | |
| 139 | 138 |
| 140 */ | 139 */ |
| 141 | 140 |
| 142 /* COMMON MACROS */ | 141 /* COMMON MACROS */ |
| 143 | 142 |
| 816 } while (0) | 815 } while (0) |
| 817 | 816 |
| 818 | 817 |
| 819 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ | 818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ |
| 820 | 819 |
| 821 #define EMIT_TWO_BYTES(c1, c2) \ | 820 #define EMIT_TWO_BYTES(c1, c2) \ |
| 822 do { \ | 821 do { \ |
| 823 produced_chars += 2; \ | 822 produced_chars += 2; \ |
| 824 if (multibytep) \ | 823 if (multibytep) \ |
| 825 { \ | 824 { \ |
| 826 CHAR_STRING_ADVANCE ((int) (c1), dst); \ | 825 int ch; \ |
| 827 CHAR_STRING_ADVANCE ((int) (c2), dst); \ | 826 \ |
| 828 } \ | 827 ch = (c1); \ |
| 829 else \ | 828 if (ch >= 0x80) \ |
| 830 { \ | 829 ch = BYTE8_TO_CHAR (ch); \ |
| 831 *dst++ = (c1); \ | 830 CHAR_STRING_ADVANCE (ch, dst); \ |
| 832 *dst++ = (c2); \ | 831 ch = (c2); \ |
| 833 } \ | 832 if (ch >= 0x80) \ |
| 833 ch = BYTE8_TO_CHAR (ch); \ | |
| 834 CHAR_STRING_ADVANCE (ch, dst); \ | |
| 835 } \ | |
| 836 else \ | |
| 837 { \ | |
| 838 *dst++ = (c1); \ | |
| 839 *dst++ = (c2); \ | |
| 840 } \ | |
| 834 } while (0) | 841 } while (0) |
| 835 | 842 |
| 836 | 843 |
| 837 #define EMIT_THREE_BYTES(c1, c2, c3) \ | 844 #define EMIT_THREE_BYTES(c1, c2, c3) \ |
| 838 do { \ | 845 do { \ |
| 887 { | 894 { |
| 888 if (coding->src_pos < 0) | 895 if (coding->src_pos < 0) |
| 889 coding->source = GAP_END_ADDR + coding->src_pos_byte; | 896 coding->source = GAP_END_ADDR + coding->src_pos_byte; |
| 890 else | 897 else |
| 891 { | 898 { |
| 892 if (coding->src_pos < GPT | 899 struct buffer *buf = XBUFFER (coding->src_object); |
| 893 && coding->src_pos + coding->src_chars >= GPT) | 900 EMACS_INT beg_byte = BUF_BEG_BYTE (buf); |
| 894 move_gap_both (coding->src_pos, coding->src_pos_byte); | 901 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf); |
| 895 coding->source = BYTE_POS_ADDR (coding->src_pos_byte); | 902 unsigned char *beg_addr = BUF_BEG_ADDR (buf); |
| 903 | |
| 904 coding->source = beg_addr + coding->src_pos_byte - 1; | |
| 905 if (coding->src_pos_byte >= gpt_byte) | |
| 906 coding->source += BUF_GAP_SIZE (buf); | |
| 896 } | 907 } |
| 897 } | 908 } |
| 898 else if (STRINGP (coding->src_object)) | 909 else if (STRINGP (coding->src_object)) |
| 899 { | 910 { |
| 900 coding->source = (XSTRING (coding->src_object)->data | 911 coding->source = (XSTRING (coding->src_object)->data |
| 1180 int multibytep = coding->dst_multibyte; | 1191 int multibytep = coding->dst_multibyte; |
| 1181 int *charbuf = coding->charbuf; | 1192 int *charbuf = coding->charbuf; |
| 1182 int *charbuf_end = charbuf + coding->charbuf_used; | 1193 int *charbuf_end = charbuf + coding->charbuf_used; |
| 1183 unsigned char *dst = coding->destination + coding->produced; | 1194 unsigned char *dst = coding->destination + coding->produced; |
| 1184 unsigned char *dst_end = coding->destination + coding->dst_bytes; | 1195 unsigned char *dst_end = coding->destination + coding->dst_bytes; |
| 1185 int produced_chars; | 1196 int produced_chars = 0; |
| 1186 int c; | 1197 int c; |
| 1187 | 1198 |
| 1188 if (multibytep) | 1199 if (multibytep) |
| 1189 { | 1200 { |
| 1190 int safe_room = MAX_MULTIBYTE_LENGTH * 2; | 1201 int safe_room = MAX_MULTIBYTE_LENGTH * 2; |
| 1288 int c, c1, c2; | 1299 int c, c1, c2; |
| 1289 | 1300 |
| 1290 src_base = src; | 1301 src_base = src; |
| 1291 ONE_MORE_BYTE (c1); | 1302 ONE_MORE_BYTE (c1); |
| 1292 ONE_MORE_BYTE (c2); | 1303 ONE_MORE_BYTE (c2); |
| 1293 c = (c1 << 16) | c2; | 1304 c = (c1 << 8) | c2; |
| 1294 if (bom == utf_16_with_bom) | 1305 if (bom == utf_16_with_bom) |
| 1295 { | 1306 { |
| 1296 if (endian == utf_16_big_endian | 1307 if (endian == utf_16_big_endian |
| 1297 ? c != 0xFFFE : c != 0xFEFF) | 1308 ? c != 0xFFFE : c != 0xFEFF) |
| 1298 { | 1309 { |
| 1331 break; | 1342 break; |
| 1332 | 1343 |
| 1333 ONE_MORE_BYTE (c1); | 1344 ONE_MORE_BYTE (c1); |
| 1334 ONE_MORE_BYTE (c2); | 1345 ONE_MORE_BYTE (c2); |
| 1335 c = (endian == utf_16_big_endian | 1346 c = (endian == utf_16_big_endian |
| 1336 ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); | 1347 ? ((c1 << 8) | c2) : ((c2 << 8) | c1)); |
| 1337 if (surrogate) | 1348 if (surrogate) |
| 1338 { | 1349 { |
| 1339 if (! UTF_16_LOW_SURROGATE_P (c)) | 1350 if (! UTF_16_LOW_SURROGATE_P (c)) |
| 1340 { | 1351 { |
| 1341 if (endian == utf_16_big_endian) | 1352 if (endian == utf_16_big_endian) |
| 1402 | 1413 |
| 1403 while (charbuf < charbuf_end) | 1414 while (charbuf < charbuf_end) |
| 1404 { | 1415 { |
| 1405 ASSURE_DESTINATION (safe_room); | 1416 ASSURE_DESTINATION (safe_room); |
| 1406 c = *charbuf++; | 1417 c = *charbuf++; |
| 1407 if (c >= 0x110000) | 1418 if (c >= MAX_UNICODE_CHAR) |
| 1408 c = 0xFFFF; | 1419 c = coding->default_char; |
| 1409 | 1420 |
| 1410 if (c < 0x10000) | 1421 if (c < 0x10000) |
| 1411 { | 1422 { |
| 1412 if (big_endian) | 1423 if (big_endian) |
| 1413 EMIT_TWO_BYTES (c >> 8, c & 0xFF); | 1424 EMIT_TWO_BYTES (c >> 8, c & 0xFF); |
| 4502 : EQ (val, Qt) ? utf_16_with_bom | 4513 : EQ (val, Qt) ? utf_16_with_bom |
| 4503 : utf_16_without_bom); | 4514 : utf_16_without_bom); |
| 4504 val = AREF (attrs, coding_attr_utf_16_endian); | 4515 val = AREF (attrs, coding_attr_utf_16_endian); |
| 4505 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian | 4516 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian |
| 4506 : utf_16_little_endian); | 4517 : utf_16_little_endian); |
| 4518 CODING_UTF_16_SURROGATE (coding) = 0; | |
| 4507 coding->detector = detect_coding_utf_16; | 4519 coding->detector = detect_coding_utf_16; |
| 4508 coding->decoder = decode_coding_utf_16; | 4520 coding->decoder = decode_coding_utf_16; |
| 4509 coding->encoder = encode_coding_utf_16; | 4521 coding->encoder = encode_coding_utf_16; |
| 4510 coding->common_flags | 4522 coding->common_flags |
| 4511 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); | 4523 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); |
| 5454 coding->carryover_bytes = nbytes; | 5466 coding->carryover_bytes = nbytes; |
| 5455 while (nbytes-- > 0) | 5467 while (nbytes-- > 0) |
| 5456 *p++ = *src++; | 5468 *p++ = *src++; |
| 5457 } | 5469 } |
| 5458 coding->consumed = coding->src_bytes; | 5470 coding->consumed = coding->src_bytes; |
| 5459 } | |
| 5460 | |
| 5461 if (BUFFERP (coding->dst_object)) | |
| 5462 { | |
| 5463 record_insert (coding->dst_pos, coding->produced_char); | |
| 5464 } | 5471 } |
| 5465 | 5472 |
| 5466 return coding->result; | 5473 return coding->result; |
| 5467 } | 5474 } |
| 5468 | 5475 |
