comparison src/coding.c @ 88438:3a34b722dd71

(encode_coding_utf_8): Initialize produced_chars to 0. (decode_coding_utf_16): Fix converting high and low bytes to code-point. (encode_coding_utf_16): Substitute coding->default_char for non-Unicode characters. (decode_coding): Don't call record_insert here. (setup_coding_system): Initialize `surrogate' of coding->spec.utf_16 to 0. (EMIT_ONE_BYTE): Fix for multibyte case.
author Kenichi Handa <handa@m17n.org>
date Fri, 08 Mar 2002 00:19:39 +0000
parents 6418a272b97e
children 6b86cf30a0b9
comparison
equal deleted inserted replaced
88437:ffec838ec027 88438:3a34b722dd71
44 /*** 0. General comments *** 44 /*** 0. General comments ***
45 45
46 46
47 CODING SYSTEM 47 CODING SYSTEM
48 48
49 Coding system is an encoding mechanism of one or more character 49 Coding system is an object for an encoding mechanism that contains
50 sets. Here's a list of coding system types supported by Emacs. 50 information about how to convert byte sequence to character
51 When we say "decode", it means converting a text encoded by some 51 sequences and vice versa. When we say "decode", it means converting
52 coding system into Emacs' internal format (emacs-utf-8), and when we 52 a byte sequence of a specific coding system into a character
53 say "encode", it means converting a text of emacs-utf-8 to some 53 sequence that is represented by Emacs' internal coding system
54 other coding system. 54 `emacs-utf-8', and when we say "encode", it means converting a
55 55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 Emacs represents a coding system by a Lisp symbol. Each symbol is a 56 coding system.
57 key to the hash table Vcharset_hash_table. This hash table 57
58 associates the symbol to the corresponding detailed specifications. 58 In Emacs Lisp, a coding system is represented by a Lisp symbol. At
59 59 the C level, a coding system is represented by a vector of attributes
60 Before using a coding system for decoding and encoding, we setup a 60 stored in the hash table Vcharset_hash_table. The conversion from a
61 structure of type `struct coding_system'. This structure keeps 61 coding system symbol to attributes vector is done by looking up
62 various information about a specific code conversion (e.g. the 62 Vcharset_hash_table by the symbol.
63 location of source and destination data). 63
64 64 Coding systems are classified into the following types depending on
65 Coding systems are classified into the following types by how to 65 the mechanism of encoding. Here's a brief description about type.
66 represent a character in a byte sequence. Here's a brief description
67 about type.
68
69 o Emacs' internal format (emacs-utf-8)
70
71 The extended UTF-8 which allows eight-bit raw bytes mixed with
72 character codes. Emacs holds characters in buffers and strings by
73 this format.
74 66
75 o UTF-8 67 o UTF-8
76 68
77 o UTF-16 69 o UTF-16
78 70
134 `carriage-return'. 126 `carriage-return'.
135 127
136 Since text characters encoding and end-of-line encoding are 128 Since text characters encoding and end-of-line encoding are
137 independent, any coding system described above can take any format 129 independent, any coding system described above can take any format
138 of end-of-line (except for no-conversion). 130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
139 138
140 */ 139 */
141 140
142 /* COMMON MACROS */ 141 /* COMMON MACROS */
143 142
816 } while (0) 815 } while (0)
817 816
818 817
819 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */ 818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
820 819
821 #define EMIT_TWO_BYTES(c1, c2) \ 820 #define EMIT_TWO_BYTES(c1, c2) \
822 do { \ 821 do { \
823 produced_chars += 2; \ 822 produced_chars += 2; \
824 if (multibytep) \ 823 if (multibytep) \
825 { \ 824 { \
826 CHAR_STRING_ADVANCE ((int) (c1), dst); \ 825 int ch; \
827 CHAR_STRING_ADVANCE ((int) (c2), dst); \ 826 \
828 } \ 827 ch = (c1); \
829 else \ 828 if (ch >= 0x80) \
830 { \ 829 ch = BYTE8_TO_CHAR (ch); \
831 *dst++ = (c1); \ 830 CHAR_STRING_ADVANCE (ch, dst); \
832 *dst++ = (c2); \ 831 ch = (c2); \
833 } \ 832 if (ch >= 0x80) \
833 ch = BYTE8_TO_CHAR (ch); \
834 CHAR_STRING_ADVANCE (ch, dst); \
835 } \
836 else \
837 { \
838 *dst++ = (c1); \
839 *dst++ = (c2); \
840 } \
834 } while (0) 841 } while (0)
835 842
836 843
837 #define EMIT_THREE_BYTES(c1, c2, c3) \ 844 #define EMIT_THREE_BYTES(c1, c2, c3) \
838 do { \ 845 do { \
887 { 894 {
888 if (coding->src_pos < 0) 895 if (coding->src_pos < 0)
889 coding->source = GAP_END_ADDR + coding->src_pos_byte; 896 coding->source = GAP_END_ADDR + coding->src_pos_byte;
890 else 897 else
891 { 898 {
892 if (coding->src_pos < GPT 899 struct buffer *buf = XBUFFER (coding->src_object);
893 && coding->src_pos + coding->src_chars >= GPT) 900 EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
894 move_gap_both (coding->src_pos, coding->src_pos_byte); 901 EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
895 coding->source = BYTE_POS_ADDR (coding->src_pos_byte); 902 unsigned char *beg_addr = BUF_BEG_ADDR (buf);
903
904 coding->source = beg_addr + coding->src_pos_byte - 1;
905 if (coding->src_pos_byte >= gpt_byte)
906 coding->source += BUF_GAP_SIZE (buf);
896 } 907 }
897 } 908 }
898 else if (STRINGP (coding->src_object)) 909 else if (STRINGP (coding->src_object))
899 { 910 {
900 coding->source = (XSTRING (coding->src_object)->data 911 coding->source = (XSTRING (coding->src_object)->data
1180 int multibytep = coding->dst_multibyte; 1191 int multibytep = coding->dst_multibyte;
1181 int *charbuf = coding->charbuf; 1192 int *charbuf = coding->charbuf;
1182 int *charbuf_end = charbuf + coding->charbuf_used; 1193 int *charbuf_end = charbuf + coding->charbuf_used;
1183 unsigned char *dst = coding->destination + coding->produced; 1194 unsigned char *dst = coding->destination + coding->produced;
1184 unsigned char *dst_end = coding->destination + coding->dst_bytes; 1195 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1185 int produced_chars; 1196 int produced_chars = 0;
1186 int c; 1197 int c;
1187 1198
1188 if (multibytep) 1199 if (multibytep)
1189 { 1200 {
1190 int safe_room = MAX_MULTIBYTE_LENGTH * 2; 1201 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1288 int c, c1, c2; 1299 int c, c1, c2;
1289 1300
1290 src_base = src; 1301 src_base = src;
1291 ONE_MORE_BYTE (c1); 1302 ONE_MORE_BYTE (c1);
1292 ONE_MORE_BYTE (c2); 1303 ONE_MORE_BYTE (c2);
1293 c = (c1 << 16) | c2; 1304 c = (c1 << 8) | c2;
1294 if (bom == utf_16_with_bom) 1305 if (bom == utf_16_with_bom)
1295 { 1306 {
1296 if (endian == utf_16_big_endian 1307 if (endian == utf_16_big_endian
1297 ? c != 0xFFFE : c != 0xFEFF) 1308 ? c != 0xFFFE : c != 0xFEFF)
1298 { 1309 {
1331 break; 1342 break;
1332 1343
1333 ONE_MORE_BYTE (c1); 1344 ONE_MORE_BYTE (c1);
1334 ONE_MORE_BYTE (c2); 1345 ONE_MORE_BYTE (c2);
1335 c = (endian == utf_16_big_endian 1346 c = (endian == utf_16_big_endian
1336 ? ((c1 << 16) | c2) : ((c2 << 16) | c1)); 1347 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1337 if (surrogate) 1348 if (surrogate)
1338 { 1349 {
1339 if (! UTF_16_LOW_SURROGATE_P (c)) 1350 if (! UTF_16_LOW_SURROGATE_P (c))
1340 { 1351 {
1341 if (endian == utf_16_big_endian) 1352 if (endian == utf_16_big_endian)
1402 1413
1403 while (charbuf < charbuf_end) 1414 while (charbuf < charbuf_end)
1404 { 1415 {
1405 ASSURE_DESTINATION (safe_room); 1416 ASSURE_DESTINATION (safe_room);
1406 c = *charbuf++; 1417 c = *charbuf++;
1407 if (c >= 0x110000) 1418 if (c >= MAX_UNICODE_CHAR)
1408 c = 0xFFFF; 1419 c = coding->default_char;
1409 1420
1410 if (c < 0x10000) 1421 if (c < 0x10000)
1411 { 1422 {
1412 if (big_endian) 1423 if (big_endian)
1413 EMIT_TWO_BYTES (c >> 8, c & 0xFF); 1424 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
4502 : EQ (val, Qt) ? utf_16_with_bom 4513 : EQ (val, Qt) ? utf_16_with_bom
4503 : utf_16_without_bom); 4514 : utf_16_without_bom);
4504 val = AREF (attrs, coding_attr_utf_16_endian); 4515 val = AREF (attrs, coding_attr_utf_16_endian);
4505 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian 4516 CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4506 : utf_16_little_endian); 4517 : utf_16_little_endian);
4518 CODING_UTF_16_SURROGATE (coding) = 0;
4507 coding->detector = detect_coding_utf_16; 4519 coding->detector = detect_coding_utf_16;
4508 coding->decoder = decode_coding_utf_16; 4520 coding->decoder = decode_coding_utf_16;
4509 coding->encoder = encode_coding_utf_16; 4521 coding->encoder = encode_coding_utf_16;
4510 coding->common_flags 4522 coding->common_flags
4511 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK); 4523 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5454 coding->carryover_bytes = nbytes; 5466 coding->carryover_bytes = nbytes;
5455 while (nbytes-- > 0) 5467 while (nbytes-- > 0)
5456 *p++ = *src++; 5468 *p++ = *src++;
5457 } 5469 }
5458 coding->consumed = coding->src_bytes; 5470 coding->consumed = coding->src_bytes;
5459 }
5460
5461 if (BUFFERP (coding->dst_object))
5462 {
5463 record_insert (coding->dst_pos, coding->produced_char);
5464 } 5471 }
5465 5472
5466 return coding->result; 5473 return coding->result;
5467 } 5474 }
5468 5475